diff --git a/.github/ISSUE_TEMPLATE/improve-docs.yml b/.github/ISSUE_TEMPLATE/a-improve-docs.yml similarity index 94% rename from .github/ISSUE_TEMPLATE/improve-docs.yml rename to .github/ISSUE_TEMPLATE/a-improve-docs.yml index 57dc64cc312..c9030bc227b 100644 --- a/.github/ISSUE_TEMPLATE/improve-docs.yml +++ b/.github/ISSUE_TEMPLATE/a-improve-docs.yml @@ -5,7 +5,7 @@ body: - type: markdown attributes: value: | - * You can ask questions or submit ideas for the dbt docs in [Discussions](https://github.com/dbt-labs/docs.getdbt.com/discussions) + * You can ask questions or submit ideas for the dbt docs in [Issues](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). @@ -39,4 +39,4 @@ body: label: Additional information description: Add any other context or screenshots about the feature request here. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9349000f66b..f3a3521bdec 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: true contact_links: - - name: Want to see new content? Open a discussion! - url: https://github.com/dbt-labs/docs.getdbt.com/discussions/new - about: You can open a discussion to propose new content for the dbt product documentation. - name: Have questions about dbt? Join the Community! url: https://www.getdbt.com/community/join-the-community about: You can join the dbt Labs Community to ask and answer questions. diff --git a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml index f138b9e4e06..037da98dc6f 100644 --- a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml +++ b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml @@ -1,4 +1,4 @@ -name: Contribute to the dbt Developer Blog +name: Propose a dbt Developer Blog idea description: > For proposing a new post on the dbt Developer Blog. labels: ["content","developer blog"] diff --git a/.github/ISSUE_TEMPLATE/improve-the-site.yml b/.github/ISSUE_TEMPLATE/improve-the-site.yml index e0556d7374f..01ebdea711a 100644 --- a/.github/ISSUE_TEMPLATE/improve-the-site.yml +++ b/.github/ISSUE_TEMPLATE/improve-the-site.yml @@ -1,11 +1,11 @@ -name: Improve the docs.getdbt.com site -description: Make a suggestion or report a problem about the technical implementation of docs.getdbt.com. -labels: ["engineering"] +name: Report a docs.getdbt.com site issue +description: Report a problem about the technical implementation of docs.getdbt.com. +labels: ["engineering","bug"] body: - type: markdown attributes: value: | - * You can ask questions or submit ideas for the dbt docs in [Discussions](https://github.com/dbt-labs/docs.getdbt.com/discussions) + * You can ask questions or submit ideas for the dbt docs in [Issues](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). 
@@ -39,4 +39,4 @@ body: label: Additional information description: Any additional information, configuration, or data that might be necessary to reproduce the issue. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml b/.github/ISSUE_TEMPLATE/new-dbt-feature.yml deleted file mode 100644 index fa46a189fc4..00000000000 --- a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Start docs project for a new feature -description: For dbt PMs to add docs for their new or updated dbt product features. -labels: ["content","upcoming release"] -body: - - type: markdown - attributes: - value: | - * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). - * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). - - - type: checkboxes - id: contributions - attributes: - label: Contributions - description: This applies to new, unreleased content. - options: - - label: I am a PM or subject matter expert at dbt who is responsible for this feature. - - - type: textarea - attributes: - label: Where does this content belong? - description: | - - Give as much detail as you can to help us understand where you expect the content to live. - validations: - required: true - - - type: textarea - attributes: - label: Link to source material - description: | - Use the [source material template](https://docs.google.com/document/d/1lLWGMXJFjkY4p7r8ZKhBX73dOLmIjgXZBYq39LqmAJs/edit) to provide source material for this feature. - validations: - required: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml new file mode 100644 index 00000000000..e19accf6ebb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml @@ -0,0 +1,62 @@ +name: Add adapter to Trusted list +description: For adapter maintainers who wish to have theirs added to the list of Trusted adapters. +title: "Trust dbt-myadapter" +labels: ["adapter maintainers"] +assignees: + - dataders +body: + - type: markdown + attributes: + value: | + We're excited that you'd like to support your adapter formally as "Trusted"! This template will ensure that you are aware of the process and the guidelines. Additionally, that you can vouch that your adapter currently meets the standards of a Trusted adapter. For more information, see [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters) + + - type: input + id: adapter-repo + attributes: + label: Link to adapter repo + description: Please link to the GitHub repo + validations: + required: true + + - type: input + id: contact + attributes: + label: Contact Details + description: How can we get in touch with you? + placeholder: your preferred email and/or dbt Slack handle + validations: + required: true + + - type: dropdown + id: author_type + attributes: + label: Which of these best describes you? 
+ options: + - I am a dbt Community member + - I work for the vendor on top of which the dbt adapter functions + validations: + required: true + + - type: checkboxes + id: read-program-guide + attributes: + label: Please agree to the each of the following + options: + - label: I am a maintainer of the adapter being submited for Trusted status + required: true + - label: I have read both the [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters) and [Building a Trusted Adapter](https://docs.getdbt.com/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter) pages. + required: true + - label: I believe that the adapter currently meets the expectations given above + required: true + - label: I will ensure this adapter stays in compliance with the guidelines + required: true + - label: I understand that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met + required: true + + - type: textarea + id: icon + attributes: + label: What icon should be used? + description: | + Please share an svg image that you'd like to be displayed in for your adapter. Normally, this is the logo for the data platform on top of which your adapter works. If there's a dark mode version, please also share that. + Pasting the image from your clipboard will upload the file to GitHub and create markdown formatting for it to be rendered inline diff --git a/.github/labeler.yml b/.github/labeler.yml index 176f1874009..316098eb51c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,6 +3,7 @@ developer blog: guides: - website/docs/guides/**/* +- website/docs/quickstarts/**/* content: - website/docs/**/* diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 90f4938d2cb..309872dd818 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,9 +10,10 @@ To learn more about the writing conventions used in the dbt Labs docs, see the [ -- [ ] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) and [About versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) so my content adheres to these guidelines. +- [ ] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. +- [ ] For [docs versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#about-versioning), review how to [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content). - [ ] Add a checklist item for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch." 
Adding new pages (delete if not applicable): @@ -22,4 +23,4 @@ Adding new pages (delete if not applicable): Removing or renaming existing pages (delete if not applicable): - [ ] Remove page from `website/sidebars.js` - [ ] Add an entry `website/static/_redirects` -- [ ] [Ran link testing](https://github.com/dbt-labs/docs.getdbt.com#running-the-cypress-tests-locally) to update the links that point to the deleted page +- [ ] Run link testing locally with `npm run build` to update the links that point to the deleted page diff --git a/.github/workflows/asana-connection.yml b/.github/workflows/asana-connection.yml new file mode 100644 index 00000000000..aced477bdac --- /dev/null +++ b/.github/workflows/asana-connection.yml @@ -0,0 +1,17 @@ +name: Show PR Status in Asana +on: + pull_request: + types: [opened, reopened] + +jobs: + create-asana-attachment-job: + runs-on: ubuntu-latest + name: Create pull request attachments on Asana tasks + steps: + - name: Create pull request attachments + uses: Asana/create-app-attachment-github-action@latest + id: postAttachment + with: + asana-secret: ${{ secrets.ASANA_SECRET }} + - name: Log output status + run: echo "Status is ${{ steps.postAttachment.outputs.status }}" diff --git a/.github/workflows/autogenerated_labeler.yml b/.github/workflows/autogenerated_labeler.yml new file mode 100644 index 00000000000..e6aab0492b8 --- /dev/null +++ b/.github/workflows/autogenerated_labeler.yml @@ -0,0 +1,40 @@ +# **what?** +# Labels issues autogenerated in dbt-core + +# **why?** +# To organize autogenerated issues from dbt-core to make it easier to find and track them. + +# **when?** +# When an issue is opened by the FishtownBuildBot + +name: Add Labels to Autogenerated Issues + +on: + issues: + types: [opened] + +jobs: + add_customized_labels: + if: github.event.issue.user.login == 'FishtownBuildBot' + permissions: + issues: write + + runs-on: ubuntu-latest + steps: + - name: "Determine appropriate labels by repo in title" + id: repo + env: + ISSUE_TITLE: ${{ github.event.issue.title }} + run: | + if [[ "$ISSUE_TITLE" == *"dbt-core"* ]]; then + echo "labels='content,improvement,dbt Core'" >> $GITHUB_OUTPUT + else + echo "labels='content,improvement,adapters'" >> $GITHUB_OUTPUT + fi + + - name: "Add Labels to autogenerated Issues" + id: add-labels + run: | + gh issue edit ${{ github.event.issue.number }} --repo ${{ github.repository }} --add-label ${{ steps.repo.outputs.labels }} + env: + GH_TOKEN: ${{ secrets.DOCS_SECRET }} diff --git a/.github/workflows/crawler.yml b/.github/workflows/crawler.yml new file mode 100644 index 00000000000..6bfce5321c5 --- /dev/null +++ b/.github/workflows/crawler.yml @@ -0,0 +1,33 @@ +name: Algolia Crawler +on: + pull_request: + types: + - closed + +jobs: + algolia_recrawl: + # Comment out the if check below if running on every merge to current branch + if: | + contains(github.event.pull_request.labels.*.name, 'trigger-crawl') + && github.event.pull_request.merged == true + name: Trigger Algolia Crawl + runs-on: ubuntu-latest + steps: + # Checkout repo + - name: Checkout Repo + uses: actions/checkout@v3 + + # Wait 8 minutes to allow Vercel build to complete + - run: sleep 480 + + # Once deploy URL is found, trigger Algolia crawl + - name: Run Algolia Crawler + uses: algolia/algoliasearch-crawler-github-actions@v1 + id: crawler_push + with: + crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} + crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} + algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} + algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} + 
site-url: 'https://docs.getdbt.com' + crawler-name: ${{ secrets.CRAWLER_NAME }} diff --git a/.github/workflows/label.yml b/.github/workflows/label.yml index 5ebef4f88ca..48615e60b9e 100644 --- a/.github/workflows/label.yml +++ b/.github/workflows/label.yml @@ -2,37 +2,45 @@ name: Add/Remove Labels on: pull_request_target: - types: [ opened, closed ] + types: [opened] jobs: add_new_contributor_label: if: github.event.action == 'opened' - permissions: - contents: read - pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/github-script@v6 - with: - script: | - const creator = context.payload.sender.login + - name: Add new contributor label + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.DOCS_SECRET }} + script: | + const creator = context.payload.sender.login; const opts = github.rest.issues.listForRepo.endpoint.merge({ ...context.issue, creator, - state: 'all' - }) - const issues = await github.paginate(opts) + state: 'all', + }); + + const issues = await github.paginate(opts); + + let isAlreadyContributor = false; + for (const issue of issues) { if (issue.number === context.issue.number) { - continue + continue; } - if (issue.pull_request) { - return // creator is already a contributor + if (issue.pull_request && issue.user.login === creator) { + isAlreadyContributor = true; + break; } } - await github.rest.issues.addLabels({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - labels: ['new contributor'] - }) + + if (!isAlreadyContributor) { + console.log('Adding label: new contributor'); + await github.rest.issues.addLabels({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['new contributor'], + }); + } diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 7e4bb5c268a..cc231cdcde3 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -5,8 +5,8 @@ name: "Pull Request Labeler" on: -- pull_request_target - + pull_request_target: + types: [opened] jobs: triage: permissions: diff --git a/.gitignore b/.gitignore index b2746893814..74d338484aa 100755 --- a/.gitignore +++ b/.gitignore @@ -11,10 +11,14 @@ website/yarn.lock website/node_modules website/i18n/* -# Local vs code +# IDE configs .vscode +.idea + # Local Netlify folder .netlify -.vscode .eslintcache + +# Local Vercel folder +.vercel diff --git a/README.md b/README.md index 4dfd8a8be9e..c749fedf95a 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Creating an inclusive and equitable environment for our documents is more import We welcome contributions from community members to this repo: - **Fixes**: When you notice an error, you can use the `Edit this page` button at the bottom of each page to suggest a change. - **New documentation**: If you contributed code in [dbt-core](https://github.com/dbt-labs/dbt-core), we encourage you to also write the docs here! Please reach out in the dbt community if you need help finding a place for these docs. -- **Major rewrites**: You can [file an issue](https://github.com/dbt-labs/docs.getdbt.com/issues/new?assignees=&labels=content%2Cimprovement&template=improve-docs.yml) or [start a discussion](https://github.com/dbt-labs/docs.getdbt.com/discussions) to propose ideas for a content area that requires attention. +- **Major rewrites**: You can [file an issue](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to propose ideas for a content area that requires attention. 
You can use components documented in the [docusaurus library](https://v2.docusaurus.io/docs/markdown-features/). @@ -42,7 +42,7 @@ You can add code snippets and other content in a tabbed view. To learn more abou # Running the Docs site locally -You can click a link available in a netlify bot PR comment to see and review your changes rendered on a staging server. You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/): +You can click a link available in a Vercel bot PR comment to see and review your changes rendered on a staging server. You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/): ## Prerequisites diff --git a/contributing/adding-page-components.md b/contributing/adding-page-components.md index 751f7c1f6c1..a07d0ff02e4 100644 --- a/contributing/adding-page-components.md +++ b/contributing/adding-page-components.md @@ -1,6 +1,6 @@ ## Using warehouse components -You can use the following components to provide code snippets for each supported warehouse. You can see a real-life example in the docs page [Initialize your project](/quickstarts/databricks?step=6). +You can use the following components to provide code snippets for each supported warehouse. You can see a real-life example in the docs page [Initialize your project](/guides/databricks?step=6). Identify code by labeling with the warehouse names: diff --git a/contributing/content-style-guide.md b/contributing/content-style-guide.md index eaa090a00b6..0d2bf243d45 100644 --- a/contributing/content-style-guide.md +++ b/contributing/content-style-guide.md @@ -229,7 +229,7 @@ When referring to different sections of the IDE, use the name of the section and People make use of titles in many places like table headers, section headings (such as an H2, H3, or H4), page titles, sidebars, and so much more. -When generating titles or updating them, use sentence case. It sets a more conversational tone to the docs—making the content more approachable and creating a friendly feel. +When generating titles or updating them, use sentence case. It sets a more conversational tone to the docs— making the content more approachable and creating a friendly feel. We've defined five content types you can use when contributing to the docs (as in, writing or authoring). Learn more about title guidelines for [each content type](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-types.md). @@ -239,7 +239,7 @@ Placeholder text is something that the user should replace with their own text. Use all capital letters([screaming snake case](https://fission.codes/blog/screaming-snake-case/)) to indicate text that changes in the user interface or that the user needs to supply in a command or code snippet. Avoid surrounding it in brackets or braces, which someone might copy and use, producing an error. -Identify what the user should replace the placeholder text with in the paragraph preceding the code snippet or command. +Identify what the user should replace the placeholder text within the paragraph preceding the code snippet or command. :white_check_mark: The following is an example of configuring a connection to a Redshift database. In your YAML file, you must replace `CLUSTER_ID` with the ID assigned to you during setup: @@ -276,7 +276,7 @@ Guidelines for making lists are: - There are at least two items. 
- All list items follow a consistent, grammatical structure (like each item starts with a verb, each item begins with a capitalized word, each item is a sentence fragment). - Lists items don't end in commas, semicolons, or conjunctions (like "and", "or"). However, you can use periods if they’re complete sentences. -- Introduce the list with a heading or, if it's within text, as a complete sentence or as a sentence fragment followed by a colon. +- Introduce the list with a heading or, if it's within the text, as a complete sentence or as a sentence fragment followed by a colon. If the list starts getting lengthy and dense, consider presenting the same content in a different format such as a table, as separate subsections, or a new guide. @@ -286,7 +286,7 @@ A bulleted list with introductory text: > A dbt project is a directory of `.sql` and .yml` files. The directory must contain at a minimum: > -> - Models: A model is a single `.sql` file. Each model contains a single `select` statement that either transforms raw data into a dataset that is ready for analytics, or, more often, is an intermediate step in such a transformation. +> - Models: A model is a single `.sql` file. Each model contains a single `select` statement that either transforms raw data into a dataset that is ready for analytics or, more often, is an intermediate step in such a transformation. > - A project file: A `dbt_project.yml` file, which configures and defines your dbt project. A bulleted list with sentence fragments: @@ -307,10 +307,10 @@ A numbered list following an H2 heading: ## Tables Tables provide a great way to present complex information and can help the content be more scannable for users, too. -There are many ways to construct a table, like row spanning and cell splitting. Make sure the content is clear, concise, and presents well on the web page (like avoid awkward word wrapping). +There are many ways to construct a table, such as row spanning and cell splitting. The content should be clear, concise, and presented well on the web page (for example, avoid awkward word wrapping). Guidelines for making tables are: -- Introduce the table with a heading or, if it's within text, as a complete sentence or as a sentence fragment followed by a colon. +- Introduce the table with a heading or, if it's within the text, as a complete sentence or as a sentence fragment followed by a colon. - Use a header row - Use sentence case for all content, including the header row - Content can be complete sentences, sentence fragments, or single words (like `Currency`) @@ -338,7 +338,7 @@ A table following an H3 heading: > | Name | Description | Values | > | -----| ----------- | ------ | > | `-help` | Displays information on how to use the command. | Doesn't take any values. | -> | `-readable` | Print output in human readable format. | | +> | `-readable` | Print output in human-readable format. | | > | `-file` | Print output to file instead of stdout. | Name of the file. | ## Cards @@ -349,7 +349,7 @@ You can configure a card in 2, 3, 4, or 5-column grids. To maintain a good user There won't be many instances where you need to display 4 or 5 cards on the docs site. While we recommend you use 2 or 3-column grids, you can use 4 or 5-column grids in the following scenarios: -- For cards that contain little text and limited to under 15 words. (This is to make sure the text isn't squished) +- For cards that contain little text and are limited to 15 words or less. This is to make sure the text isn't squished. 
- Always have the `hide_table_of_contents:` frontmatter set to `true` (This hides the right table of contents). Otherwise, the text will appear squished and provide users with a bad experience. @@ -360,7 +360,7 @@ Otherwise, the text will appear squished and provide users with a bad experience - ``: creates 5 columns cards (use sparingly) - You can't create cards with 6 or more columns as that would provide users a poor experience. -Refer to [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) and [Quickstarts](/docs/quickstarts/overview) as examples. +Refer to [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) and [Quickstarts](/docs/guides) as examples. ### Create cards @@ -371,16 +371,16 @@ To create cards in markdown, you need to: - Add the props within the card component, including `title`,`body`,`link`,`icon`. - Close out the div by using `` -Refer to the following prop list for detailed explanation and examples: +Refer to the following prop list for detailed explanations and examples: | Prop | Type | Info | Example | | ---- | ---- | ---- | ------- | | `title` | required | The title should be clear and explain an action the user should take or a product/feature. | `title: dbt Cloud IDE` | `body` | required | The body contains the actionable or informative text for the user. You can include `` | +| `icon` | optional but recommended | You can add an icon to the card component by using any icons found in the [icons](https://github.com/dbt-labs/docs.getdbt.com/tree/current/website/static/img/icons) directory.
* Icons are added in .svg format and you must add icons in two locations: website/static/img/icons and website/static/img/icons/white. This is so users can view the icons in dark or light mode on the docs.getdbt.com site. | ` icon="pencil-paper"/>` | -The following is an example of a 4 card column: +The following is an example of a 4-card column: ```
@@ -488,9 +488,24 @@ Avoid ending a sentence with a preposition unless the rewritten sentence would s Product names, trademarks, services, and tools should be written as proper nouns, unless otherwise specified by the company or trademark owner. +As of October 2023, avoid using "dbt CLI" or "CLI" terminology when referring to the dbt Cloud CLI or dbt Core. However, if referring to the command line as a tool, CLI is acceptable. + +dbt officially provides two command line tools for running dbt commands: + +- [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) — This tool allows you to develop locally and execute dbt commands against your dbt Cloud development environment from your local command line. +- [dbt Core](https://github.com/dbt-labs/dbt-core) — This open-source tool is designed for local installation, enabling you to use dbt Core on the command line and communicate with databases through adapters. + +Here are some examples of what to use and what to avoid:
+
+✅ Set up in the dbt Cloud CLI or dbt Core
+✅ Set up in the dbt Cloud CLI or dbt Core CLI
+
+❌ Set up via dbt CLI
+❌ Set up in dbt Cloud, **or** via the CLI
+ ### Terms to use or avoid -Use industry-specific terms and research new/improved terminology. Also refer to the Inclusive Language section of this style guide for inclusive and accessible language and style. +Use industry-specific terms and research new/improved terminology. Also, refer to the Inclusive Language section of this style guide for inclusive and accessible language and style. **DO NOT** use jargon or language familiar to a small subset of readers or assume that your readers understand ALL technical terms. @@ -507,11 +522,13 @@ sign in | log in, login sign up | signup terminal | shell username | login +dbt Cloud CLI | CLI, dbt CLI +dbt Core | CLI, dbt CLI
## Links -Links embedded in documentation are about trust. Users trust that we will lead them to sites or pages related to their reading content. In order to maintain that trust, it's important that links are transparent, up-to-date, and lead to legitimate resources. +Links embedded in the documentation are about trust. Users trust that we will lead them to sites or pages related to their reading content. In order to maintain that trust, it's important that links are transparent, up-to-date, and lead to legitimate resources. ### Internal links diff --git a/contributing/single-sourcing-content.md b/contributing/single-sourcing-content.md index ca27372e5bc..7c345a6631a 100644 --- a/contributing/single-sourcing-content.md +++ b/contributing/single-sourcing-content.md @@ -15,9 +15,9 @@ Versions are managed in the `versions` array located in the `website/dbt-version ### Adding a new version -To add a new version to the site, a new object must be added to the `versions` array in the same format as existing versions. This object holds two properties: **version** and **EOLDate (See End of Life Dates below)**. +To add a new version to the site, a new object must be added to the `versions` array in the same format as existing versions. This object holds two properties: **version** and **EOLDate (See End of Life Dates below)**. -Example Version: +Example Version: ```jsx exports.versions = [ @@ -36,7 +36,7 @@ The **EOLDate** property determines when a version is no longer supported. A ver When a documentation page is viewed, the **EOLDate** property for the active version is compared to today’s date. If the current version has reached or is nearing the end of support, a banner will show atop the page, notifying the visitor of the end-of-life status. -Two different versions of the banner will show depending on the end-of-life date: +Two different versions of the banner will show depending on the end-of-life date: - When the version is within 3 months of the **EOLDate.** - When the version has passed the **EOLDate.** @@ -76,7 +76,7 @@ exports.versionedPages = [ ## Versioning blocks of content -The **VersionBlock** component provides the ability to version a specific piece of content on a docs page. +The **VersionBlock** component provides the ability to version a specific piece of content on a docs page. This component can be added directly to a markdown file in a similar way as other components (FAQ, File, Lightbox). @@ -90,7 +90,7 @@ This component can be added directly to a markdown file in a similar way as othe Both properties can be used together to set a range where the content should show. In the example below, this content will only show if the selected version is between **0.21** and **1.0**: ```markdown - + Versioned content here @@ -99,7 +99,7 @@ Both properties can be used together to set a range where the content should sho ### Example for versioning entire pages -On the [Docs Defer page](https://docs.getdbt.com/reference/node-selection/defer), tabs are used to show different versions of a piece of code. **v0.21.0 and later** shows `--select`, while **v-.20.x and earlier** changes this to `--models`. +On the [Docs Defer page](https://docs.getdbt.com/reference/node-selection/defer), tabs are used to show different versions of a piece of code. **v0.21.0 and later** shows `--select`, while **v-.20.x and earlier** changes this to `--models`. 
![oldway](https://user-images.githubusercontent.com/3880403/163254165-dea23266-2eea-4e65-b3f0-c7b6d3e51fc3.png) @@ -149,7 +149,7 @@ Using a global variable requires two steps: exports.dbtVariables = { dbtCore: { name: "dbt Core" - } + } } ``` @@ -198,13 +198,13 @@ In the above example, the **dbtCloud** property has a default name of “dbt Clo ### Global variables example -The global `` component can be used inline, for example: +The global `` component can be used inline, for example: ```markdown This piece of markdown content explains why is awesome. ``` -However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. +However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. To use the component at the beginning of a sentence, add a non-breaking space character before the component: @@ -231,7 +231,7 @@ A partial file allows you to reuse content throughout the docs. Here are the ste 2. Go back to the docs file that will pull content from the partial file. 3. Add the following import file: `import ComponentName from '/snippets/_this-is-your-partial-file-name.md';` * You must always add an import file in that format. Note you can name `ComponentName` (a partial component) can be whatever makes sense for your purpose. - * `.md` needs to be added to the end of the filename. + * `.md` needs to be added to the end of the filename. 4. To use the partial component, go to the next line and add ``. This fetches the reusable content in the partial file * Note `anyname` can be whatever makes sense for your purpose. @@ -258,15 +258,15 @@ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam fermentum portti ```markdown Docs content here. -`import SetUpPages from '/snippets/_partial-name.md';` - - +import SetUpPages from '/snippets/_partial-name.md'; + + Docs content here. ``` - `import SetUpPages from '/snippets/_partial-name.md';` — A partial file that will be imported by other files -- `` — A component that imports content from the partial file. You can also use it to pass in data into the partial using props (See 'How to use props to pass different content on multiple pages?' below). +- `` — A component that imports content from the partial file. You can also use it to pass in data into the partial using props (See 'How to use props to pass different content on multiple pages?' below). 4. This will then render the content of the docs in the partial file. @@ -276,32 +276,32 @@ Docs content here.
How to use props to pass different content on multiple pages?
- + You can add props on the component only if you want to pass in data from the component into the partial file. This is useful for using the same partial component on multiple docs pages and displaying different values for each. For example, if we wanted to use a partial on multiple pages and pass in a different 'feature' for each docs page, you can write it as: -``` +```markdown import SetUpPages from '/snippets/_available-enterprise-only.md'; - -` + + ``` - + Then in the `/snippets/_available-enterprise-only.md file`, you can display that feature prop with: - + >This feature: `{props.feature}` other content etc... This will then translate to: - + >This feature: A really cool feature other content etc... In this example, the component ` ### Snippets -The Snippet component allows for content to be reusable throughout the Docs. This is very similar to the existing FAQ component. Using partial files, which is a built-in Docusaurus feature, is recommended over snippets. +The Snippet component allows for content to be reusable throughout the Docs. This is very similar to the existing FAQ component. Using partial files, which is a built-in Docusaurus feature, is recommended over snippets. Creating and using a snippet requires two steps: diff --git a/netlify.toml b/netlify.toml deleted file mode 100644 index 6ab92757410..00000000000 --- a/netlify.toml +++ /dev/null @@ -1,2 +0,0 @@ -[build] - functions = "functions" diff --git a/website/.gitignore b/website/.gitignore index ee62cc96f39..9d56e23a488 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -26,4 +26,7 @@ yarn-error.log* # feeds /static/feeds/atom.xml /static/feeds/rss.json -/static/feeds/rss.xml \ No newline at end of file +/static/feeds/rss.xml + +# Local Vercel folder +.vercel diff --git a/website/api/get-discourse-comments.js b/website/api/get-discourse-comments.js new file mode 100644 index 00000000000..5ac59cfe5f2 --- /dev/null +++ b/website/api/get-discourse-comments.js @@ -0,0 +1,169 @@ +const axios = require('axios') +require("dotenv").config(); + +const { DISCOURSE_DEVBLOG_API_KEY , DISCOURSE_USER_SYSTEM } = process.env +const DEVBLOG_PROD_URL = 'https://docs.getdbt.com/blog/' +const DEV_ENV = 'dev-' +const PREVIEW_ENV = 'deploy-preview-' + +// Set API endpoint and headers +let discourse_endpoint = `https://discourse.getdbt.com` +let headers = { + 'Accept': 'application/json', + 'Api-Key': DISCOURSE_DEVBLOG_API_KEY, + 'Api-Username': DISCOURSE_USER_SYSTEM, +} + +async function getDiscourseComments(request, response) { + let topicId, comments, DISCOURSE_TOPIC_ID; + + const blogUrl = await getBlogUrl(request) + + if (blogUrl === DEVBLOG_PROD_URL) { + DISCOURSE_TOPIC_ID = 21 + } else { + DISCOURSE_TOPIC_ID = 2 + } + + try { + const env = + blogUrl === DEVBLOG_PROD_URL + ? "" + : blogUrl.includes("localhost") + ? DEV_ENV + : PREVIEW_ENV; + const postTitle = `${env}${request.query.title}`; + const postSlug = request.query.slug; + const cleanSlug = cleanUrl(request.query.slug); + const externalId = truncateString(`${env}${cleanSlug}`); + + console.table({ + blogUrl, + postTitle, + postSlug, + cleanSlug, + externalId, + }); + + + if (!postSlug) throw new Error("Unable to query Discourse API. 
Error reading slug."); + + topicId = await searchDiscourseExternalId(externalId); + + // First check if the dev blog post exists in Discourse + // Get the comments if it does + if (typeof topicId === "number") { + comments = await getDiscourseTopicbyID(topicId); + } else { + // If the dev blog post does not exist in Discourse + // Create a new topic and get the comments + topicId = await createDiscourseTopic(postTitle, externalId, cleanSlug, blogUrl, DISCOURSE_TOPIC_ID); + if (typeof topicId === "number") { + comments = await getDiscourseTopicbyID(topicId); + comments.shift(); + comments = { topicId, comments }; + + return await response.status(200).json(comments); + } else { + console.log("Unable to create Discourse topic TopicID is not a number."); + return await response.status(500).json({ error: "Unable to create Discourse topic TopicID is not a number." }); + } + } + + comments.shift(); + comments = { topicId, comments }; + + return await response.status(200).json(comments); + } catch (err) { + console.log("err on getDiscourseComments", err); + return await response.status(500).json({ error: "Unable to get topics from Discourse." }); + } +} + +async function createDiscourseTopic(title, externalId, slug, blogUrl, DISCOURSE_TOPIC_ID) { + console.log(`Creating a new topic in Discourse - ${title}`) + try { + const response = await axios.post(`${discourse_endpoint}/posts`, { + title: title, + raw: `This is a companion discussion topic for the original entry at ${blogUrl}${slug}`, + category: DISCOURSE_TOPIC_ID, + embed_url: `${blogUrl}${slug}`, + external_id: externalId, + tags: ['devblog'], + visible: false + }, { headers }) + + let topicId = await response.data.topic_id + + console.log('Topic successfully created with topic_id', topicId) + + return topicId + + } catch(err) { + console.log('err on createDiscourseTopic', err) + return err + } +} + +async function getDiscourseTopicbyID(topicId) { + console.log(`Topic found setting topic id - ${topicId}`) + try { + let response = await axios.get(`${discourse_endpoint}/t/${topicId}.json`, { headers }) + let { data } = await response + let post_stream = data.post_stream + let post_count = data.posts_count + + // If there is more than one comment make the topic visibile in Discourse + if (post_count > 1 && data.visible === false) { + console.log(`Topic has more than one comment. Changing visibility to visible.`) + await axios.put(`${discourse_endpoint}/t/${topicId}`, { + visible: true + }, { headers }) + } + + // Filter only 'regular' posts in Discourse. (e.g. 
not moderator actions, small_actions, whispers) + post_stream.posts = post_stream.posts.filter(post => post.post_type === 1) + + return post_stream.posts + } catch(err) { + console.log('err on getDiscourseTopicbyID', err) + return err + } +} + +async function searchDiscourseExternalId(externalId) { + console.log(`Searching for external_id in Discourse - ${externalId}`); + try { + const data = await axios.get(`${discourse_endpoint}/t/external_id/${externalId}.json`, { headers }); + return data.data.id; + } catch (err) { + if (err.response.status === 404) { + console.log("No topics found in Discourse."); + return null; + } + console.log("Unable to search Discourse for external_id.", err); + return err; + } +} + + +// Truncate external_id to 50 characters per Discourse API requirements +function truncateString(str) { + if (str.length <= 50) { + return str + } + return str.slice(0, 50) +} + +// Remove query params and hash from URL to prevent duplicate topics +function cleanUrl(url) { + return url.split("?")[0].split("#")[0]; +} + +// Create a function to get the host name from the request and add /blog/ to the end +async function getBlogUrl(req) { + const host = req.headers.host + return `https://${host}/blog/` +} + +module.exports = getDiscourseComments; diff --git a/website/api/get-discourse-topics.js b/website/api/get-discourse-topics.js new file mode 100644 index 00000000000..90d6e5af80e --- /dev/null +++ b/website/api/get-discourse-topics.js @@ -0,0 +1,136 @@ +const axios = require('axios') + +async function getDiscourseTopics(request, response) { + const { DISCOURSE_API_KEY , DISCOURSE_USER } = process.env + + const body = request.body + + try { + // Set API endpoint and headers + let discourse_endpoint = `https://discourse.getdbt.com` + let headers = { + 'Accept': 'application/json', + 'Api-Key': DISCOURSE_API_KEY, + 'Api-Username': DISCOURSE_USER, + } + + const query = buildQueryString(body) + if(!query) throw new Error('Unable to build query string.') + + // Get topics from Discourse + let { data: { posts, topics } } = await axios.get(`${discourse_endpoint}/search?q=${query}`, { headers }) + + // Return empty array if no topics found for search query + // 200 status is used to prevent triggering Datadog alerts + if(!topics || topics?.length <= 0) { + // Log message with encoded query and end function + console.log('Unable to get results from api request.') + console.log(`Search query: ${query}`) + return await response.status(200).json([]) + } + + // Set author and like_count for topics if not querying by specific term + let allTopics = topics + if(!body?.term) { + allTopics = topics.reduce((topicsArr, topic) => { + // Get first post in topic + const firstTopicPost = posts?.find(post => + post?.post_number === 1 && + post?.topic_id === topic?.id + ) + // If post found + // Get username + if(firstTopicPost?.username) { + topic.author = firstTopicPost.username + } + // Get like count + if(firstTopicPost?.like_count) { + topic.like_count = firstTopicPost.like_count + } + + if(firstTopicPost?.blurb) { + topic.blurb = firstTopicPost.blurb + } + + // Push updated topic to array + topicsArr.push(topic) + + return topicsArr + }, []) + } + + // Return topics + //return await returnResponse(200, allTopics) + return await response.status(200).json(allTopics) + } catch(err) { + // Log and return the error + console.log('err', err) + return await response.status(500).json({ error: 'Unable to get topics from Discourse.'}) + } +} + +function buildQueryString(body) { + if(!body) return null + + // 
start with empty query string + let query = '' + + // check param and apply to query if set + for (const [key, value] of Object.entries(body)) { + // validate categories + // if valid, add to query string + if(validateItem({ key, value })) { + if(key === 'category') { + query += `#${value} ` + } else if(key === 'inString') { + query += `in:${value}` + } else if(key === 'status' && Array.isArray(value)) { + value?.map(item => { + query += `${key}:${item} ` + }) + } else { + query += `${key}:${value} ` + } + } + } + + if(query) { + const encodedQuery = encodeURIComponent(query) + return encodedQuery + } +} + +function validateItem({ key, value }) { + // predefined Discourse values + // https://docs.discourse.org/#tag/Search/operation/search + const inStringValues = ['title', 'first', 'pinned', 'wiki'] + const orderValues = ['latest', 'likes', 'views', 'latest_topic'] + const statusValues = ['open', 'closed', 'public', 'archived', 'noreplies', 'single_user', 'solved', 'unsolved'] + + // validate keys + if(key === 'inString') { + return inStringValues.includes(value) + ? true + : false + } else if(key === 'order') { + return orderValues.includes(value) + ? true + : false + } else if(key === 'status') { + if(Array.isArray(value)) { + let isValid = true + value?.map(item => { + if(!statusValues.includes(item)) isValid = false + }) + return isValid + } else { + return statusValues.includes(value) + ? true + : false + } + } else { + return true + } +} + +module.exports = getDiscourseTopics diff --git a/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md b/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md index 944d6fdd3f9..cdfd4da5f5d 100644 --- a/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md +++ b/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md @@ -13,6 +13,13 @@ date: 2020-07-01 is_featured: false --- +:::caution More up-to-date information available + +Since this blog post was first published, many data platforms have added support for [materialized views](/blog/announcing-materialized-views), which are a superior way to achieve the goals outlined here. We recommend them over the below approach. + + +::: + Before I dive into how to create this, I have to say this. **You probably don’t need this**. I, along with my other Fishtown colleagues, have spent countless hours working with clients that ask for near-real-time streaming data. However, when we start digging into the project, it is often realized that the use case is not there. There are a variety of reasons why near real-time streaming is not a good fit. Two key ones are: 1. The source data isn’t updating frequently enough. 
diff --git a/website/blog/2021-02-05-dbt-project-checklist.md b/website/blog/2021-02-05-dbt-project-checklist.md index dbe2c10f408..9820c279b0f 100644 --- a/website/blog/2021-02-05-dbt-project-checklist.md +++ b/website/blog/2021-02-05-dbt-project-checklist.md @@ -139,7 +139,7 @@ This post is the checklist I created to guide our internal work, and I’m shari * [Sources](/docs/build/sources/) * [Refs](/reference/dbt-jinja-functions/ref/) * [tags](/reference/resource-configs/tags/) -* [Jinja docs](/guides/advanced/using-jinja) +* [Jinja docs](/guides/using-jinja) ## ✅ Testing & Continuous Integration ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- @@ -156,7 +156,7 @@ This post is the checklist I created to guide our internal work, and I’m shari **Useful links** -* [Version control](/guides/legacy/best-practices#version-control-your-dbt-project) +* [Version control](/best-practices/best-practice-workflows#version-control-your-dbt-project) * [dbt Labs' PR Template](/blog/analytics-pull-request-template) ## ✅ Documentation @@ -252,7 +252,7 @@ Thanks to Christine Berger for her DAG diagrams! **Useful links** -* [How we structure our dbt Project](/guides/best-practices/how-we-structure/1-guide-overview) +* [How we structure our dbt Project](/best-practices/how-we-structure/1-guide-overview) * [Coalesce DAG Audit Talk](https://www.youtube.com/watch?v=5W6VrnHVkCA&t=2s) * [Modular Data Modeling Technique](https://getdbt.com/analytics-engineering/modular-data-modeling-technique/) * [Understanding Threads](/docs/running-a-dbt-project/using-threads) diff --git a/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md b/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md index 50d09625436..8a986a12f27 100644 --- a/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md +++ b/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md @@ -159,4 +159,4 @@ All of the above configurations “work”. And as detailed, they each solve for 2. Figure out what may be a pain point in the future and try to plan for it from the beginning. 3. Don’t over-complicate things until you have the right reason. As I said in my Coalesce talk: **don’t drag your skeletons from one closet to another** 💀! -**Note:** Our attempt in writing guides like this and [How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) aren’t to try to convince you that our way is right; it is to hopefully save you the hundreds of hours it has taken us to form those opinions! +**Note:** Our attempt in writing guides like this and [How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) aren’t to try to convince you that our way is right; it is to hopefully save you the hundreds of hours it has taken us to form those opinions! diff --git a/website/blog/2021-09-15-september-21-product-email.md b/website/blog/2021-09-15-september-21-product-email.md index c18f59a9be5..a3c9993befa 100644 --- a/website/blog/2021-09-15-september-21-product-email.md +++ b/website/blog/2021-09-15-september-21-product-email.md @@ -4,7 +4,6 @@ description: "dbt v1.0 is coming up! 
Don't forget to update your projects to the slug: dbt-product-update-2021-september authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-09-15 diff --git a/website/blog/2021-10-15-october-21-product-update-email.md b/website/blog/2021-10-15-october-21-product-update-email.md index 9e58514c50e..c235e43bf43 100644 --- a/website/blog/2021-10-15-october-21-product-update-email.md +++ b/website/blog/2021-10-15-october-21-product-update-email.md @@ -4,7 +4,6 @@ description: "Stay up-to-date with the latest features in dbt. Read about our Oc slug: dbt-product-update-2021-october authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-10-15 diff --git a/website/blog/2021-11-15-november-21-product-email.md b/website/blog/2021-11-15-november-21-product-email.md index d38685aad53..dd5d2b63956 100644 --- a/website/blog/2021-11-15-november-21-product-email.md +++ b/website/blog/2021-11-15-november-21-product-email.md @@ -4,7 +4,6 @@ description: "Stay up-to-date with the latest features in dbt. Read about our No slug: dbt-product-update-2021-november authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-11-15 diff --git a/website/blog/2021-11-23-how-to-upgrade-dbt-versions.md b/website/blog/2021-11-23-how-to-upgrade-dbt-versions.md index 87b3ea7bd1e..3aa9368a2ca 100644 --- a/website/blog/2021-11-23-how-to-upgrade-dbt-versions.md +++ b/website/blog/2021-11-23-how-to-upgrade-dbt-versions.md @@ -62,7 +62,7 @@ As noted above, the project is on 0.16.0 right now. 0.17.2 is the final patch re > > Practically, it also lets you lock in "checkpoints" of known-stable setups. If you need to pause your migration work to deal with an urgent request, you can safely deploy what you've finished so far instead of having a bunch of unrelated half-finished changes. -Review the migration guides to get an initial indication of what changes you might need to make. For example, in [the migration guide for 0.17.0](/guides/migration/versions), there are several significant changes to dbt's functionality, but it's unlikely that all of them will apply to your project. We'll cover this more later. +Review the migration guides to get an initial indication of what changes you might need to make. For example, in [the migration guide for 0.17.0](/docs/dbt-versions/core-upgrade), there are several significant changes to dbt's functionality, but it's unlikely that all of them will apply to your project. We'll cover this more later. ## Step 2: `Add require-dbt-version` to your `dbt_project.yml` file. @@ -126,9 +126,9 @@ In this case, our example project probably has dbt 0.3.0 installed. By reviewing ### Step 5b. Fix errors, then warnings -Obviously, errors that stop you from running your dbt project at all are the most important to deal with. Let's assume that our project used a too-broadly-scoped variable in a macro file, support for which was removed in v0.17. The [migration guide explains what to do instead](/guides/migration/versions), and it's a pretty straightforward fix. +Obviously, errors that stop you from running your dbt project at all are the most important to deal with. Let's assume that our project used a too-broadly-scoped variable in a macro file, support for which was removed in v0.17. The [migration guide explains what to do instead](/docs/dbt-versions/core-upgrade), and it's a pretty straightforward fix. -Once your errors are out of the way, have a look at warnings. 
For example, 0.17 introduced `config-version: 2` to `dbt_project.yml`. Although it's backwards compatible for now, we know that support for the old version will be removed in a future version of dbt so we might as well deal with it now. Again, the migration guide explains [what we need to do](/guides/migration/versions), and how to take full advantage of the new functionality in the future. +Once your errors are out of the way, have a look at warnings. For example, 0.17 introduced `config-version: 2` to `dbt_project.yml`. Although it's backwards compatible for now, we know that support for the old version will be removed in a future version of dbt so we might as well deal with it now. Again, the migration guide explains [what we need to do](/docs/dbt-versions/core-upgrade), and how to take full advantage of the new functionality in the future. ### Stay focused @@ -156,7 +156,7 @@ Once your compilation issues are resolved, it's time to run your job for real, t After that, make sure that your CI environment in dbt Cloud or your orchestrator is on the right dbt version, then open a PR. -If you're using [Slim CI](https://docs.getdbt.com/docs/guides/best-practices#run-only-modified-models-to-test-changes-slim-ci), keep in mind that artifacts aren't necessarily compatible from one version to another, so you won't be able to use it until the job you defer to has completed a run with the upgraded dbt version. This doesn't impact our example because support for Slim CI didn't come out until 0.18.0. +If you're using [Slim CI](https://docs.getdbt.com/docs/best-practices#run-only-modified-models-to-test-changes-slim-ci), keep in mind that artifacts aren't necessarily compatible from one version to another, so you won't be able to use it until the job you defer to has completed a run with the upgraded dbt version. This doesn't impact our example because support for Slim CI didn't come out until 0.18.0. ## Step 7. Merge and communicate diff --git a/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md b/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md index c6fff54b465..8db2407afdb 100644 --- a/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md +++ b/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md @@ -26,7 +26,7 @@ So let’s all commit to sharing our hard won knowledge with each other—and in The purpose of this blog is to double down on our long running commitment to contributing to the knowledge loop. -From early posts like ‘[The Startup Founders Guide to Analytics’](https://thinkgrowth.org/the-startup-founders-guide-to-analytics-1d2176f20ac1) to foundational guides like [‘How We Structure Our dbt Projects](/guides/best-practices/how-we-structure/1-guide-overview)’, we’ve had a long standing goal of working with the community to create practical, hands-on tutorials and guides which distill the knowledge we’ve been able to collectively gather. +From early posts like ‘[The Startup Founders Guide to Analytics’](https://thinkgrowth.org/the-startup-founders-guide-to-analytics-1d2176f20ac1) to foundational guides like [‘How We Structure Our dbt Projects](/best-practices/how-we-structure/1-guide-overview)’, we’ve had a long standing goal of working with the community to create practical, hands-on tutorials and guides which distill the knowledge we’ve been able to collectively gather. 
dbt as a product is based around the philosophy that even the most complicated problems can be broken down into modular, reusable components, then mixed and matched to create something novel. diff --git a/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md b/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md index 0a2ec874a22..b179c0f5c7c 100644 --- a/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md +++ b/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md @@ -91,7 +91,7 @@ The common skills needed for implementing any flavor of dbt (Core or Cloud) are: * SQL: ‘nuff said * YAML: required to generate config files for [writing tests on data models](/docs/build/tests) -* [Jinja](/guides/advanced/using-jinja): allows you to write DRY code (using [macros](/docs/build/jinja-macros), for loops, if statements, etc) +* [Jinja](/guides/using-jinja): allows you to write DRY code (using [macros](/docs/build/jinja-macros), for loops, if statements, etc) YAML + Jinja can be learned pretty quickly, but SQL is the non-negotiable you’ll need to get started. @@ -144,22 +144,22 @@ An analyst will be in the dark when attempting to debug this, and will need to r This can be perfectly ok, in the event your data team is structured for data engineers to exclusively own dbt modeling duties, but that’s a quite uncommon org structure pattern from what I’ve seen. And if you have easy solutions for this analyst-blindness problem, I’d love to hear them. Once the data has been ingested, dbt Core can be used to model it for consumption. Most of the time, users choose to either: -Use the dbt CLI+ [BashOperator](https://registry.astronomer.io/providers/apache-airflow/modules/bashoperator) with Airflow (If you take this route, you can use an external secrets manager to manage credentials externally), or +Use the dbt Core CLI+ [BashOperator](https://registry.astronomer.io/providers/apache-airflow/modules/bashoperator) with Airflow (If you take this route, you can use an external secrets manager to manage credentials externally), or Use the [KubernetesPodOperator](https://registry.astronomer.io/providers/kubernetes/modules/kubernetespodoperator) for each dbt job, as data teams have at places like [Gitlab](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_trusted_data.py#L72) and [Snowflake](https://www.snowflake.com/blog/migrating-airflow-from-amazon-ec2-to-kubernetes/). Both approaches are equally valid; the right one will depend on the team and use case at hand. | | Dependency management | Overhead | Flexibility | Infrastructure Overhead | |---|---|---|---|---| -| dbt CLI + BashOperator | Medium | Low | Medium | Low | +| dbt Core CLI + BashOperator | Medium | Low | Medium | Low | | Kubernetes Pod Operator | Very Easy | Medium | High | Medium | | | | | | | If you have DevOps resources available to you, and your team is comfortable with concepts like Kubernetes pods and containers, you can use the KubernetesPodOperator to run each job in a Docker image so that you never have to think about Python dependencies. Furthermore, you’ll create a library of images containing your dbt models that can be run on any containerized environment. However, setting up development environments, CI/CD, and managing the arrays of containers can mean a lot of overhead for some teams. Tools like the [astro-cli](https://github.com/astronomer/astro-cli) can make this easier, but at the end of the day, there’s no getting around the need for Kubernetes resources for the Gitlab approach. 
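For illustration, here is a minimal sketch of the "dbt Core CLI + BashOperator" pattern compared in the table above. It assumes Airflow 2.x and a worker environment that already has dbt Core installed; the DAG name, schedule, and `/opt/dbt` paths are placeholders rather than anything prescribed by this post.

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator

# Sketch only: each task shells out to the dbt Core CLI, so a failed dbt command
# surfaces as a failed Airflow task. Paths and schedule below are placeholders.
with DAG(
    dag_id="dbt_daily_run",
    start_date=datetime(2021, 11, 1),
    schedule_interval="@daily",
    catchup=False,
) as dag:
    dbt_run = BashOperator(
        task_id="dbt_run",
        bash_command="dbt run --project-dir /opt/dbt --profiles-dir /opt/dbt",
    )
    dbt_test = BashOperator(
        task_id="dbt_test",
        bash_command="dbt test --project-dir /opt/dbt --profiles-dir /opt/dbt",
    )

    dbt_run >> dbt_test  # run models first, then test them
```

The KubernetesPodOperator route runs the same commands, just inside a dedicated container image per task rather than on the shared worker.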
-If you’re just looking to get started or just don’t want to deal with containers, using the BashOperator to call the dbt CLI can be a great way to begin scheduling your dbt workloads with Airflow. +If you’re just looking to get started or just don’t want to deal with containers, using the BashOperator to call the dbt Core CLI can be a great way to begin scheduling your dbt workloads with Airflow. -It’s important to note that whichever approach you choose, this is just a first step; your actual production needs may have more requirements. If you need granularity and dependencies between your dbt models, like the team at [Updater does, you may need to deconstruct the entire dbt DAG in Airflow.](https://www.astronomer.io/guides/airflow-dbt#use-case-2-dbt-airflow-at-the-model-level) If you’re okay managing some extra dependencies, but want to maximize control over what abstractions you expose to your end users, you may want to use the [GoCardlessProvider](https://github.com/gocardless/airflow-dbt), which wraps the BashOperator and dbt CLI. +It’s important to note that whichever approach you choose, this is just a first step; your actual production needs may have more requirements. If you need granularity and dependencies between your dbt models, like the team at [Updater does, you may need to deconstruct the entire dbt DAG in Airflow.](https://www.astronomer.io/guides/airflow-dbt#use-case-2-dbt-airflow-at-the-model-level) If you’re okay managing some extra dependencies, but want to maximize control over what abstractions you expose to your end users, you may want to use the [GoCardlessProvider](https://github.com/gocardless/airflow-dbt), which wraps the BashOperator and dbt Core CLI. #### Rerunning jobs from failure @@ -176,7 +176,7 @@ Instead you can now use the following command: `dbt build --select result:error+ --defer --state ` … and that’s it! -You can see more examples [here](https://docs.getdbt.com/docs/guides/best-practices#run-only-modified-models-to-test-changes-slim-ci). +You can see more examples [here](https://docs.getdbt.com/docs/best-practices#run-only-modified-models-to-test-changes-slim-ci). This means that whether you’re actively developing or you simply want to rerun a scheduled job (because of, say, permission errors or timeouts in your database), you now have a unified approach to doing both. diff --git a/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md b/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md index c4de04a48c3..8ea387cf00c 100644 --- a/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md +++ b/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md @@ -69,7 +69,7 @@ In addition to learning the basic pieces of dbt, we're familiarizing ourselves w If we decide not to do this, we end up missing out on what the dbt workflow has to offer. If you want to learn more about why we think analytics engineering with dbt is the way to go, I encourage you to read the [dbt Viewpoint](/community/resources/viewpoint#analytics-is-collaborative)! -In order to learn the basics, we’re going to [port over the SQL file](/guides/migration/tools/refactoring-legacy-sql) that powers our existing "patient_claim_summary" report that we use in our KPI dashboard in parallel to our old transformation process. We’re not ripping out the old plumbing just yet. In doing so, we're going to try dbt on for size and get used to interfacing with a dbt project.
+In order to learn the basics, we’re going to [port over the SQL file](/guides/refactoring-legacy-sql) that powers our existing "patient_claim_summary" report that we use in our KPI dashboard in parallel to our old transformation process. We’re not ripping out the old plumbing just yet. In doing so, we're going to try dbt on for size and get used to interfacing with a dbt project. **Project Appearance** diff --git a/website/blog/2022-02-07-customer-360-view-census-playbook.md b/website/blog/2022-02-07-customer-360-view-census-playbook.md index 01bea4b09c5..71acb32fe94 100644 --- a/website/blog/2022-02-07-customer-360-view-census-playbook.md +++ b/website/blog/2022-02-07-customer-360-view-census-playbook.md @@ -30,7 +30,7 @@ In short, a jaffle is: *See above: Tasty, tasty jaffles.* -Jaffle Shop is a demo repo referenced in [dbt’s Getting Started Guide](/quickstarts), and its jaffles hold a special place in the dbt community’s hearts, as well as on Data Twitter™. +Jaffle Shop is a demo repo referenced in [dbt’s Getting Started Guide](/guides), and its jaffles hold a special place in the dbt community’s hearts, as well as on Data Twitter™. ![jaffles on data twitter](/img/blog/2022-02-08-customer-360-view/image_1.png) diff --git a/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md b/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md index 89fcb6f5890..954d6dca3b8 100644 --- a/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md +++ b/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md @@ -114,7 +114,7 @@ In the interest of getting a proof of concept out the door (I highly favor focus - Our own Dev, Prod & Publish databases - Our own code repository which we managed independently -- dbt CLI +- dbt Core CLI - Virtual Machine running dbt on a schedule None of us had used dbt before, but we’d heard amazing things about it. We hotly debated the choice between dbt and building our own lightweight stack, and looking back now, I couldn’t be happier with choosing dbt. While there was a learning curve that slowed us down initially, we’re now seeing the benefit of that decision. Onboarding new analysts is a breeze and much of the functionality we need is pre-built. The more we use the tool, the faster we are at using it and the more value we’re gaining from the product. diff --git a/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md b/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md index 91ad1080ce6..2ee774d4f1d 100644 --- a/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md +++ b/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md @@ -12,6 +12,10 @@ date: 2022-05-03 is_featured: true --- +:::info Different from dbt Cloud CLI +This blog explains how to use the `dbt-cloud-cli` Python library to create a data catalog app with dbt Cloud artifacts. This is different from the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), a tool that allows you to run dbt commands against your dbt Cloud development environment from your local command line. +::: + dbt Cloud is a hosted service that many organizations use for their dbt deployments. Among other things, it provides an interface for creating and managing deployment jobs. When triggered (e.g., cron schedule, API trigger), the jobs generate various artifacts that contain valuable metadata related to the dbt project and the run results. dbt Cloud provides a REST API for managing jobs, run artifacts and other dbt Cloud resources. 
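As a concrete illustration of that REST API, the sketch below triggers a job run with [Python Requests](https://requests.readthedocs.io/en/latest/). The account ID, job ID, and token are placeholders, and the endpoint path is an assumption based on dbt Cloud's v2 Administrative API rather than something taken from this post.

```python
import os

import requests

# Placeholders -- substitute your own dbt Cloud account, job, and token.
ACCOUNT_ID = 12345
JOB_ID = 67890
API_TOKEN = os.environ["DBT_CLOUD_API_TOKEN"]

# Assumed v2 Administrative API endpoint for triggering a job run.
url = f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}/jobs/{JOB_ID}/run/"

response = requests.post(
    url,
    headers={"Authorization": f"Token {API_TOKEN}"},
    json={"cause": "Triggered via API"},
)
response.raise_for_status()

# The response body nests the new run under a "data" key.
print(response.json()["data"]["id"])
```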
Data/analytics engineers would often write custom scripts for issuing automated calls to the API using tools like [cURL](https://curl.se/) or [Python Requests](https://requests.readthedocs.io/en/latest/). In some cases, the engineers would go on and copy/rewrite them between projects that need to interact with the API. Now, they have a bunch of scripts on their hands that they need to maintain and develop further if business requirements change. If only there was a dedicated tool for interacting with the dbt Cloud API that abstracts away the complexities of the API calls behind an easy-to-use interface… Oh wait, there is: [the dbt-cloud-cli](https://github.com/data-mie/dbt-cloud-cli)! diff --git a/website/blog/2022-05-17-stakeholder-friendly-model-names.md b/website/blog/2022-05-17-stakeholder-friendly-model-names.md index 0e0ccad5c96..39107035465 100644 --- a/website/blog/2022-05-17-stakeholder-friendly-model-names.md +++ b/website/blog/2022-05-17-stakeholder-friendly-model-names.md @@ -157,7 +157,7 @@ These 3 parts go from least granular (general) to most granular (specific) so yo ### Coming up... -In this part of the series, we talked about why the model name is the center of understanding for the purpose and content within a model. In the in the upcoming ["How We Structure Our dbt Projects"](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) guide, you can explore how to use this naming pattern with more specific examples in different parts of your dbt DAG that cover regular use cases: +In this part of the series, we talked about why the model name is the center of understanding for the purpose and content within a model. In the upcoming ["How We Structure Our dbt Projects"](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) guide, you can explore how to use this naming pattern with more specific examples in different parts of your dbt DAG that cover regular use cases: - How would you name a model that is filtered on some columns - Do we recommend naming snapshots in a specific way diff --git a/website/blog/2022-06-30-lower-sql-function.md b/website/blog/2022-06-30-lower-sql-function.md index c50af5f3fb3..3f7cff44ccb 100644 --- a/website/blog/2022-06-30-lower-sql-function.md +++ b/website/blog/2022-06-30-lower-sql-function.md @@ -75,7 +75,7 @@ After running this query, the `customers` table will look a little something lik Now, all characters in the `first_name` and `last_name` columns are lowercase. > **Where do you lower?** -> Changing all string columns to lowercase to create uniformity across data sources typically happens in our dbt project’s [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lowercasing, should ideally happen in staging models to create downstream uniformity. It’s also more performant in downstream models that join on string values to join on strings that are of all the same casing versus having to join and perform lowercasing at the same time. +> Changing all string columns to lowercase to create uniformity across data sources typically happens in our dbt project’s [staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lowercasing, should ideally happen in staging models to create downstream uniformity.
It’s also more performant in downstream models that join on string values to join on strings that are of all the same casing versus having to join and perform lowercasing at the same time. ## Why we love it diff --git a/website/blog/2022-07-19-migrating-from-stored-procs.md b/website/blog/2022-07-19-migrating-from-stored-procs.md index 691284a49e9..e2afdbfcd66 100644 --- a/website/blog/2022-07-19-migrating-from-stored-procs.md +++ b/website/blog/2022-07-19-migrating-from-stored-procs.md @@ -54,7 +54,7 @@ With dbt, we work towards creating simpler, more transparent data pipelines like ![Diagram of what data flows look like with dbt. It's easier to trace lineage in this setup.](/img/blog/2022-07-19-migrating-from-stored-procs/dbt-diagram.png) -Tight [version control integration](https://docs.getdbt.com/docs/guides/best-practices#version-control-your-dbt-project) is an added benefit of working with dbt. By leveraging the power of git-based tools, dbt enables you to integrate and test changes to transformation pipelines much faster than you can with other approaches. We often see teams who work in stored procedures making changes to their code without any notion of tracking those changes over time. While that’s more of an issue with the team’s chosen workflow than a problem with stored procedures per se, it does reflect how legacy tooling makes analytics work harder than necessary. +Tight [version control integration](https://docs.getdbt.com/docs/best-practices#version-control-your-dbt-project) is an added benefit of working with dbt. By leveraging the power of git-based tools, dbt enables you to integrate and test changes to transformation pipelines much faster than you can with other approaches. We often see teams who work in stored procedures making changes to their code without any notion of tracking those changes over time. While that’s more of an issue with the team’s chosen workflow than a problem with stored procedures per se, it does reflect how legacy tooling makes analytics work harder than necessary. ## Methodologies for migrating from stored procedures to dbt diff --git a/website/blog/2022-07-26-pre-commit-dbt.md b/website/blog/2022-07-26-pre-commit-dbt.md index e0b41d82d0c..e75bd622293 100644 --- a/website/blog/2022-07-26-pre-commit-dbt.md +++ b/website/blog/2022-07-26-pre-commit-dbt.md @@ -12,7 +12,7 @@ is_featured: true *Editor's note — since the creation of this post, the package pre-commit-dbt's ownership has moved to another team and it has been renamed to [dbt-checkpoint](https://github.com/dbt-checkpoint/dbt-checkpoint). A redirect has been set up, meaning that the code example below will still work. It is also possible to replace `repo: https://github.com/offbi/pre-commit-dbt` with `repo: https://github.com/dbt-checkpoint/dbt-checkpoint` in your `.pre-commit-config.yaml` file.* -At dbt Labs, we have [best practices](https://docs.getdbt.com/docs/guides/best-practices) we like to follow for the development of dbt projects. One of them, for example, is that all models should have at least `unique` and `not_null` tests on their primary key. But how can we enforce rules like this? +At dbt Labs, we have [best practices](https://docs.getdbt.com/docs/best-practices) we like to follow for the development of dbt projects. One of them, for example, is that all models should have at least `unique` and `not_null` tests on their primary key. But how can we enforce rules like this? That question becomes difficult to answer in large dbt projects. 
Developers might not follow the same conventions. They might not be aware of past decisions, and reviewing pull requests in git can become more complex. When dbt projects have hundreds of models, it's hard to know which models do not have any tests defined and aren't enforcing your conventions. @@ -112,7 +112,7 @@ The last step of our flow is to make those pre-commit checks part of the day-to- Adding periodic pre-commit checks can be done in 2 different ways, through CI (Continuous Integration) actions, or as git hooks when running dbt locally -#### a) Adding pre-commit-dbt to the CI flow (works for dbt Cloud and dbt CLI users) +#### a) Adding pre-commit-dbt to the CI flow (works for dbt Cloud and dbt Core users) The example below will assume GitHub actions as the CI engine but similar behavior could be achieved in any other CI tool. @@ -237,9 +237,9 @@ With that information, I could now go back to dbt, document my model customers a We could set up rules that prevent any change to be merged if the GitHub action fails. Alternatively, this action step can be defined as merely informational. -#### b) Installing the pre-commit git hooks (for dbt CLI users) +#### b) Installing the pre-commit git hooks (for dbt Core users) -If we develop locally with the dbt CLI, we could also execute `pre-commit install` to install the git hooks. What it means then is that every time we want to commit code in git, the pre-commit hooks will run and will prevent us from committing if any step fails. +If we develop locally with the dbt Core CLI, we could also execute `pre-commit install` to install the git hooks. What it means then is that every time we want to commit code in git, the pre-commit hooks will run and will prevent us from committing if any step fails. If we want to commit code without performing all the steps of the pre-hook we could use the environment variable SKIP or the git flag `--no-verify` as described [in the documentation](https://pre-commit.com/#temporarily-disabling-hooks). (e.g. we might want to skip the auto `dbt docs generate` locally to prevent it from running at every commit and rely on running it manually from time to time) diff --git a/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md b/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md deleted file mode 100644 index 3615a6204d6..00000000000 --- a/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md +++ /dev/null @@ -1,173 +0,0 @@ ---- -title: "Understanding the components of the dbt Semantic Layer" -description: "Heard about dbt Metrics or the dbt Semantic Layer and curious to give them a try? Callum McCann digs into what they are, walks through an example, and discusses how they all fit together!" -slug: understanding-the-components-of-the-dbt-semantic-layer - -authors: [callum_mccann] - -tags: [dbt product updates] -hide_table_of_contents: false - -date: 2022-07-27 -is_featured: true ---- - -# Getting started with the dbt Semantic Layer - -> TLDR: The Semantic Layer is made up of a combination of open-source and SaaS offerings and is going to change how your team defines and consumes metrics. - -At last year's Coalesce, Drew showed us the future[^1] - a vision of what metrics in dbt could look like. Since then, we've been getting the infrastructure in place to make that vision a reality. 
We wanted to share with you where we are today and how it fits into the broader picture of [where we're going](https://www.getdbt.com/blog/dbt-semantic-layer). - -To those who haven't followed this saga with the intensity of [someone watching their investments on the crypto market](https://mobile.twitter.com/scannergr1/status/1536198701215109122/photo/1), we're rolling out this new resource to help you better understand the dbt Semantic Layer and provide clarification on the following things: - -1. What is the dbt Semantic Layer? -2. How do I use it? -3. What is publicly available now? -4. What is still in development? - -With that, lets get into it! - - - -> Some of you might have been around when this was initially being referred to as the Metrics Layer. As we evaluated the long term plans for what this part of dbt was going to become, we realized that naming it the Semantic Layer better reflected its capabilities and where we plan on taking it. - -## What is the dbt Semantic Layer? - -The dbt Semantic Layer is a new part of dbt to help improve precision and consistency while expanding flexibility and capability in the modern data stack. Our maestro of metrics, Drew Banin, [released a blog post detailing the vision of where we're going here](https://www.getdbt.com/blog/dbt-semantic-layer). The first use case that we are addressing is one that most practicioners **and** stakeholders are familiar with - metrics. We'll walk through what this looks like in practice later on in this post. - -Under the hood, the dbt Semantic layer is collection of several components - some of these are part of dbt Core, some part of dbt Cloud, and some are net new functionality. They all [combine together like Voltron](https://www.youtube.com/watch?v=5rPSLQxMT8w) to create a single experience through which business users can query data in the context of the metric that is most familiar to them. And the best part is that they can do it in systems they are already comfortable using. - -***What will this look like for my data consumers and business stakeholders?*** - -Ultimately, this looks like people being able to interact with trusted datasets in the tools that they are comfortable with (and eventually new tools designed specifically around metrics). - -An example that we’ve found helpful is [ARR](https://www.zuora.com/billing-topics/annual-recurring-revenue/#:~:text=Annual%20Recurring%20Revenue%2C%20or%20ARR,for%20a%20single%20calendar%20year). A business-critical metric to SaaS companies, ARR can be a tricky calculation to keep consistent across all of the tools used in the business. With the dbt Semantic Layer, this definition would live in dbt and the logic to create the dataset for that metric would be consistent across all different consuming experiences. Best of all, definition changes would get reflected in downstream tools, so you no longer need to manually search and update every downstream dependency. Callum of 3 years ago is jumping with joy. - -***That’s good and all, but what does this look like for practitioners to use?*** - -The dbt Semantic layer is comprised of the following components[^2]: - -**Available Today** - -- **[`metric` node in dbt Core :](/docs/build/metrics)** Similar to `models` or `sources` , this is a specific node type in dbt Core. It is the definition of a time-series aggregation over a table that supports zero or more dimensions. The resulting node is stored in the `manifest.json` just like `models` and referenced in the DAG. 
-- **[`dbt_metrics` package:](https://github.com/dbt-labs/dbt_metrics)** this package provides macros that combine the version-controlled metric definition and query-time parameters (like dimensions, a time grain, and secondary calculations) to generate a SQL query which calculates the metric value. -- **[dbt Cloud Metadata API](https://docs.getdbt.com/docs/dbt-cloud-apis/metadata-api):** a GraphQL API which supports arbitrary queries over the metadata produced by dbt Cloud jobs. Contains metadata related to the accuracy, recency, configuration, and structure of the views and tables in the warehouse, as well as much more. - -**New** - -- **dbt Server:** this component wraps dbt Core in a persistent server that is responsible for handling RESTful API requests for dbt operations. It’s a thin interface that is primarily responsible for performance and reliability in production environments. -- **dbt Cloud proxy server:** this component enables dbt Cloud to dynamically rewrite requests to a data warehouse and compile dbt-SQL into raw SQL that the database understands. It then returns the dataset produced by the raw SQL to the platform that sent it. - -![Untitled](/img/blog/2022-07-27-getting-started-with-the-dbt-semantic-layer/semantic-layer-description.png) - -### Understanding how and when to use metrics? - -> Use of metrics and the metrics package is recommended for experienced dbt users and early adopters who want to explore this functionality. - -Let's walk through an example of how you can use the components above to get started today using our old friend - [the Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics). We'll take a look at how you can start defining and testing metrics today as well as how you'll interact with them once the new components are released. - -**When to use Metrics** - -The first question you need to ask is, *Should we be using metrics?* - -It is our belief that metrics are not a one-size fits all solution. They are designed for core business metrics where consistency and precision are of key importance, not for exploratory use cases or ad hoc analysis. Our shorthand way of determining whether the metric should be defined in dbt has been - *is this something our teams need to report on?* - -So, let’s say the CFO of our Jaffle comes to us on a Monday morning and commands the data team to overhaul how we're reporting on Revenue. Our Regional Manager Jim and Sales Director Pam[^3] have been giving him different reports! Right now its a mess of tools and inconsistencies - Jim’s numbers are defined in Tableau and say one thing, Pam’s within Hex and say another! The CFO is frustrated with it and wants a cohesive experience across the company where everyone has the same numbers for revenue. It passes the report test, it’s an important business metric; away we go! - -**Defining the Metric with Metric Node** - -In this example, we’ll say that both Jim and Pam are pulling from a table created by dbt called `orders`. It currently contains fields for `amount` and all different methods of payment_amounts, such as credit cards or gift cards. Jim has been calculating revenue by summing up the `credit_card_amount` and `gift_card_amount` fields, as he forgot to update his definition when the business added coupons and bank transfers payments. Meanwhile, Pam is correctly summing the `amount` field but hasn’t accounted for return orders that shouldn’t be counted! - -The first step is creating a unified definition for what revenue is. 
In order to do this, we will create the following yml definition within our dbt repo: - -```yaml -version: 2 - -metrics: - - name: revenue - label: Revenue - model: ref('orders') - description: "The total revenue of our jaffle business" - - type: sum - sql: amount - - timestamp: order_date - time_grains: [day, week, month, year] - - dimensions: - - customer_status - - has_coupon_payment - - has_bank_transfer_payment - - has_credit_card_payment - - has_gift_card_payment - - filters: - - field: status - operator: '=' - value: "'completed'" -``` - -This metric has now been defined in the dbt metadata and can be seen in the DAG! - -![Untitled](/img/blog/2022-07-27-getting-started-with-the-dbt-semantic-layer/metrics-dag.png) - -**Running The Metric Package To calculate the metric** - -In order to ensure that both Jim and Pam are retrieving the same numbers for their metric, we’ll need them to both run a metrics `calculate` query. In this example, we’re not interested in the specific payment types and only want to see revenue broken up by `week` and `customer_status`. - -```sql -select * -from {{ metrics.calculate( - metric('revenue'), - grain='week', - dimensions=['customer_status'] -) }} -``` -This would return a dataset that looks like this: - -| date_week | customer_status | revenue | -| --- | --- | --- | -| 2018-01-01 | Churn Risk | 43 | -| 2018-01-01 | Churned | 0 | -| 2018-01-01 | Healthy | 26 | -| 2018-01-08 | Churn Risk | 27 | - -Jim and Pam would then be able to reference the `revenue` column within the newly created dataset and never have to worry about the calculation of revenue ever again[^4]! The world is perfect and [balance has been restored.](https://www.youtube.com/watch?v=d1EnW4kn1kg) - -**In the near future with dbt Server** - -When dbt Server releases later this year, the flow of how metrics are consumed will change significantly. Your organization will no longer need to materialize each metric within a model in order to take advantage of the metric definition. Instead, you’ll be able to directly query dbt Server with the metric code provided and have the correct dataset returned to your BI tool of choice. - -Additionally, integration partners will have built out experiences around Metrics using the Metadata API to create unique and creative ways for consumers to obtain metric data while abstracting away complexity. For example, a box that allows the user to select from a list of metrics, time grains, dimensions, and secondary calculation and then have the correct information returned to them regardless of the selection! - -### So what is publicly available now? - -Right now, the two main open-source components that are publicly available are the [`metric` node](/docs/build/metrics) within dbt Core and the `dbt_metrics` package. Combined, these two can operate an introductory semantic layer experience by allowing analytics engineers to define metrics and then query that metric via the metrics package. - -These two components are a static experience that have to be defined in the dbt project (as the selected dimensions are defined at model creation) but are useful for those who want to ensure that metrics remain consistent across every BI tool. If you identify with any of the following conditions, you could be a good fit for implementing this as it exists today: - -- You want to prepare your organization for the full Semantic Layer launch. 
-- Your organization has at least a few key metrics -- Your organization uses 1 or more BI tools -- Your organization occasionally has issues around different metric calculations -- Your organization wants a centralized location for all metrics so everyone in the business knows where to look - -All of these are great reasons to begin exploring implementing metrics in your dbt project! If you’re curious about what an implementation of this might look like, we recommend referencing the [jaffle_shop_metrics](https://github.com/dbt-labs/jaffle_shop_metrics) repo! - -### What is still in development? - -Both the dbt Cloud proxy server and dbt Server are currently in development, with a scheduled release of later this year. If you’re curious about testing them once they are released, we recommend keeping an eye on our product announcements and then reaching out once they become publicly available! - -### What if I have questions? - -If you have any questions about those components, or metrics in general, please feel free to post in the #dbt-metrics-and-server channel on dbt Slack! I hang around there and am always willing to chat metrics! - -### Footnotes -[^1]: That future may not have mentioned robots but I'm holding out for [Jetson's style morning machine](https://www.youtube.com/watch?v=-0S3Jf-NxdI) to help me get ready in the morning. - -[^2]: We’re specifically calling out the licensing because there is a lot of confusion in the community around what is open-source and what isn’t. This is only becoming trickier with the introduction of the BSL licensing, which ensures users can run their own server but it cannot be sold as a cloud service. For more information on why these licensing types were picked, we recommend [Tristan’s blog around licensing dbt.](https://www.getdbt.com/blog/licensing-dbt/). The big takeaway around licensing is that you can still run components of the dbt Semantic Layer even if you aren’t a dbt Cloud customer! - -[^3]: Full transparency, I've never seen the Office. The awkward humor makes me so uncomfortable that I have to turn off the TV. Apologies if the titles of the characters are incorrect. - -[^4]: Psych! They’re definitely interested in the calculation of ARR. In fact, they don’t really trust the numbers **unless** they understand how it’s calculated. This is where they could use the Metadata API in order to query all the information about the metric, such as definition, run-time, acceptable dimensions, etc. Right now Jim and Pam would need to query the API directly but in the future we expect there to be a number of different ways to obtain this information, ranging from [direct integration with the BI tool](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration) all the way to having that information materialized in a dbt information schema! *For current tabular alternatives, there are some interesting macros in the newly released [dbt-project-evaluator package](https://github.com/dbt-labs/dbt-project-evaluator). 
Take a look there if you’re curious about materializing your metric information!* \ No newline at end of file diff --git a/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md b/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md index 020a48c763f..e6a8b943051 100644 --- a/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md +++ b/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md @@ -286,7 +286,7 @@ Developing an analytic code base is an ever-evolving process. What worked well w 4. **Test on representative data** - Testing on a [subset of data](https://docs.getdbt.com/guides/legacy/best-practices#limit-the-data-processed-when-in-development) is a great general practice. It allows you to iterate quickly, and doesn’t waste resources. However, there are times when you need to test on a larger dataset for problems like disk spillage to come to the fore. Testing on large data is hard and expensive, so make sure you have a good idea of the solution before you commit to this step. + Testing on a [subset of data](https://docs.getdbt.com/best-practices/best-practice-workflows#limit-the-data-processed-when-in-development) is a great general practice. It allows you to iterate quickly, and doesn’t waste resources. However, there are times when you need to test on a larger dataset for problems like disk spillage to come to the fore. Testing on large data is hard and expensive, so make sure you have a good idea of the solution before you commit to this step. 5. **Repeat** diff --git a/website/blog/2022-08-22-narrative-modeling.md b/website/blog/2022-08-22-narrative-modeling.md index a5418ccded1..a74c73fdbd1 100644 --- a/website/blog/2022-08-22-narrative-modeling.md +++ b/website/blog/2022-08-22-narrative-modeling.md @@ -177,7 +177,7 @@ To that final point, if presented with the DAG from the narrative modeling appro ### Users can tie business concepts to source data -- While the schema structure above is focused on business entities, there are still ample use cases for [staging and intermediate tables](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). +- While the schema structure above is focused on business entities, there are still ample use cases for [staging and intermediate tables](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview). - After cleaning up source data with staging tables, use the same “what happened” approach to more technical events, creating a three-node dependency from `stg_snowplow_events` to `int_page_click_captured` to `user_refreshed_cart` and thus answering the question “where do we get online user behavior information?” in a quick visit to the DAG in dbt docs. # Should your team use it? diff --git a/website/blog/2022-08-31-august-product-update.md b/website/blog/2022-08-31-august-product-update.md index 143d46a37d3..bd9d8ee0b28 100644 --- a/website/blog/2022-08-31-august-product-update.md +++ b/website/blog/2022-08-31-august-product-update.md @@ -4,7 +4,6 @@ description: "Coalesce is less than 2 months away!" slug: dbt-product-update-2022-august authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2022-08-31 @@ -23,7 +22,7 @@ You’ll hear more in [Tristan’s keynote](https://coalesce.getdbt.com/agenda/k ## **What's new** -- **dbt Core v1.3 beta:** Do you use Python for analytics? 
The first beta prerelease of dbt Core v1.3—including support for dbt models written in Python—is [ready to explore](https://docs.getdbt.com/guides/migration/versions/upgrading-to-v1.3)! Check it out, and read more about dbt supported Python models [in our docs](/docs/build/python-models). +- **dbt Core v1.3 beta:** Do you use Python for analytics? The first beta prerelease of dbt Core v1.3—including support for dbt models written in Python—is [ready to explore](https://docs.getdbt.com/docs/dbt-versions/core-upgrade/upgrading-to-v1.3)! Check it out, and read more about dbt supported Python models [in our docs](/docs/build/python-models). - **Technology Partner Program:** We just launched our new [Technology Partner Program](https://www.getdbt.com/blog/dbt-labs-technology-partner-program/) with 40+ friends in the Modern Data Stack to provide consistent support for seamless integrations joint-users can trust. Check our new [dbt Cloud integrations page](http://www.getdbt.com/product/integrations) for what’s available today! - **Single-tenant users:** dbt Cloud v1.1.60 is now available on dbt Cloud Enterprise. diff --git a/website/blog/2022-09-08-konmari-your-query-migration.md b/website/blog/2022-09-08-konmari-your-query-migration.md index f7d7cc74ead..c1472058150 100644 --- a/website/blog/2022-09-08-konmari-your-query-migration.md +++ b/website/blog/2022-09-08-konmari-your-query-migration.md @@ -108,7 +108,7 @@ Here are a few things to look for: ## Steps 4 & 5: Tidy by category and follow the right order—upstream to downstream -We are ready to unpack our kitchen. Use your design as a guideline for [modularization](/guides/best-practices/how-we-structure/1-guide-overview). +We are ready to unpack our kitchen. Use your design as a guideline for [modularization](/best-practices/how-we-structure/1-guide-overview). - Build your staging tables first, and then your intermediate tables in your pre-planned buckets. - Important, reusable joins that are performed in the final query should be moved upstream into their own modular models, as well as any joins that are repeated in your query. diff --git a/website/blog/2022-10-12-how-to-design-and-structure-metrics.md b/website/blog/2022-10-12-how-to-design-and-structure-metrics.md deleted file mode 100644 index 4f738543dff..00000000000 --- a/website/blog/2022-10-12-how-to-design-and-structure-metrics.md +++ /dev/null @@ -1,394 +0,0 @@ ---- -title: "How to design and structure dbt metrics: Recommendations for getting started" -description: "The introduction of the dbt Semantic Layer expands what users can do with dbt but introduces a familiar questions around where logic should live. Read along as the dbt Labs team talks about best practices through the lens of two different examples!" -slug: how-to-design-and-structure-metrics - -authors: [callum_mccann] - -tags: [dbt product updates] -hide_table_of_contents: false - -date: 2022-10-12 -is_featured: true ---- - ---- - -**IMPORTANT:** This document serves as the temporary location for information on how to design and structure your metrics. It is our intention to take this content and turn it into a Guide, like [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), but we feel that codifying information in a Guide first requires that metrics be rigorously tested by the community so that best practices can arise. This document contains our early attempts to create best practices. 
In other words, read these as suggestions for a new paradigm and share in the community where they do (or don’t) match your experiences! You can find more information on where to do this at the end. - ---- - -## The power of a semantic layer on top of a mature data modeling framework - -As a longtime [dbt Community](https://www.getdbt.com/community/join-the-community/) member, I knew I had to get involved when I first saw the dbt Semantic Layer in the now infamous [`dbt should know about metrics` Github Issue](https://github.com/dbt-labs/dbt-core/issues/4071). It gave me a vision of a world where metrics and business logic were unified across an entire organization; a world where the data team was no longer bound to a single consuming experience and could enable their stakeholders in dozens of different ways. To me, it felt like the opportunity to contribute to the next step of what dbt could become. - -In past roles, I’ve been referred to as the `dbt zealot` and I’ll gladly own that title! It’s not a surprise - dbt was built to serve data practitioners expand the power of our work with software engineering principles. It gave us flexibility and power to serve our organizations. But I always wondered if there were more folks who could directly benefit from interacting with dbt. - -The Semantic Layer expands the reach of dbt **by coupling dbt’s mature data modeling framework with semantic definitions.** The result is a first of its kind data experience that serves both the data practitioners writing your analytics code and stakeholders who depend on it. Metrics are the first step towards this vision, allowing users to version control and centrally define their key business metrics in a single repo while also serving them to the entire business. - -However, this is still a relatively new part of the dbt toolbox and you probably have a lot of questions on how exactly you can do that. This blog contains our early best practice recommendations for metrics in two key areas: -- **Design**: What logic goes into metrics and how to use calculations, filters, dimensions, etc. -- **Structure**: Where these metrics will live in your dbt project and how to compose the files that contain your metrics - -We developed these recommendations by combining the overall philosophy of dbt, with our hands-on learning gathered during the beta period and internal testing. - - - -**Pre-reading:** We recommend reading through the [metrics documentation](/docs/build/metrics), which contains a table of all the required/optional properties. - -### When to put business logic in the semantic layer vs the modeling layer - -Our instinct when designing metrics might be to encode as much information as possible into the semantic layer. An example of this is case statements - the analytics engineer’s gut instinct might be to mimic tools of the past and provide complicated case statements for the metric `expression` property to try and capture the nuance of how it should be calculated. - -But remember - you always have the option of performing this logic _in the modeling layer_. This is the key difference between dbt and other semantic layer offerings - by sitting the semantic layer atop a mature transformation layer, you always have the option to configure and optimize your logic within your models and then _define semantic components with intentionality_. 
- -Getting the balance just right is a learning experience and developing community best practices and standards will take time, which is why it’s important for us to think from first principles. What should really be our goal when determining whether logic lives in a model or a metrics? - -To explore this question and begin to develop an intuition, we’ll walk through two examples of handling this divide. - -## Basic example: Revenue - -### Designing your metrics - -In this example, we’ll cover the basics of defining a metric and a fairly straightforward example of where users can draw the line between the semantic layer and the modeling layer. You should finish this section with a better understanding of dbt metrics and its relationship to the modeling layer. - -In the past, the `marts` tables were often your end stage layer before data was consumed in another tool or system. Now, the mart is the springboard for the creation of our metric. So we'll begin by looking our end-state `marts` model called `order_events` that looks something like the below table, but on the order of millions of rows instead of five. Our finance team uses the below model to better understand revenue but inconsistencies in how it's reported have led to requests that the data team centralize the definition in the dbt repo. - -| event_date | order_id | order_country | order_status | customer_id | customer_status | amount | -| --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | United States | completed | 19 | Healthy | 10 | -| 2022-10-01 | 2 | France | completed | 36 | Churn Risk | 15 | -| 2022-10-02 | 2 | France | returned | 36 | Churned | 15 | -| 2022-10-02 | 3 | Turkey | completed | 20 | Healthy | 80 | -| 2022-10-03 | 4 | Korea | completed | 14 | Churn Risk | 24 | - -### Logic in the modeling layer vs the semantic layer - -We know from our upstream dbt models that the `amount` field represents the revenue from from each order. The inconsistent reporting, however, has arisen because the correct definition of revenue only refers to orders that are completed, not returned. Some teams aren’t familiar with this additional filter and it has led to company wide misreporting. - -The solution is to use the flexibility of the dbt modeling layer to add a boolean field called `is_active_row` that shows whether or not the row in question is the most recent version. With this, we can understand and filter out duplicate rows that may be connected to the same order. - -Once we have this field, we reach a diverging path: - -- If we are not interested in seeing the history of `order_events` , we can add a `where` clause **to the model itself**. This would ensure there is only one row per order. -- If we **are** interested in seeing the history of `order_events` , we can add a `filter` to the metric definition to ensure that these duplicate order rows don’t cause us to misreport revenue - -Both of these paths ensure that only the correct orders are included in the metric calculation but one does it at the modeling layer and the other the semantic layer. There is no **best** path here - it depends on your organization's reporting and analytics needs. For this example, we’ll say that our business isn’t interested in understanding orders that have gone from completed to returned and so we’ll use option one moving forward. 
Now lets define the metric: - -```yaml -version: 2 -metrics: - - name: total_revenue - label: The revenue of our business - model: ref('order_events') - description: "The revenue for our business, as defined by Jerry in Finance" - - calculation_method: sum - expression: amount - - timestamp: event_date - time_grains: [day, week, month, all_time] - - dimensions: - - customer_status - - order_country - - ## We don't need this section because we chose option 1 - ## filters: - ## - field: order_status - ## operator: '=' - ## value: 'completed -``` - -Each of the properties of the above definition are defined [in the metrics documentation](/docs/build/metrics), but let’s dig into the two that might require some additional explanation. The two in question are `expression` and `dimensions`. - -In plain english, the `expression` property is the sql column (or expression) that we are applying the calculation method on. In our example above, this simplifies to `sum(amount)`. However, this doesn’t **need** to be a field in the model. It could also be a sql expression like `case when condition = true then field else 0 end` . - -And then there’s `dimensions`. - -### Choosing which dimensions to use with your metric - -The `dimensions` attribute is a bit more nuanced than the others because it involves curating the ways through which a user can interact with the metric. To that end … - -❗ **We recommend curating dimensions, not including all columns within the model. Most models contain dimensions that aren’t relevant for end-user analysis.** - -What do we mean? Well, there is a lot of nuance in what constitutes a useful or less useful dimension that is dependent on the shape of the underlying data and the ways with which the metric will be consumed. Continuing with our revenue use case, here are some examples: - -- **Useful Dimensions:** - - `customer_status`: This field is helpful to end users because it allows them to break down the revenue generated by each customer status grouping. Members of the retention team might be interested in understanding the long-term trends of revenue from the Churn Risk group so that they can better understand the impact that their retention initiatives campaigns have had. - - `order_country`: This field is useful because it allows members of the finance team to break down the accepted revenue from each country of origin so that they can better understand which countries are experiencing the highest growth. -- **Less Useful Dimensions:** - - `order_status` : Given that order_status is a part of the metric definition, it doesn’t make sense to include in the acceptable dimensions list because the value returned would always be `completed`. - - `order_id`: Each order id corresponds to a single order and a single point in time. Grouping the metric of revenue by order_id would just return the base grain of the table and the same value as the amount field - not useful from a metric perspective! -- **Nuanced Dimensions:** - - `customer_id`: This is an interesting field because it can be both good and bad depending on the context in which it is used and the underlying data. In our example use case, this dimension wouldn’t be that useful - it would contain too many unique values and tracking the individual revenue impact by a single customer doesn’t make sense on a retail scale. - - In a SaaS business though, it might make more sense - especially with usage based pricing. 
The Customer Success team might be interested in tracking the revenue of certain customers and ensuring that they remain consistent. - -To quote Cameron Afzal, Product Manager of the dbt Semantic Layer: - -> Thoughtful curation of dimensions is essential for three main reasons: -- **Relevance:** Analysts must include the dimensions most relevant to answering the question. -- **Trust**: Curating high-quality dimensions with little to no known errors helps ensure trust in analysis results and the decisions that follow. -- **Efficiency**: Curation provides a faster path to high-quality analysis results. -> - -To put it another way, **metrics are most useful when every dimension provided can help provide answers to the business.** - -## Advanced example: NPS - -### Designing a complex metric - -Now let’s look at a more complex example of a metric - one that is built from components that could theoretically themselves be metrics. The metric in question is Net Promoter score, which is used by the dbt Labs internal analytics team to understand the experience that users are having on dbt Cloud. - -For those of you who are unfamiliar with the industry metric of Net Promoter Score, here is a [great article from the folks over at Delighted on how it is calculated.](https://delighted.com/net-promoter-score) The short version of it is `the percentage of promoters - the percentage of detractors`. - ---- - -Here at dbt Labs we provide users with short surveys where they can provide feedback (as well as in a few other locations). The data is collected from those surveys is used to calculate our NPS Score, which helps us understand user sentiment over time. - -Given that these surveys come from a few different sources, there is a large amount of upstream modeling performed in order to unify them in a single model, but the end result is something that looks like the table below: - -| feedback_date | unique_id | feedback_source | user_type | account_plan | score | nps_category | -| --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | nps_tool_1 | developer | team | 5 | detractor | -| 2022-10-01 | 2 | nps_tool_2 | read_only | developer | 8 | promoter | -| 2022-10-02 | 3 | nps_tool_1 | developer | enterprise | 10 | promoter | -| 2022-10-02 | 4 | nps_tool_1 | developer | developer | 7 | passive | -| 2022-10-02 | 5 | nps_tool_2 | developer | team | 9 | promoter | -| 2022-10-03 | 6 | nps_tool_1 | developer | enterprise | 7 | passive | - -The dbt Internal Analytics team ([long may they reign](https://www.linkedin.com/feed/update/urn:li:activity:6962884130569080833/)) took this data and decided to build the NPS Score metric into our repo so that it could be surfaced to stakeholders in multiple tools. This process is where we began to form our opinions on what should live in the modeling layer vs semantic layer - but these are sure to progress as we add in more and more real world use cases. - -### Option 1: Putting everything in the semantic layer - -If we wanted to store all the logic inside metric definitions, we could use the following code in the Semantic Layer section to create 6 different metrics that result in the NPS Score metric. This would allow end users to retrieve the NPS Score they are interested in a version-controlled, standard way across any of their BI tools of choice. Additionally, it allows users to individually slice/dice any of the component metrics by themselves. 
- -```yaml -metrics: - - name: total_respondents - label: Total of NPS Respondents - model: ref('customer_nps') - description: 'The count of users responding to NPS surveys in dbt Cloud.' - calculation_method: count - expression: unique_id - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - - - name: total_promoter_respondents - ......... ##same as total_respondents - filters: - - field: nps_category - operator: '=' - value: "'promoter'" - - - name: total_detractor_respondents - ......... ##same as total_respondents - filters: - - field: nps_category - operator: '=' - value: "'detractor'" - - - name: promoters_pct - label: Percent Promoters (Cloud) - description: 'The percent of dbt Cloud users in the promoters segment.' - calculation_method: expression - expression: "{{metric('total_promoter_respondents')}} / {{metric('total_respondents')}}" - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - - - name: detractor_pct - ... ##same as promoters_pct - expression: "{{metric('total_detractor_respondents')}} / {{metric('total_respondents')}}" - - - name: nps_score - label: Net Promoter Score - description: 'The NPS (-1 to 1) of all dbt Cloud users.' - calculation_method: expression - expression: "{{metric('promoters_pct')}} - {{metric('detractors_pct')}}" - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - -``` - -### Option 2: Keeping logic in the modeling layer - -But what if we didn’t want to encode all that information in the metric definitions? If we didn’t need the ability to dig into the component metrics and only wanted to look at the final score? In that case, we could encode most of the logic into the model itself and define the metric on top of that! - -Thinking through this, we know that our NPS Score is a series of ratios dependent on conditions of which category people fall into with the end result being a number between 100 to -100. That number is usually then *displayed* in a percentage format but it is *calculated* as a number. - -So in order to reduce the complexity of metric code, we can add a new field into the model that assigns an `nps_value` to each survey received. The logic for this field would assign a value of 100, 0, or -100 depending on the survey’s `nps_category`. Example code below: - -```sql -case - when nps_category = 'detractor' then -100 - when nps_category = 'promoter' then 100 - else 0 -end as nps_value -``` - -The end result of adding this code to the model would look something like this: - -| feedback_date | unique_id | feedback_source | user_type | account_plan | score | nps_category | nps_value | -| --- | --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | nps_tool_1 | developer | team | 5 | detractor | -100 | -| 2022-10-01 | 2 | nps_tool_2 | read_only | developer | 9 | promoter | 100 | -| 2022-10-02 | 3 | nps_tool_1 | developer | enterprise | 10 | promoter | 100 | -| 2022-10-02 | 4 | nps_tool_1 | developer | developer | 7 | passive | 0 | -| 2022-10-02 | 5 | nps_tool_2 | developer | team | 9 | promoter | 100 | -| 2022-10-03 | 6 | nps_tool_1 | developer | enterprise | 7 | passive | 0 | - -Now that each survey has an associated `nps_value` we can forgo the ratio calculations used in the Metric Logic section and create our NPS Score metric as a single average metric. 
- -```yaml -metrics: - - name: nps_score - label: NPS Score - model: ref('customer_nps') - calculation_method: average - expression: nps_value - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type -``` - -
- Why does this work? - -This is a slightly different way of calculating NPS from the usually provided formula but it ends up with the same result. Here is why: - -- `promoter_pct` was defined as `total_promoter_respondents` / `total_respondents` - - In our example dataset, this nets out to 3 / 6 = 50%. - - If we instead assign a value of 100 and take the average, it becomes 300 / 6 = 50. -- `detractor_pct` was defined as `total_detractor_respondents` / `total_respondents` - - In our example dataset, this nets out to 1 / 6 = 16.67%. - - If we instead assign a value of 100 and take the average, it becomes -100 / 6 = -16.67. -- Therefore, our `nps_score` follows suit: - - In our example dataset, 50% - 16.67% = 33.33% - - If we instead assign a value of 100 and take the average, it becomes 200 / 6 = 33.33 - -The underlying principle of why this works is based on the fact that averages divide the sum of the values in the set by their number. In more dbt friendly terms, what it really means is that average is creating the following equation: `sum(value)/count(*)`. In the first example implementation, we were doing roughly the same thing with multiple metric definitions - the only difference was our numerator was a count that assigned each row a value of 1. So if we duplicate that logic and give each row a value of 1 then we can create far fewer metrics. - -But that only gets us to the `promoter_pct` and `detractor_pct` metrics. In order to combine these both into a single metric definition, we needed to change the value that we assign. Given that the total range of values that the metric could output is -100 (all detractors) to 100 (all promoters) we can assign each of those categories that peak value, along with 0 for passives. This means that when the numerator is aggregated, it nets out promoters against detractors just like the documented equation does `promoter score - detractor score` . - -
- -**Is this what I should do?** - -[It depends!](https://twitter.com/SBinLondon/status/1413113782214266885) There will be times when it might be better to have logic stored in the modeling layer and there will be times when it might be better to have logic stored in the semantic layer. Our shorthand is to only include logic in the semantic layer if it is needed by our stakeholders - if they don't need to analyze the components, we keep them in the modeling layer. In the end, the needs of your business stakeholders should drive your decision on where to keep this logic. - -## How to structure your metrics - -Now that we’ve designed our metrics, let's move on to structuring them within our project. We'll examine the different ways to organize metrics and take a look at the pros and cons of several strategies. - -### Folder structure - -If you follow [dbt’s best practices for structuring your project](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), you will have a folder structure that looks similar to this: - -```yaml -models: - staging: - intermediate: - marts: -``` - -Your marts folder would most likely contain your end-state models ready for business consumption. Given that metrics are meant for business consumption, we are presented with two options - staying within the same framework or representing metrics as their own level. - -We recommend Option A (metrics within marts) but recognize that some people might prefer Option B (metrics within models). - -**A. Metrics within marts** - -Create a metrics folder within marts and use this to contain all of your metric definitions. - -```yaml -models: - staging: - intermediate: - marts: - metrics: -``` - -**B. Metrics within models** - -Create a metrics folder within models and use this to contain all of your metric definitions. - -```yaml -models: - staging: - intermediate: - marts: - metrics: -``` - -### File structure - -Once you’ve decided ***where*** to put your metrics folder, you can now decide ***how*** you want to structure your metrics within this folder. Choose one of two methods for structuring metrics: - -**Option A: The all-in-one YML method** -This method follows a similar pattern to [dbt’s best practices around model structure](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). The introduction of the metrics folder is the only change from the standard best practice. - -In practice, the all-in-one YML method would look like the following: - -```yaml -## Metrics within Marts -models: - marts: - metrics: - - metrics.yml ------- -## Metrics within Models -models: - metrics: - - metrics.yml -``` - -**Option B: The single-metric-per-file method** -In this method, you create *one* yml file for *each* metric*.* Although this is an opinionated stance that differs from [dbt’s best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), here are some reasons why this **could** be useful: - -- Individual files are more easily discovered by new analytics engineers as your organization expands -- Individual files can more easily define specific code owners that may not be part of the data team. - -For example, Jerry from the Finance department is the code owner for the `revenue` metric definition and oversees it for the business. So, any change to this specific file would need Jerry’s sign-off. 
- -This can be tricky for code owners who aren’t familiar with your git flow, but it brings them into the chain of responsibility for the metric definition. It also helps them take ownership for reporting on this metric and creates a responsible party when definitions need to change. - -The single-file-code-owner method would look like this: - -```yaml -models: - metrics: - marts: - - revenue.yml - - average_order_value.yml - - some_other_metric_name.yml -``` - -### Folder and file structure is a preference, not a hard rule - -In the end, all of the structuring information above is just a recommendation. Your project probably has a defined convention in how nodes are organized, whether or not it follows dbt best practices, and you should continue to follow your own organizational practices. That said, we do recommend that metrics should be separate from model yml files. The reason? - -**Metrics are important business objects unto themselves and should live separate from the model definitions.** - -## A call to action - -This is just the beginning of dbt metrics and the Semantic Layer. We have a number of exciting ideas for expanding capabilities that we plan to begin work on in the coming months. However, we can’t do that without you. - -This semantic layer is a fundamental change to what it means to interact with dbt and ultimately most of the best practices will come from the dbt Community - folks like you. It does not matter if you consider yourself an "expert" on this - we want to talk to you and hear how you are using or would like to use metrics and the semantic layer. Y’all are going to be our guiding light to help us make sure that all the functionality we add helps **you** serve the needs of your business. - -If your experience with the Semantic Layer match what we’ve written in this post, and especially if they don’t, please share [comments and feedback in this Discourse Discussion](https://discourse.getdbt.com/t/how-to-design-and-structure-metrics/5040)! - -Additionally, I would invite you to join us over at #dbt-core-metrics on the dbt Slack where we’ll be posting updates, answering questions, discussing usage, and hopefully responding with the best emojis. diff --git a/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md b/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md index 95988e75f04..bab92000a16 100644 --- a/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md +++ b/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md @@ -4,8 +4,7 @@ description: "Going polyglot is a major next step in the journey of dbt Core. It slug: polyglot-dbt-python-dataframes-sql authors: [doug_beatty] - -tags: [dbt product updates] +tags: [dbt tutorials] hide_table_of_contents: false date: 2022-10-18 diff --git a/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md b/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md index ba5dddcae19..93cf91efeed 100644 --- a/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md +++ b/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md @@ -102,7 +102,7 @@ Instead of syncing all cells in a sheet, you create a [named range](https://five -Beware of inconsistent data types though—if someone types text into a column that was originally numeric, Fivetran will automatically convert the column to a string type which might cause issues in your downstream transformations. 
[The recommended workaround](https://fivetran.com/docs/files/google-sheets#typetransformationsandmapping) is to explicitly cast your types in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) to ensure that any undesirable records are converted to null. +Beware of inconsistent data types though—if someone types text into a column that was originally numeric, Fivetran will automatically convert the column to a string type which might cause issues in your downstream transformations. [The recommended workaround](https://fivetran.com/docs/files/google-sheets#typetransformationsandmapping) is to explicitly cast your types in [staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging) to ensure that any undesirable records are converted to null. #### Good fit for: @@ -192,4 +192,4 @@ Databricks also supports [pulling in data, such as spreadsheets, from external c Beyond the options we’ve already covered, there’s an entire world of other tools that can load data from your spreadsheets into your data warehouse. This is a living document, so if your preferred method isn't listed then please [open a PR](https://github.com/dbt-labs/docs.getdbt.com) and I'll check it out. -The most important things to consider are your files’ origins and formats—if you need your colleagues to upload files on a regular basis then try to provide them with a more user-friendly process; but if you just need two computers to talk to each other, or it’s a one-off file that will hardly ever change, then a more technical integration is totally appropriate. \ No newline at end of file +The most important things to consider are your files’ origins and formats—if you need your colleagues to upload files on a regular basis then try to provide them with a more user-friendly process; but if you just need two computers to talk to each other, or it’s a one-off file that will hardly ever change, then a more technical integration is totally appropriate. diff --git a/website/blog/2022-11-30-dbt-project-evaluator.md b/website/blog/2022-11-30-dbt-project-evaluator.md index 558d8877d72..3ea7a459c35 100644 --- a/website/blog/2022-11-30-dbt-project-evaluator.md +++ b/website/blog/2022-11-30-dbt-project-evaluator.md @@ -34,7 +34,7 @@ Throughout these engagements, we began to take note of the common issues many an Maybe your team is facing some of these issues right now 👀 And that’s okay! We know that building an effective, scalable dbt project takes a lot of effort and brain power. Maybe you’ve inherited a legacy dbt project with a mountain of tech debt. Maybe you’re starting from scratch. Either way it can be difficult to know the best way to set your team up for success. Don’t worry, you’re in the right place! -Through solving these problems over and over, the Professional Services team began to hone our best practices for working with dbt and how analytics engineers could improve their dbt project. We added “solutions reviews'' to our list of service offerings — client engagements in which we evaluate a given dbt project and provide specific recommendations to improve performance, save developer time, and prevent misuse of dbt’s features. And in an effort to share these best practices with the wider dbt community, we developed a *lot* of content. 
We wrote articles on the Developer Blog (see [1](https://docs.getdbt.com/blog/on-the-importance-of-naming), [2](https://discourse.getdbt.com/t/your-essential-dbt-project-checklist/1377), and [3](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview)), gave [Coalesce talks](https://www.getdbt.com/coalesce-2020/auditing-model-layers-and-modularity-with-your-dag/), and created [training courses](https://courses.getdbt.com/courses/refactoring-sql-for-modularity). +Through solving these problems over and over, the Professional Services team began to hone our best practices for working with dbt and how analytics engineers could improve their dbt project. We added “solutions reviews” to our list of service offerings — client engagements in which we evaluate a given dbt project and provide specific recommendations to improve performance, save developer time, and prevent misuse of dbt’s features. And in an effort to share these best practices with the wider dbt community, we developed a *lot* of content. We wrote articles on the Developer Blog (see [1](https://docs.getdbt.com/blog/on-the-importance-of-naming), [2](https://discourse.getdbt.com/t/your-essential-dbt-project-checklist/1377), and [3](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview)), gave [Coalesce talks](https://www.getdbt.com/coalesce-2020/auditing-model-layers-and-modularity-with-your-dag/), and created [training courses](https://courses.getdbt.com/courses/refactoring-sql-for-modularity). Time and time again, we found that when teams are aligned with these best practices, their projects are more: @@ -63,10 +63,10 @@ Currently, the dbt_project_evaluator package covers five main categories: | Category | Example Best Practices | | --- | --- | -| Modeling | - Every [raw source](https://docs.getdbt.com/docs/build/sources) has a one-to-one relationship with a [staging model](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) to centralize data cleanup.<br />
- Every model can be traced back to a declared source in the dbt project (i.e. no "root" models).
- End-of-DAG fanout remains under a specified threshold. | +| Modeling | - Every [raw source](https://docs.getdbt.com/docs/build/sources) has a one-to-one relationship with a [staging model](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) to centralize data cleanup.
- Every model can be traced back to a declared source in the dbt project (i.e. no "root" models).
- End-of-DAG fanout remains under a specified threshold. | | Testing | - Every model has a primary key that is appropriately tested.<br />
- The percentage of models that have a minimum of one test applied is greater than or equal to a specified threshold. | | Documentation | - Every model has a [description](https://docs.getdbt.com/reference/resource-properties/description).<br />
- The percentage of models that have a description is greater than or equal to a specified threshold. | -| Structure | - All models are named with the appropriate prefix aligned according to their model types (e.g. staging models are prefixed with `stg_`).
- The sql file for each model is in the subdirectory aligned with the model type (e.g. intermediate models are in an [intermediate subdirectory](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate)).
- Each models subdirectory contains one .yml file that includes tests and documentation for all models within the given subdirectory. | +| Structure | - All models are named with the appropriate prefix aligned according to their model types (e.g. staging models are prefixed with `stg_`).
- The sql file for each model is in the subdirectory aligned with the model type (e.g. intermediate models are in an [intermediate subdirectory](https://docs.getdbt.com/best-practices/how-we-structure/3-intermediate)).
- Each models subdirectory contains one .yml file that includes tests and documentation for all models within the given subdirectory. | | Performance | - Every model that directly feeds into an [exposure](https://docs.getdbt.com/docs/build/exposures) is materialized as a table.<br />
- No models are dependent on chains of "non-physically-materialized" models greater than a specified threshold. | For the full up-to-date list of covered rules, check out the package’s [README](https://github.com/dbt-labs/dbt-project-evaluator#rules-1), which outlines for each misalignment of a best practice: diff --git a/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md b/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md index ffc0369a908..3ca1f6ac2a9 100644 --- a/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md +++ b/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md @@ -62,7 +62,7 @@ Before you can get started: - You must have Python 3.8 or above installed - You must have dbt version 1.3.0 or above installed - You should have a basic understanding of [SQL](https://www.sqltutorial.org/) -- You should have a basic understanding of [dbt](https://docs.getdbt.com/quickstarts) +- You should have a basic understanding of [dbt](https://docs.getdbt.com/guides) ### Step 2: Clone the repository diff --git a/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md b/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md index c5b677f7f3e..46cfcb58cdd 100644 --- a/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md +++ b/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md @@ -94,7 +94,7 @@ It is essential to click on each data source (the green book icons on the leftmo For this step, we identified which operators were used in the data source (for example, joining data, order columns, group by, etc). Usually the Alteryx operators are pretty self-explanatory and all the information needed for understanding appears on the left side of the menu. We also checked the documentation to understand how each Alteryx operator works behind the scenes. -We followed dbt Labs' guide on how to refactor legacy SQL queries in dbt and some [best practices](https://docs.getdbt.com/guides/migration/tools/refactoring-legacy-sql). After we finished refactoring all the Alteryx workflows, we checked if the Alteryx output matched the output of the refactored model built on dbt. +We followed dbt Labs' guide on how to refactor legacy SQL queries in dbt and some [best practices](/guides/refactoring-legacy-sql). After we finished refactoring all the Alteryx workflows, we checked if the Alteryx output matched the output of the refactored model built on dbt. 
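To give a sense of what that check can look like before reaching for a package, a simple symmetric-difference query is often enough for a first spot check; the relation names below are placeholders rather than the project's real models:

```sql
-- Rows present in one output but not the other; an empty result means
-- the refactored dbt model reproduces the legacy Alteryx output exactly.
(
    select * from analytics.alteryx_output
    except
    select * from analytics.refactored_dbt_output
)
union all
(
    select * from analytics.refactored_dbt_output
    except
    select * from analytics.alteryx_output
)
```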
#### Step 3: Use the `audit_helper` package to audit refactored data models @@ -131,4 +131,4 @@ As we can see, refactoring Alteryx to dbt was an important step in the direction > > [Audit_helper in dbt: Bringing data auditing to a higher level](https://docs.getdbt.com/blog/audit-helper-for-migration) > -> [Refactoring legacy SQL to dbt](https://docs.getdbt.com/guides/migration/tools/refactoring-legacy-sql) +> [Refactoring legacy SQL to dbt](/guides/refactoring-legacy-sql) diff --git a/website/blog/2023-04-26-deprecating-dbt-metrics.md b/website/blog/2023-04-26-deprecating-dbt-metrics.md index 1041f75eb2b..bf23bb992ad 100644 --- a/website/blog/2023-04-26-deprecating-dbt-metrics.md +++ b/website/blog/2023-04-26-deprecating-dbt-metrics.md @@ -5,7 +5,6 @@ slug: deprecating-dbt-metrics authors: [callum_mccann] -tags: [dbt product updates] hide_table_of_contents: false date: 2023-04-26 diff --git a/website/blog/2023-05-01-evolving-data-engineer-craft.md b/website/blog/2023-05-01-evolving-data-engineer-craft.md index 339d0ac380e..a3113240227 100644 --- a/website/blog/2023-05-01-evolving-data-engineer-craft.md +++ b/website/blog/2023-05-01-evolving-data-engineer-craft.md @@ -5,7 +5,6 @@ slug: evolving-data-engineer-craft authors: [sung_chung, kira_furuichi] -tags: [dbt product updates] hide_table_of_contents: false date: 2023-05-01 diff --git a/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md b/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md index e1351034f66..2a4879ac98d 100644 --- a/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md +++ b/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md @@ -97,7 +97,7 @@ dbt Cloud includes **built-in Git** with accessible features directly from its I The biggest boon to Data Vault developer productivity in dbt Cloud are the **DataOps** and **Data Warehouse Automation** features of dbt Cloud. Each Data Vault developer gets their own development environment to work in and there is no complicated set up process to go through. -Commit your work, create a pull request, and have automated code review enabled by dbt Cloud [**jobs**](https://docs.getdbt.com/docs/deploy/dbt-cloud-job) that can be defined for each environment separately (e.g., testing, QA, production). Together with dbt [**tags**](https://docs.getdbt.com/reference/resource-configs/tags), the feature allows you to orchestrate your project in an efficient and powerful way. +Commit your work, create a pull request, and have automated code review enabled by dbt Cloud [**jobs**](https://docs.getdbt.com/docs/deploy/jobs) that can be defined for each environment separately (e.g., testing, QA, production). Together with dbt [**tags**](https://docs.getdbt.com/reference/resource-configs/tags), the feature allows you to orchestrate your project in an efficient and powerful way. ### Auditable data @@ -115,7 +115,9 @@ In terms of the implementation of the Data Vault itself, we recommend familiariz ### AutomateDV (formerly known as dbtvault) -AutomateDV is the most popular open source Data Vault package for dbt, with some users having over 5000 Data Vault components in their project. Here in Infinite Lambda, we’ve been using this package for quite some time now, even building on top of it (depending on the specifics of the project). This mature system provides a great way to start your Data Vault with dbt Cloud journey as the learning curve is quite manageable, it is well documented and even comes with tutorials and working examples built on top of Snowflake’s TPCH standard dataset. 
There is one limitation to using the package and that is _AutomateDV _expects your source data to contain only one delta load. In order to work around this issue, owners of the package came up with custom dbt materializations to help you with the initial load of your system, however, the performance of such load is in our experience not acceptable. +AutomateDV is the most popular open source Data Vault package for dbt, with some users having over 5000 Data Vault components in their project. Here in Infinite Lambda, we’ve been using this package for quite some time now, even building on top of it (depending on the specifics of the project). This mature system provides a great way to start your Data Vault with dbt Cloud journey as the learning curve is quite manageable, it is well documented and even comes with tutorials and working examples built on top of Snowflake’s TPCH standard dataset. There is one limitation to using the package and that is _AutomateDV_ expects your source data to contain only one delta load. In order to work around this issue, owners of the package came up with custom dbt materializations to help you with the initial load of your system, however, the performance of such load is in our experience not acceptable. + +_(Editor's note: As of AutomateDV v0.10.0, this performance issue has been resolved and users may use the standard incremental configuration.)_ ### datavault4dbt diff --git a/website/blog/2023-08-01-announcing-materialized-views.md b/website/blog/2023-08-01-announcing-materialized-views.md new file mode 100644 index 00000000000..3917e3f192c --- /dev/null +++ b/website/blog/2023-08-01-announcing-materialized-views.md @@ -0,0 +1,213 @@ +--- +title: "Optimizing Materialized Views with dbt" +description: "In dbt v1.6, we introduce support for materialized views. In this blog post, Amy will review how to use them in your workflow" +slug: announcing-materialized-views + +authors: [amy_chen] + +tags: [analytics craft, dbt product updates, data ecosystem] +hide_table_of_contents: false + +date: 2023-08-03 +is_featured: true +--- + +## Introduction + +The year was 2020. I was a kitten-only household, and dbt Labs was still Fishtown Analytics. A enterprise customer I was working with, Jetblue, asked me for help running their dbt models every 2 minutes to meet a 5 minute SLA. + +After getting over the initial terror, we talked through the use case and soon realized there was a better option. Together with my team, I created [lambda views](https://discourse.getdbt.com/t/how-to-create-near-real-time-models-with-just-dbt-sql/1457%20?) to meet the need. + +Flash forward to 2023. I’m writing this as my giant dog snores next to me (don’t worry the cats have multiplied as well). Jetblue has outgrown lambda views due to performance constraints (a view can only be so performant) and we are at another milestone in dbt’s journey to support streaming. What. a. time. + +Today we are announcing that we now support Materialized Views in dbt. So, what does that mean? + + + +Materialized views are now an out of the box materialization in your dbt project once you upgrade to the latest version of dbt v1.6 on these following adapters: + +- dbt-postgres +- dbt-redshift +- dbt-snowflake +- dbt-databricks +- dbt-materialize* +- dbt-trino* +- dbt-bigquery** + +*These adapters have supported materialized views in their adapter prior 1.6. +**dbt-bigquery support will be coming in 1.7. 
+ +Just like you would materialize your sql model as  `table` or `view`  today, you can use `materialized_view` in your model configuration, dbt_project.yml, and resources.yml files. At release, python models will not be supported. + + + +For Postgres/Redshift/Databricks + +```sql +{{ +config( + materialized = 'materialized_view', +) +}} + +``` + +For Snowflake: +```sql +{{ +config( + materialized = 'dynamic_table', +) +}} +``` + +:::note +We are only supporting dynamic tables on Snowflake, not Snowflake’s materialized views (for a comparison between Snowflake Dynamic Tables and Materialized Views, refer [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-comparison#dynamic-tables-compared-to-materialized-views). Dynamic tables are better suited for continuous transformations due to functionality like the ability to join, union, and aggregate on base tables, views , and other dynamic tables. Due to those features, they are also more aligned with what other data platforms are calling Materialized Views. For the sake of simplicity, when I refer to materialized views in this blog, I mean dynamic tables in Snowflake. +::: + +Now that we support materialized views: how do you fit them into your dbt workflow? It’s easy to imagine a world of unregulated computation because you didn’t put in proper guardrails and now you have materialized views running rampant unbeknownst to you in your data platform. + +Materialized views, just like any other materialization, fit a need and you should utilize them while taking into consideration the additional complexity they will add to your project. They are a tool in your analytics engineering toolbox, one of many. + +In this blog, we will go over when to pull this tool out of your toolbox, how to wield it successfully, and how to promote materialized views with governance in mind. Now this is a new functionality and I expect this to be the first of many posts to come, defining our best practices (or even redefining them). Also I will not be discussing dbt’s interactions upstream from the data platform like how to manage your Kafka topics using dbt, but would highly recommend [this post from Charlie Summers](https://docs.getdbt.com/blog/demystifying-event-streams) if that’s something you’re interested in. + +Additionally, if you want to get a more detailed understanding of your data platform’s support of materialized views, I recommend checking out dbt’s and your data platform’s documentation site. This blog post is intended to be a high level, platform agnostic overview to get you started. + +## What are Materialized Views? + +Starting out with, **what are materialized views (MVs)?** While specific features will vary by data platform, materialized views at their core are database objects that have stored the results of a query as a physically materialized table. What makes them distinct from a regular table is that the data in a materialized view is periodically refreshed to reflect the latest changes in the underlying table. Because they’re precomputed and the results are stored, you have faster query times when accessing them because you aren’t recomputing the data from scratch. This is great when you have low latency requirements for your data pipelines. + +Now you might have noticed that MVs sound a lot like incremental models, and you are not wrong! It can be worthwhile to think of materialized views as a successor of sorts to incremental models. 
In fact, depending on your needs and data platform of choice, you might wish to replace all of your incremental dbt models with materialized view models. By doing this, you will no longer have to manually craft specific incremental strategies, detailing how dbt should update the underlying table. Awesome, right? + +The tradeoff (outside of any data platform specific ones) is that you will have less fine-grained control over the incremental logic and orchestration. This is because you are handing defining the logic of what and how to update the existing table over to the data platform to perform for you. + +Other factors to consider when deciding on when/how to use a materialized view: +- What are the costs associated with running the materialized view versus a batched incremental model? (this will vary depending on your data platform as some will require different compute nodes) +- Does your data platform support joins, aggregations, and window functions on MVs if you need them? +- What are the latency needs of your development environment? In production? (If not near real time, you can make the choice between a batch incremental model or a MV with a longer refresh schedule.) +- How often do your upstream dependencies update? If your answer is `not frequent`, you may not need a MV. +- How large is your dataset?(It might be cheaper to use MVs for extremely large datasets) +- How often do you need your query refreshed? What are your downstream dependencies and their stakeholders? (If near real time is important, MVs might be the right choice). +- Do you have real time machine learning models training or applications using your transformed dataset? + +## Materialized Views in the dbt Workflow + +### Development + +When we talk about using materialized views in development, the question to think about is not so much “should you execute your dbt models as materialized views in your sandbox?,” but rather “should you schedule them to refresh in your sandbox?”. For development, you do need to create them and test them out in your sandbox but how do you do this in a way that doesn’t drive up your cloud bill unnecessarily? Or keeping a post-it note on your laptop as a reminder to drop all of the running materialized views in your sandbox before you sign off? Let’s talk about it! + +Outside of the scheduling part, development will be pretty standard. Your pipeline is likely going to look something like this: + + + +This is assuming you have a near real time pipeline where you are pulling from a streaming data source like a Kafka Topic via an ingestion tool of your choice like Snowpipe for Streaming into your data platform. After your data is in the data platform, you will: + +1. Create the dbt model with the SQL transformation logic that you need. +2. Look at the logic and answer these questions: + 1. Does my data platform support the functionality I need in materialized views? + 2. How often do you need the data refreshed? Do you need any flexibility in that? + 3. How am I promoting this into production? Either you will run the transformation logic in the production environment (recommended) and create a separate object or promote the object created from development. + + +Depending on your answer, this will decide if you want a materialized view in the first place (versus a view, table, or incremental model). If you have decided on a materialized view as meeting your needs, by default do not schedule a refresh. You can run manual refreshes as needed. Why’s that? 
In your development environment, you are likely validating three things: the dependencies, the SQL logic, and the transformation output. All of those can be tested by creating a materialized view without scheduling and running manually refreshes. + +Your configuration during development: + +For Postgres: + +Every time you run a `dbt run`, that will result in a manual refresh unless you set the `on_configuration_change` to `continue` which will skip running the model. + +```sql +{{ +config( + materialized = 'materialized_view', + on_configuration_change = 'apply', +) +}} +``` + +For Redshift: + +```sql +{{ +config( + materialized = 'materialized_view', + on_configuration_change = 'apply', + auto_refresh = False +) +}} +``` + +For Databricks: + +```sql +{{ +config( + materialized='materialized_view', +) +}} +``` + +By default, materialized views are not refreshed on a schedule on Databricks in this materialization. To set up scheduling, you can use a post-hook to alter the MV with a cron schedule that will run in Databricks Workflows. That could look like something like this + +```sql +post_hook = 'ALTER MATERIALIZED VIEW {{this}} ADD SCHEDULE CRON "0 0 0 * * ? *" AT TIME ZONE "America/Los_Angeles";' +``` + +For Snowflake: + +```sql +{{ +config( + materialized = 'dynamic_table', + snowflake_warehouse = '', + target_lag = '', + on_configuration_change = 'apply', +) +}} +``` + +Now if you do need to more fully build out your development pipeline (making sure scheduling/syncs do happen), you can schedule but make sure to drop the materialized views when you are done with them. I encourage you to invest in an operations macro that drops all MVs in the schema that you use as your sandbox and run it as needed. You could even create a dbt Cloud job to manage that. This way, you don’t have any stray MVs running in your sandbox, consuming credits unnecessarily. + +### Testing + +Now let’s dive into the second question: how do you test? In development and QA, this will look the same as any batch run tests. You can run `dbt build` or  `dbt test` and then have the tests run after execution as validation. But in production, what can you do to continually test? Your options are: + +- Continue to do batch testing as we wait for [materialized tests](https://github.com/dbt-labs/dbt-core/issues/6914) +- Or overriding the –store-failures macro like what Materialize has created [here](https://materialize.com/blog/real-time-data-quality-tests-using-dbt-and-materialize/) for their adapter to materialize failed rows as a materialized view. This is not a great solution for the long term but if you have urgency to put this into production, it is an option. + +In order to promote materialized views into production, the process will look very much like it did with your incremental models. Using SlimCI, for new MVs, you can build them into your QA environment. For existing MVs without changes, we can skip and defer to the production objects. + +### Production + +When you feel satisfied with your development and testing, for data platforms that offer scheduling via our dbt configurations, you have two options: hardcode the refresh cadence or write in conditional logic based on the environment for the refresh cadence. I recommend using the latter. 
+ +The code for having a conditional in your config block looks like this if you want to include in a macro for either the lag or other fields (snowflake_warehouse, auto_refresh,etc): + +```sql +{% macro target_lag_environment() %} +{% set lag = '1 minute' if target.name == "prod" else '35 days' %} +{{ return(lag) }} +{% endmacro %} +``` + +```sql +{{ +config( + materialized = 'dynamic_table', + snowflake_warehouse = 'transforming', + target_lag = target_lag_environment(), + on_configuration_change = 'apply', +) +}} +``` + +You will want a very long lag for development; I recommend the cadence you drop and refresh your development environment. Here I just chose my two favorite numbers. + +For orchestration, if your materialized views aren’t able to auto refresh, you can use dbt cloud to orchestrate your refreshes. The beauty of materialized views is that dbt will be able to provide the dependency/testing/documentation but also skip or rerun the models as configured, enabling you to version control your logic. Reasonable guardrails for the modern data stack. ✨ + +Depending on how you orchestrate your materialized views, you can either run the testing in production as part of a scheduled job (with dbt test or dbt build). + +## Conclusion + +Well, I’m excited for everyone to remove the lines in your packages.yml that installed your experimental package (at least if you’re using it for MVs) and start to get your hands dirty. We are still new in our journey and I look forward to hearing all the things you are creating and how we can better our best practices in this. \ No newline at end of file diff --git a/website/blog/2023-10-31-to-defer-or-to-clone.md b/website/blog/2023-10-31-to-defer-or-to-clone.md new file mode 100755 index 00000000000..a39fc3ac0b7 --- /dev/null +++ b/website/blog/2023-10-31-to-defer-or-to-clone.md @@ -0,0 +1,118 @@ +--- + +title: To defer or to clone, that is the question +description: "In dbt v1.6, we introduce support for zero-copy cloning via the new dbt clone command. In this blog post, Kshitij will cover what clone is, how it is different from deferral, and when to use each." +slug: to-defer-or-to-clone + +image: /img/blog/2023-10-31-to-defer-or-to-clone/preview.png + +authors: [kshitij_aranke, doug_beatty] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-10-31 +is_featured: true + +--- + +Hi all, I’m Kshitij, a senior software engineer on the Core team at dbt Labs. +One of the coolest moments of my career here thus far has been shipping the new `dbt clone` command as part of the dbt-core v1.6 release. + +However, one of the questions I’ve received most frequently is guidance around “when” to clone that goes beyond [the documentation on “how” to clone](https://docs.getdbt.com/reference/commands/clone). +In this blog post, I’ll attempt to provide this guidance by answering these FAQs: + +1. What is `dbt clone`? +2. How is it different from deferral? +3. Should I defer or should I clone? + +## What is `dbt clone`? + +`dbt clone` is a new command in dbt 1.6 that leverages native zero-copy clone functionality on supported warehouses to **copy entire schemas for free, almost instantly**. + +### How is this possible? + +Well, the warehouse “cheats” by only copying metadata from the `source` schema to the `target` schema; the underlying data remains at rest during this operation. +This metadata includes materialized objects like tables and views, which is why you see a **clone** of these objects in the target schema. 
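To make that concrete, on a warehouse with native zero-copy cloning (Snowflake, for example) the statement issued under the hood is roughly of this shape; the database, schema, and model names here are illustrative only:

```sql
create or replace table analytics.dev_clone.fct_orders
    clone analytics.prod.fct_orders;
```

Because only metadata changes hands, the statement completes in near-constant time regardless of how much data sits behind `fct_orders`.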
+ +In computer science jargon, `clone` makes a copy of the pointer from the `source` schema to the underlying data; after the operation there are now two pointers (`source` and `target` schemas) that each point to the same underlying data. + +## How is cloning different from deferral? + +On the surface, cloning and deferral seem similar – **they’re both ways to save costs in the data warehouse.** +They do this by bypassing expensive model re-computations – clone by [eagerly copying](https://en.wikipedia.org/wiki/Evaluation_strategy#Eager_evaluation) an entire schema into the target schema, and defer by [lazily referencing](https://en.wikipedia.org/wiki/Lazy_evaluation) pre-built models in the source schema. + +Let’s unpack this sentence and explore its first-order effects: + +| | defer | clone | +|---------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------| +| **How do I use it?** | Implicit via the `--defer` flag | Explicit via the `dbt clone` command | +| **What are its outputs?** | Doesn't create any objects itself, but dbt might create objects in the target schema if they’ve changed from those in the source schema. | Copies objects from source schema to target schema in the data warehouse, which are persisted after operation is finished. | +| **How does it work?** | Compares manifests between source and target dbt runs and overrides ref to resolve models not built in the target run to point to objects built in the source run. | Uses zero-copy cloning if available to copy objects from source to target schemas, else creates pointer views (`select * from my_model`) | + +These first-order effects lead to the following second-order effects that truly distinguish clone and defer from each other: + +| | defer | clone | +|--------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------| +| **Where can I use objects built in the target schema?** | Only within the context of dbt | Any downstream tool (e.g. BI) | +| **Can I safely modify objects built in the target schema?** | No, since this would modify production data | Yes, cloning is a cheap way to create a sandbox of production data for experimentation | +| **Will data in the target schema drift from data in the source schema?** | No, since deferral will always point to the latest version of the source schema | Yes, since clone is a point-in-time operation | +| **Can I use multiple source schemas at once?** | Yes, defer can dynamically switch between source schemas e.g. ref unchanged models from production and changed models from staging | No, clone copies objects from one source schema to one target schema | + +## Should I defer or should I clone? + +Putting together all the points above, here’s a handy cheat sheet for when to defer and when to clone: + +| | defer | clone | +|---------------------------------------------------------------------------|-------|-------| +| **Save time & cost by avoiding re-computation** | ✅ | ✅ | +| **Create database objects to be available in downstream tools (e.g. 
BI)** | ❌ | ✅ | +| **Safely modify objects in the target schema** | ❌ | ✅ | +| **Avoid creating new database objects** | ✅ | ❌ | +| **Avoid data drift** | ✅ | ❌ | +| **Support multiple dynamic sources** | ✅ | ❌ | + +To absolutely drive this point home: + +1. If you send someone this cheatsheet by linking to this page, you are deferring to this page +2. If you print out this page and write notes in the margins, you have cloned this page + +## Putting it in practice + +Using the cheat sheet above, let’s explore a few common scenarios and explore whether we should use defer or clone for each: + +1. **Testing staging datasets in BI** + + In this scenario, we want to: + 1. Make a copy of our production dataset available in our downstream BI tool + 2. To safely iterate on this copy without breaking production datasets + + Therefore, we should use **clone** in this scenario + +2. **[Slim CI](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603)** + + In this scenario, we want to: + 1. Refer to production models wherever possible to speed up continuous integration (CI) runs + 2. Only run and test models in the CI staging environment that have changed from the production environment + 3. Reference models from different environments – prod for unchanged models, and staging for modified models + + Therefore, we should use **defer** in this scenario + +3. **[Blue/Green Deployments](https://discourse.getdbt.com/t/performing-a-blue-green-deploy-of-your-dbt-project-on-snowflake/1349)** + + In this scenario, we want to: + 1. Ensure that all tests are always passing on the production dataset, even if that dataset is slightly stale + 2. Atomically rollback a promotion to production if tests aren’t passing across the entire staging dataset + + In this scenario, we can use **clone** to implement a deployment strategy known as blue-green deployments where we build the entire staging dataset and then run tests against it, and only clone it over to production if all tests pass. + + +As a rule of thumb, deferral lends itself better to continuous integration (CI) use cases whereas cloning lends itself better to continuous deployment (CD) use cases. + +## Wrapping Up + +In this post, we covered what `dbt clone` is, how it is different from deferral, and when to use each. Often, they can be used together within the same project in different parts of the deployment lifecycle. + +Thanks for reading, and I look forward to seeing what you build with `dbt clone`. + +*Thanks to [Jason Ganz](https://docs.getdbt.com/author/jason_ganz) and [Gwen Windflower](https://www.linkedin.com/in/gwenwindflower/) for reviewing drafts of this article* diff --git a/website/blog/2023-11-14-specify-prod-environment.md b/website/blog/2023-11-14-specify-prod-environment.md new file mode 100644 index 00000000000..c6ad2b31027 --- /dev/null +++ b/website/blog/2023-11-14-specify-prod-environment.md @@ -0,0 +1,73 @@ +--- + +title: Why you should specify a production environment in dbt Cloud +description: "The bottom line: You should split your Environments in dbt Cloud based on their purposes (e.g. Production and Staging/CI) and mark one environment as Production. This will improve your CI experience and enable you to use dbt Explorer." +slug: specify-prod-environment + +authors: [joel_labes] + +tags: [dbt Cloud] +hide_table_of_contents: false + +date: 2023-11-14 +is_featured: false + +--- + +:::tip The Bottom Line: +You should [split your Jobs](#how) across Environments in dbt Cloud based on their purposes (e.g. 
Production and Staging/CI) and set one environment as Production. This will improve your CI experience and enable you to use dbt Explorer. +::: + +[Environmental segmentation](/docs/environments-in-dbt) has always been an important part of the analytics engineering workflow: + +- When developing new models you can [process a smaller subset of your data](/reference/dbt-jinja-functions/target#use-targetname-to-limit-data-in-dev) by using `target.name` or an environment variable. +- By building your production-grade models into [a different schema and database](https://docs.getdbt.com/docs/build/custom-schemas#managing-environments), you can experiment in peace without being worried that your changes will accidentally impact downstream users. +- Using dedicated credentials for production runs, instead of an analytics engineer's individual dev credentials, ensures that things don't break when that long-tenured employee finally hangs up their IDE. + +Historically, dbt Cloud required a separate environment for _Development_, but was otherwise unopinionated in how you configured your account. This mostly just worked – as long as you didn't have anything more complex than a CI job mixed in with a couple of production jobs – because important constructs like deferral in CI and documentation were only ever tied to a single job. + +But as companies' dbt deployments have grown more complex, it doesn't make sense to assume that a single job is enough anymore. We need to exchange a job-oriented strategy for a more mature and scalable environment-centric view of the world. To support this, a recent change in dbt Cloud enables project administrators to [mark one of their environments as the Production environment](/docs/deploy/deploy-environments#set-as-production-environment-beta), just as has long been possible for the Development environment. + +Explicitly separating your Production workloads lets dbt Cloud be smarter with the metadata it creates, and is particularly important for two new features: dbt Explorer and the revised CI workflows. + + + +## Make sure dbt Explorer always has the freshest information available + +**The old way**: Your dbt docs site was based on a single job's run. + +**The new way**: dbt Explorer uses metadata from across every invocation in a defined Production environment to build the richest and most up-to-date understanding of your project. + +Because dbt docs could only be updated by a single predetermined job, users who needed their documentation to immediately reflect changes deployed throughout the day (regardless of which job executed them) would find themselves forced to run a dedicated job which did nothing other than run `dbt docs generate` on a regular schedule. + +The Discovery API that powers dbt Explorer ingests all metadata generated by any dbt invocation, which means that it can always be up to date with the applied state of your project. However it doesn't make sense for dbt Explorer to show docs based on a PR that hasn't been merged yet. + +To avoid this conflation, you need to mark an environment as the Production environment. All runs completed in _that_ environment will contribute to dbt Explorer's, while others will be excluded. (Future versions of Explorer will support environment selection, so that you can preview your documentation changes as well.) 
+ +## Run Slimmer CI than ever with environment-level deferral + +**The old way**: [Slim CI](/guides/set-up-ci?step=2) deferred to a single job, and would only detect changes as of that job's last build time. + +**The new way**: Changes are detected regardless of the job they were deployed in, removing false positives and overbuilding of models in CI. + +Just like dbt docs, relying on a single job to define your state for comparison purposes leads to a choice between unnecessarily rebuilding models which were deployed by another job, or creating a dedicated job that runs `dbt compile` on repeat to keep on top of all changes. + +By using the environment as the arbiter of state, any time a change is made to your Production deployment it will immediately be taken into consideration by subsequent Slim CI runs. + +## The easiest way to break apart your jobs {#how} + + + +For most projects, changing from a job-centric to environment-centric approach to metadata is straightforward and immediately pays dividends as described above. Assuming that your Staging/CI and Production jobs are currently intermingled, you can extricate them as follows: + +1. Create a new dbt Cloud environment called Staging +2. For each job that belongs to the Staging environment, edit the job and update its environment +3. Tick the ["Mark as Production environment" box](/docs/deploy/deploy-environments#set-as-production-environment-beta) in your original environment's settings + +## Conclusion + +Until very recently, I only thought of Environments in dbt Cloud as a way to use different authentication credentials in different contexts. And until very recently, I was mostly right. + +Not anymore. The metadata dbt creates is critical for effective data teams – whether you're concerned about cost savings, discoverability, increased development speed or reliable results across your organization – but is only fully effective if it's segmented by the environment that created it. + +Take a few minutes to clean up your environments - it'll make all the difference. diff --git a/website/blog/authors.yml b/website/blog/authors.yml index e9b48bd02fc..31d69824ed4 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,6 +1,6 @@ amy_chen: image_url: /img/blog/authors/achen.png - job_title: Senior Partner Engineer + job_title: Staff Partner Engineer links: - icon: fa-linkedin url: https://www.linkedin.com/in/yuanamychen/ @@ -306,6 +306,15 @@ kira_furuichi: name: Kira Furuichi organization: dbt Labs +kshitij_aranke: + image_url: /img/blog/authors/kshitij-aranke.jpg + job_title: Senior Software Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/aranke/ + name: Kshitij Aranke + organization: dbt Labs + lauren_benezra: image_url: /img/blog/authors/lauren-benezra.jpeg job_title: Analytics Engineer diff --git a/website/blog/categories.yml b/website/blog/categories.yml index 2a45e6529e2..45acf246dff 100644 --- a/website/blog/categories.yml +++ b/website/blog/categories.yml @@ -15,11 +15,9 @@ display_title: dbt tutorials description: Best practices in the usage of our favorite data transformation tool. is_featured: true -- name: dbt updates - display_title: dbt product updates - description: An archive of monthly product updates from the dbt Labs team. - is_featured: true - name: SQL magic display_title: SQL magic description: Stories of dbt developers making SQL sing across warehouses. 
is_featured: true +- name: dbt Cloud + description: Using dbt Cloud to build for scale \ No newline at end of file diff --git a/website/blog/ctas.yml b/website/blog/ctas.yml index 1b3fad79b80..6b8c04e0ee3 100644 --- a/website/blog/ctas.yml +++ b/website/blog/ctas.yml @@ -10,3 +10,13 @@ subheader: Check out guides on getting your warehouse set up and connected to dbt Cloud. button_text: Learn more url: https://docs.getdbt.com/quickstarts +- name: coalesce_2023_signup + header: Join data practitioners worldwide at Coalesce 2023 + subheader: Kicking off on October 16th, both online and in-person (Sydney, London, and San Diego) + button_text: Register now + url: https://coalesce.getdbt.com/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_coalesce-2023_aw&utm_content=coalesce____&utm_term=all___ +- name: coalesce_2023_catchup + header: Missed Coalesce 2023? + subheader: Watch Coalesce 2023 highlights and full sessions, dbt Labs' annual analytics engineering conference. + button_text: Watch the talks + url: https://www.youtube.com/playlist?list=PL0QYlrC86xQnT3HLh-XgvoTf9F3lbsADf diff --git a/website/blog/metadata.yml b/website/blog/metadata.yml index 20fa93f8d7e..032ab5a760c 100644 --- a/website/blog/metadata.yml +++ b/website/blog/metadata.yml @@ -2,7 +2,7 @@ featured_image: "" # This CTA lives in right sidebar on blog index -featured_cta: "staging" +featured_cta: "coalesce_2023_catchup" # Show or hide hero title, description, cta from blog index show_title: true diff --git a/website/dbt-versions.js b/website/dbt-versions.js index 82de0ad6333..be55c893041 100644 --- a/website/dbt-versions.js +++ b/website/dbt-versions.js @@ -1,4 +1,8 @@ exports.versions = [ + { + version: "1.7", + EOLDate: "2024-10-30", + }, { version: "1.6", EOLDate: "2024-07-31", @@ -19,17 +23,65 @@ exports.versions = [ version: "1.2", EOLDate: "2023-07-26", }, +] + +exports.versionedPages = [ { - version: "1.1", - EOLDate: "2023-04-28", + "page": "reference/resource-configs/store_failures_as", + "firstVersion": "1.7", }, { - version: "1.0", - EOLDate: "2022-12-03" + "page": "docs/build/build-metrics-intro", + "firstVersion": "1.6", + }, + { + "page": "docs/build/sl-getting-started", + "firstVersion": "1.6", + }, + { + "page": "docs/build/about-metricflow", + "firstVersion": "1.6", + }, + { + "page": "docs/build/join-logic", + "firstVersion": "1.6", + }, + { + "page": "docs/build/validation", + "firstVersion": "1.6", + }, + { + "page": "docs/build/semantic-models", + "firstVersion": "1.6", + }, + { + "page": "docs/build/group-by", + "firstVersion": "1.6", + }, + { + "page": "docs/build/entities", + "firstVersion": "1.6", + }, + { + "page": "docs/build/metrics-overview", + "firstVersion": "1.6", + }, + { + "page": "docs/build/cumulative", + "firstVersion": "1.6", + }, + { + "page": "docs/build/derived", + "firstVersion": "1.6", + }, + { + "page": "docs/build/measure-proxy", + "firstVersion": "1.6", + }, + { + "page": "docs/build/ratio", + "firstVersion": "1.6", }, -] - -exports.versionedPages = [ { "page": "reference/commands/clone", "firstVersion": "1.6", @@ -123,69 +175,9 @@ exports.versionedPages = [ "firstVersion": "1.2", }, { - "page": "docs/contributing/testing-a-new-adapter", - "firstVersion": "1.1", - }, - { - "page": "reference/dbt-jinja-functions/selected_resources", - "firstVersion": "1.1", - }, - { - "page": "reference/dbt-jinja-functions/print", - "firstVersion": "1.1", - }, - { - "page": "docs/build/build-metrics-intro", - "firstVersion": "1.6", - }, - { - "page": "docs/build/sl-getting-started", - 
"firstVersion": "1.6", - }, - { - "page": "docs/build/about-metricflow", - "firstVersion": "1.6", - }, - { - "page": "docs/build/join-logic", - "firstVersion": "1.6", - }, - { - "page": "docs/build/validation", - "firstVersion": "1.6", - }, - { - "page": "docs/build/semantic-models", - "firstVersion": "1.6", - }, - { - "page": "docs/build/group-by", - "firstVersion": "1.6", - }, - { - "page": "docs/build/entities", - "firstVersion": "1.6", - }, - { - "page": "docs/build/metrics-overview", - "firstVersion": "1.6", - }, - { - "page": "docs/build/cumulative", - "firstVersion": "1.6", - }, - { - "page": "docs/build/derived", - "firstVersion": "1.6", - }, - { - "page": "docs/build/measure-proxy", - "firstVersion": "1.6", - }, - { - "page": "docs/build/ratio", - "firstVersion": "1.6", - }, + "page": "docs/build/saved-queries", + "firstVersion": "1.7", + } ] exports.versionedCategories = [ diff --git a/website/docs/guides/legacy/best-practices.md b/website/docs/best-practices/best-practice-workflows.md similarity index 97% rename from website/docs/guides/legacy/best-practices.md rename to website/docs/best-practices/best-practice-workflows.md index 0aad86dd2bc..f06e785c6db 100644 --- a/website/docs/guides/legacy/best-practices.md +++ b/website/docs/best-practices/best-practice-workflows.md @@ -1,11 +1,12 @@ --- -title: "Best practices" -id: "best-practices" +title: "Best practices for workflows" +id: "best-practice-workflows" --- This page contains the collective wisdom of experienced users of dbt on how to best use it in your analytics work. Observing these best practices will help your analytics team work as effectively as possible, while implementing the pro-tips will add some polish to your dbt projects! ## Best practice workflows + ### Version control your dbt project All dbt projects should be managed in version control. Git branches should be created to manage development of new features and bug fixes. All code changes should be reviewed by a colleague (or yourself) in a Pull Request prior to merging into `master`. @@ -57,7 +58,7 @@ All subsequent data models should be built on top of these models, reducing the Earlier versions of this documentation recommended implementing “base models” as the first layer of transformation, and gave advice on the SQL within these models. We realized that while the reasons behind this convention were valid, the specific advice around "base models" represented an opinion, so we moved it out of the official documentation. -You can instead find our opinions on [how we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). +You can instead find our opinions on [how we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview). ::: @@ -108,12 +109,10 @@ We often: When developing, it often makes sense to only run the model you are actively working on and any downstream models. You can choose which models to run by using the [model selection syntax](/reference/node-selection/syntax). ### Run only modified models to test changes ("slim CI") -To merge code changes with confidence, you want to know that those changes will not cause breakages elsewhere in your project. For that reason, we recommend running models and tests in a sandboxed environment, separated from your production data, as an automatic check in your git workflow. (If you use GitHub and dbt Cloud, read about [how to set up CI jobs](/docs/deploy/slim-ci-jobs). 
+To merge code changes with confidence, you want to know that those changes will not cause breakages elsewhere in your project. For that reason, we recommend running models and tests in a sandboxed environment, separated from your production data, as an automatic check in your git workflow. (If you use GitHub and dbt Cloud, read about [how to set up CI jobs](/docs/deploy/ci-jobs). At the same time, it costs time (and money) to run and test all the models in your project. This inefficiency feels especially painful if your PR only proposes changes to a handful of models. -New in v0.18.0 - By comparing to artifacts from a previous production run, dbt can determine which models are modified and build them on top of of their unmodified parents. @@ -122,8 +121,6 @@ dbt run -s state:modified+ --defer --state path/to/prod/artifacts dbt test -s state:modified+ --defer --state path/to/prod/artifacts ``` -New in v1.0.0 - By comparing to artifacts from a previous production run, dbt can determine model and test result statuses. - `result:fail` @@ -159,13 +156,6 @@ dbt test --select result:fail --exclude --defer --state path/to/p > Note: If you're using the `--state target/` flag, `result:error` and `result:fail` flags can only be selected concurrently(in the same command) if using the `dbt build` command. `dbt test` will overwrite the `run_results.json` from `dbt run` in a previous command invocation. - - -Only supported by v1.1 or newer. - - - - Only supported by v1.1 or newer. @@ -184,8 +174,6 @@ dbt source freshness # must be run again to compare current to previous state dbt build --select source_status:fresher+ --state path/to/prod/artifacts ``` - - To learn more, read the docs on [state](/reference/node-selection/syntax#about-node-selection). ## Pro-tips for dbt Projects diff --git a/website/docs/guides/best-practices/custom-generic-tests.md b/website/docs/best-practices/custom-generic-tests.md similarity index 95% rename from website/docs/guides/best-practices/custom-generic-tests.md rename to website/docs/best-practices/custom-generic-tests.md index dc23770423e..f2d84e38853 100644 --- a/website/docs/guides/best-practices/custom-generic-tests.md +++ b/website/docs/best-practices/custom-generic-tests.md @@ -6,13 +6,6 @@ displayText: Writing custom generic tests hoverSnippet: Learn how to define your own custom generic tests. --- - - -* `v0.20.0`: Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. They return a number of failing rows, rather than a single numeric value. -* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory - - - dbt ships with [Not Null](/reference/resource-properties/tests#not-null), [Unique](/reference/resource-properties/tests#unique), [Relationships](/reference/resource-properties/tests#relationships), and [Accepted Values](/reference/resource-properties/tests#accepted-values) generic tests. (These used to be called "schema tests," and you'll still see that name in some places.) Under the hood, these generic tests are defined as `test` blocks (like macros) in a globally accessible dbt project. You can find the source code for these tests in the [global project](https://github.com/dbt-labs/dbt-core/tree/main/core/dbt/include/global_project/macros/generic_test_sql). 
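For orientation, here's a small, hedged sketch of how you might exercise these built-in generic tests from the command line once they're declared on a model; the model name `orders` is hypothetical, and `test_type:generic` is dbt's selection method for generic tests.

```shell
# run every test declared on a hypothetical model named `orders`
dbt test --select orders

# or run only the generic (schema) tests across the whole project
dbt test --select "test_type:generic"
```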
:::info
diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices.md b/website/docs/best-practices/dbt-unity-catalog-best-practices.md
similarity index 85%
rename from website/docs/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices.md
rename to website/docs/best-practices/dbt-unity-catalog-best-practices.md
index 8713938db86..89153fe1b86 100644
--- a/website/docs/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices.md
+++ b/website/docs/best-practices/dbt-unity-catalog-best-practices.md
@@ -1,6 +1,13 @@
-# Best practices for dbt and Unity Catalog
+---
+title: "Best practices for dbt and Unity Catalog"
+id: "dbt-unity-catalog-best-practices"
+description: Learn the best practices for using dbt with Databricks Unity Catalog.
+displayText: Best practices for dbt and Unity Catalog
+hoverSnippet: Learn the best practices for using dbt with Databricks Unity Catalog.
+---
-Your Databricks dbt project should be configured after following the ["How to set up your databricks dbt project guide"](how-to-set-up-your-databricks-dbt-project). Now we’re ready to start building a dbt project using Unity Catalog. However, we should first consider how we want to allow dbt users to interact with our different catalogs. We recommend the following best practices to ensure the integrity of your production data:
+
+Your Databricks dbt project should be configured after following the ["How to set up your Databricks dbt project guide"](/guides/set-up-your-databricks-dbt-project). Now we’re ready to start building a dbt project using Unity Catalog. However, we should first consider how we want to allow dbt users to interact with our different catalogs. We recommend the following best practices to ensure the integrity of your production data:
 ## Isolate your Bronze (aka source) data
@@ -53,9 +60,9 @@ Ready to start transforming your Unity Catalog datasets with dbt?
Check out the resources below for guides, tips, and best practices: -- [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) +- [How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) - [Self-paced dbt fundamentals training videos](https://courses.getdbt.com/courses/fundamentals) -- [Customizing CI/CD](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/1-cicd-background) & [SQL linting](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/2-lint-on-push) -- [Debugging errors](https://docs.getdbt.com/guides/best-practices/debugging-errors) -- [Writing custom generic tests](https://docs.getdbt.com/guides/best-practices/writing-custom-generic-tests) -- [dbt packages hub](https://hub.getdbt.com/) \ No newline at end of file +- [Customizing CI/CD](/guides/custom-cicd-pipelines) +- [Debugging errors](/guides/debug-errors) +- [Writing custom generic tests](/best-practices/writing-custom-generic-tests) +- [dbt packages hub](https://hub.getdbt.com/) diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md new file mode 100644 index 00000000000..19c6717063c --- /dev/null +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md @@ -0,0 +1,38 @@ +--- +title: "Intro to MetricFlow" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +:::tip +**This is a guide for a beta product.** We anticipate this guide will evolve alongside the Semantic Layer through community collaboration. We welcome discussions, ideas, issues, and contributions to refining best practices. +::: + +Flying cars, hoverboards, and true self-service analytics: this is the future we were promised. The first two might still be a few years out, but real self-service analytics is here today. With dbt Cloud's Semantic Layer, you can resolve the tension between accuracy and flexibility that has hampered analytics tools for years, empowering everybody in your organization to explore a shared reality of metrics. Best of all for analytics engineers, building with these new tools will significantly [DRY](https://docs.getdbt.com/terms/dry) up and simplify your codebase. As you'll see, the deep interaction between your dbt models and the Semantic Layer make your dbt project the ideal place to craft your metrics. + +## Learning goals + +- ❓ Understand the **purpose and capabilities** of the **dbt Semantic Layer**, particularly MetricFlow as the engine that powers it. +- 🧱 Familiarity with the core components of MetricFlow — **semantic models and metrics** — and how they work together. +- 🛠️ Hands-on **experience building** semantic models and metrics in dbt Cloud. +- 🔁 Know how to **refactor** models for MetricFlow. +- 🏅 Aware of new **best practices** to take maximum advantage of the Semantic Layer. + +## Guide structure overview + +We'll work through our learning goals via an [example project](https://github.com/dbt-labs/jaffle-sl-template), we encourage you to follow along and try the code out for yourself if you'd like on the `start-here` branch, or you can just follow along with the completed state of the codebase on the `main` branch. + +1. Getting **setup** with MetricFlow in your dbt project. +2. 
Building your first **semantic model** and its fundamental parts: **entities, dimensions, and measures**.
+3. Building your first **metric**.
+4. **Refactoring** a mart into the Semantic Layer.
+5. Defining **advanced metrics**: `ratio` and `derived` types.
+6. Reviewing **best practices**.
+
+If you're ready to ship your users more power with less code, let's dive in!
+
+:::info
+MetricFlow is a new way to define metrics in dbt and one of the key components of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). It handles SQL query construction and defines the specification for dbt semantic models and metrics.
+
+To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/).
+:::
diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md
new file mode 100644
index 00000000000..ffbd78b939c
--- /dev/null
+++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md
@@ -0,0 +1,43 @@
+---
+title: "Set up MetricFlow"
+description: Getting started with dbt and MetricFlow
+hoverSnippet: Learn how to get started with dbt and MetricFlow
+---
+
+## Getting started
+
+First, if you want to follow along, clone the [example project](https://github.com/dbt-labs/jaffle-sl-template). For now, you'll need access to a Snowflake, BigQuery, Databricks, or Postgres warehouse. The project is our classic Jaffle Shop, a simulated chain restaurant serving [jaffles](https://en.wikipedia.org/wiki/Pie_iron) and tasty beverages.
+
+```shell
+git clone git@github.com:dbt-labs/jaffle-sl-template.git
+cd path/to/project
+```
+
+Next, before you start writing code, you need to install MetricFlow as an extension of a dbt adapter from PyPI (dbt Core users only). MetricFlow is compatible with Python versions 3.8 through 3.11.
+
+We'll use pip to install MetricFlow and our dbt adapter:
+
+```shell
+# activate a virtual environment for your project,
+# if you don't have a name you like to use we suggest .venv
+python -m venv [virtual environment name]
+source [virtual environment name]/bin/activate
+# install dbt and MetricFlow
+pip install "dbt-metricflow[adapter name]"
+# e.g. pip install "dbt-metricflow[snowflake]"
+```
+
+Lastly, to get to the pre-Semantic Layer starting state, check out the `start-here` branch.
+
+```shell
+git checkout start-here
+```
+
+For more information, refer to the [MetricFlow commands](/docs/build/metricflow-commands) or a [quickstart](/guides) to get more familiar with setting up a dbt project.
+
+## Basic commands
+
+- 💻 This package installs both `dbt` and `mf` as CLIs in our virtual environment. All the regular `dbt` commands like `run`, `build`, and `test` are available.
+- 🔍 A less common one that will come in handy with the Semantic Layer is `dbt parse`. This parses your project and generates a **semantic manifest**, a representation of meaningful connections described by your project. This file gives MetricFlow a **state of the world from which to generate queries**.
+- 🧰 In addition to `dbt`, you'll have access to `mf` commands like `query` and `validate-configs`, which operate based on that semantic manifest. We'll dig more into all of these as we go along.
+- 🛠️ Let's start off by running a `dbt build` to get the **starting state** of our project built; a sketch of this basic workflow follows below.
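Putting those commands together, here's a minimal sketch of the development loop described above. It assumes you're running from the project root with the virtual environment activated; the `revenue` metric in the last command doesn't exist yet, it's the metric we'll build later in this guide.

```shell
# build the starting state of the project (models, tests, seeds)
dbt build

# parse the project to generate the semantic manifest MetricFlow reads
dbt parse

# validate semantic models and metrics, both internally and against the warehouse
mf validate-configs

# once a metric such as `revenue` exists, preview it during development
mf query --metrics revenue --group-by metric_time__day
```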
diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md new file mode 100644 index 00000000000..a2dc55e37ae --- /dev/null +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md @@ -0,0 +1,296 @@ +--- +title: "Building semantic models" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## How to build a semantic model + +A semantic model is the MetricFlow equivalent to a logical layer model (what historically has just been called a 'model' in dbt land). Just as configurations for models are defined on the `models:` YAML key, configurations for semantic models are housed under `semantic models:`. A key difference is that while a logical model consists of configuration and SQL or Python code, a **semantic model is defined purely via YAML**. Rather than encoding a specific dataset, a **semantic model describes relationships** that let your end users select and refine their own datasets reliably. + +- ⚙️ Semantic models are **comprised of three components**: + - 🫂 **entities**: these describe the **relationships** between various semantic models (think ids) + - 🪣 **dimensions**: these are the columns you want to **slice, dice, group, and filter by** (think timestamps, categories, booleans). + - 📏 **measures**: these are the **quantitative values you want to aggregate** +- 📚 We define **columns as being an entity, dimension, or measure**. + +:::tip +**File per model**. Given the interdependence of logical and semantic models, and semantic models and metrics, we've updated our best practice recommendation to a one YAML file per model approach if you're using the Semantic Layer. This houses everything related to a model in one place and preserves unique file names for quickly getting to the code you want. +::: + +## Defining orders + +- 🥪 The semantic model we're going to define is _orders_. +- 📗 We define it as a **YAML dictionary in the semantic models list**. +- 📑 It will have a **name, entities list, dimensions list, and measures list**. +- ⏬ We recommend defining them **in this order consistently** as a style best practice. + +```YAML +semantic_models: + - name: orders + entities: + ... + dimensions: + ... + measures: + ... +``` + +- Next we'll point to the corresponding logical model by supplying a [`ref`](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) in the `model:` property, and a `description` for documentation. + +```YAML +semantic_models: + - name: orders + description: | + Model containing order data. The grain of the table is the order id. + model: ref('stg_orders') + entities: + ... + dimensions: + ... + measures: + ... +``` + +## Establishing our entities + +- 🫂 Entities are the **objects and concepts** in our data that _have_ dimensions and measures. You can think of them as the **nouns** of our project, the **spines** of our queries that we may want to aggregate by, or simply the **join keys**. +- 🔀 Entities help MetricFlow understand **how various semantic models relate to one another**. +- ⛓️ Unlike many other semantic layers, in MetricFlow **we do not need to describe joins explicitly**, instead the **relationships are implicitly described by entities**. 
+- 1️⃣ Each semantic model should have **one primary entity** defined for itself, and **any number of foreign entities** for other semantic models it may join to. +- 🫂 Entities require a **name and type** + - 🔑 Types available are **primary**, **foreign**, **unique** or **natural** — we'll be focused on the first two for now, but you can [read more about unique and natural keys](https://docs.getdbt.com/docs/build/entities#entity-types). + +### Entities in action + +If we look at the staging model for orders, we see that it has 3 id columns, so we'll need three entities. + +```SQL +renamed as ( + + select + + ---------- ids + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- properties + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps + ordered_at + + from source +``` + +- 👉 We add them with a **`name`, `type`, and optional `expr`** (expression). The expression can be any valid SQL expression on your platform. +- 📛 If you **don't add an expression**, MetricFlow will **assume the name is equal to the column name** in the underlying logical model. +- 👍 Our best practices pattern is to, whenever possible, provide a `name` that is the singular form of the subject or grain of the table, and use `expr` to specify the precise column name (with `_id` etc). This will let us write **more readable metrics** on top of these semantic models. + +```YAML +semantic_models: + - name: orders + ... + entities: + # we use the column for the name here because order is a reserved word in SQL + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + ... + measures: + ... + +``` + +## Defining our dimensions + +- 🧮 Dimensions are the columns that we want to **filter and group by**, **the adjectives of our project**. They come in three types: + - **categorical** + - **time** + - slowly changing dimensions — [these are covered in the documentation](https://docs.getdbt.com/docs/build/dimensions#scd-type-ii), and a little more complex. To focus on building your mental models of MetricFlow's fundamentals, we won't be using SCDs in this guide. +- ➕ We're **not limited to existing columns**, we can use the `expr` property to add simple computations in our dimensions. +- 📛 Categorical dimensions are the simplest, they simply require a `name` and `type` (type being categorical). **If the `name` property matches the name of the dimension column**, that's it, you're done. If you want or need to use a `name` other than the column name, or do some filtering or computation, **you can supply an optional `expr` property** to evaluate for the dimension. + +### Dimensions in action + +- 👀 Let's look at our staging model again and see what fields we have available. + +```SQL +select + + ---------- ids -> entities + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- numerics -> measures + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps -> dimensions + ordered_at + +from source +``` + +- ⏰ For now the only dimension to add is a **time dimension**. +- 🕰️ At least one **primary time dimension** is **required** for any semantic models that **have measures**. +- 1️⃣ We denote this with the `is_primary` property, or if there is only a one-time dimension supplied it is primary by default. 
Below we only have `ordered_at` as a timestamp so we don't need to specify anything except the maximum granularity we're bucketing to (in this case, day). + +```YAML +dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using [BigQuery](/docs/build/dimensions#time) + type: time + type_params: + time_granularity: day +``` + +:::tip +**Dimensional models**. You may have some models that do not contain measures, just dimensional data that enriches other facts. That's totally fine, a semantic model does not require dimensions or measures, it just needs a primary entity, and if you do have measures, a primary time dimension. + +We'll discuss an alternate situation, dimensional tables that have static numeric values like supply costs or tax rates but no time dimensions, later in the Guide. +::: + +- 🔢 We can also **make a dimension out of a numeric column** that would typically be a measure. +- 🪣 Using `expr` we can **create buckets of values that we label** for our dimension. We'll add one of these in for labeling 'large orders' as any order totals over $50. + +```YAML +... +dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end +... +``` + +## Making our measures + +- 📏 Measures are the final component of a semantic model. They describe the **numeric values that we want to aggregate**. +- 🧱 Measures form **the building blocks of metrics**, with entities and dimensions helping us combine, group, and filter those metrics correctly. +- 🏃 You can think of them as something like the **verbs of a semantic model**. + +### Measures in action + +- 👀 Let's look at **our staging model** one last time and see what **fields we want to measure**. + +```SQL +select + + ---------- ids -> entities + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- numerics -> measures + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps -> dimensions + ordered_at + +from source +``` + +- ➕ Here `order_total` and `tax paid` are the **columns we want as measures**. +- 📝 We can describe them via the code below, specifying a **name, description, aggregation, and expression**. +- 👍 As before MetricFlow we default to the **name being the name of a column when no expression is supplied**. +- 🧮 [Many different aggregations](https://docs.getdbt.com/docs/build/measures#aggregation) are available to us. Here we just want sums. + +```YAML +measures: + - name: order_total + description: The total amount for each order including taxes. + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + +- 🆕 We can also **create new measures using expressions**, for instance adding a count of individual orders as below. + +```YAML + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum +``` + +## Validating configs + +Our completed code should look like this, our first semantic model! + +```orders +semantic_models: + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. 
+ + model: ref('stg_orders') + + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end + + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + +- 🦺 We can check that it's a valid configuration and works with the real data our dbt project is generating by using the `mf validate-configs` command. This will: + 1. **Parse the semantic manifest** our configuration describes out of the dbt project. + 2. Validate the **internal semantics** of the manifest as described by our code. + 3. Validate the **external semantics** of the manifest against your data warehouse (e.g. making sure that a column specified as a dimension exists on the proper table) + +## Review and next steps + +Let's review the basics of semantic models: + +- 🧱 Consist off **entities, dimensions, and measures**. +- 🫂 Describe the **semantics and relationships of objects** in the warehouse. +- 1️⃣ Correspond to a **single logical model** in your dbt project. + +Next up, let's use our new semantic model to **build a metric**! diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md new file mode 100644 index 00000000000..da83adbdc69 --- /dev/null +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md @@ -0,0 +1,48 @@ +--- +title: "Building metrics" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## How to build metrics + +- 💹 We'll start with one of the most important metrics for any business: **revenue**. +- 📖 For now, our metric for revenue will be **defined as the sum of order totals excluding tax**. +- 🆕 Let's create a file called `metrics.yml` in our marts folder for now to write our first metric in. + +## Defining revenue + +- 🔢 Metrics have four basic properties: + - `name:` We'll use 'revenue' to reference this metric. + - `description:` For documentation. + - `label:` The display name for the metric in downstream tools. + - `type:` one of `simple`, `ratio`, or `derived`. +- 🎛️ Each type has different `type_params`. +- 🛠️ We'll build a **simple metric** first to get the hang of it, and move on to ratio and derived metrics later. +- 📏 Simple metrics are built on a **single measure defined as a type parameter**. +- 🔜 Defining **measures as their own distinct component** on semantic models is critical to allowing the **flexibility of more advanced metrics**, though simple metrics act mainly as **pass-through that provide filtering** and labeling options. A `create_metric` option for measures is coming in the next version of MetricFlow to **save you writing extra code** for simple metrics that make no changes to the underlying measure. + +```YAML +metrics: + - name: revenue + description: Sum of the order total. 
+    label: Revenue
+    type: simple
+    type_params:
+      measure: order_total
+```
+
+## Query your metric
+
+Use [MetricFlow commands](/docs/build/metricflow-commands#metricflow) for metric validation or queries during development, and apply the following conventions based on your environment:
+
+- For dbt Cloud, use the `dbt sl` prefix before the command (such as `dbt sl query`).
+- For dbt Core, use the `mf` prefix (such as `mf validate-configs` or `mf query`).
+
+Follow these best practices when updating your semantic layer code, using the `mf` command as an example (replace `mf` with `dbt sl` if you're using dbt Cloud):
+
+- Any time you update your semantic layer code, validate your configs by running `dbt parse` if you're using dbt Cloud, or `dbt parse && mf validate-configs` if you're using dbt Core.
+- If everything passes, we can start querying this metric with `mf query`!
+- `mf query` is not how you would use the tool in production; that's handled by the dbt Cloud Semantic Layer's features. It's available for testing the results of various metric queries in development, exactly as we're using it now.
+- Try `mf query --metrics revenue --group-by metric_time__day` and see a preview of the data come back.
+- Note the structure of the above query. We select the metric(s) we want and the dimensions to group them by — we use dunders (double underscores, e.g. `metric_time__[time bucket]`) to designate time dimensions or other non-unique dimensions that need a specified entity path to resolve (e.g. if you have an orders location dimension and an employee location dimension both named 'location', you would need dunders to specify `orders__location` or `employee__location`).
diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md
new file mode 100644
index 00000000000..dfdba2941e9
--- /dev/null
+++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md
@@ -0,0 +1,242 @@
+---
+title: "Refactor an existing mart"
+description: Getting started with dbt and MetricFlow
+hoverSnippet: Learn how to get started with dbt and MetricFlow
+---
+
+## A new approach
+
+We've covered the basics; now it's time to dig into the fun and messy part: how do we refactor an existing mart in dbt into semantic models and metrics?
+
+Let's look at the differences we can observe in how we might approach this with MetricFlow supercharging dbt versus how we work without a Semantic Layer. These differences can then inform our structure.
+
+- 🍊 In dbt, we tend to create **highly denormalized datasets** that bring **everything you want around a certain entity or process into a single table**.
+- 💜 The problem is, this **limits the dimensionality available to MetricFlow**. The more we pre-compute and 'freeze' into place, the less flexible our data is.
+- 🚰 In MetricFlow, we ideally want **highly normalized**, star schema-like data that then allows MetricFlow to shine as a **denormalization engine**.
+- ∞ Another way to think about this is that instead of moving down a list of requested priorities trying to pre-make as many combinations of our marts as possible — increasing lines of code and complexity — we can **let MetricFlow present every combination possible without specifically coding it** (see the query sketch after this list).
+- 🏗️ To resolve these approaches optimally, we'll need to shift some **fundamental aspects of our modeling strategy**.
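To make that contrast concrete, here's a hedged sketch of what letting MetricFlow present every combination looks like in practice. It assumes the `revenue` metric defined earlier and a dimension like `order__is_food_order` from the example project's semantic models; substitute whatever dimensions your own project exposes.

```shell
# one metric, one time grain
mf query --metrics revenue --group-by metric_time__month

# add or swap dimensions at query time, with no new mart required
mf query --metrics revenue --group-by metric_time__week,order__is_food_order
```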
+
+## Refactor steps outlined
+
+We recommend an incremental implementation process that looks something like this:
+
+1. 👉 Identify **an important output** (a revenue chart on a dashboard, for example) and the mart model(s) that supply this output.
+2. 🔍 Examine all the **entities that are components** of this mart (for instance, an orders mart may include customers, shipping, and product data).
+3. 🛠️ **Build semantic models and metrics** for all the required components.
+4. 👯 Create a **clone of the output** on top of the Semantic Layer.
+5. 💻 Audit to **ensure you get accurate outputs**.
+6. 💎 Use `mf list dimensions --metrics [metric_name]` to check that your refactoring is increasing dimensionality (flexibility).
+7. 👉 Identify **any other outputs** that point to the mart and **move them to the Semantic Layer**.
+8. ✌️ Put a **deprecation plan** in place for the mart.
+
+You would then **continue this process** on other outputs and marts, moving down a list of **priorities**. Each model gets faster and easier as you go along, because you'll **reuse many of the same components** that have already been semantically modeled.
+
+## Let's make a `revenue` metric
+
+So far we've been working in a new file, pointing at a staging model to simplify things as we build new mental models for MetricFlow. In reality, unless you're implementing MetricFlow in a green-field dbt project, you're probably going to have some refactoring to do. So let's get into that in detail.
+
+1. 📚 Per the above steps, we've identified our target; now we need to identify all the components it's built from, which will be the 'import' CTEs at the top of our mart. Looking at `orders` and `order_items`, the likely models to generate revenue, we see we'll need `orders`, `order_items`, `products`, `locations`, and `supplies`.
+2. 🗺️ We'll next make semantic models for all of these. Let's walk through a straightforward conversion first with `locations`.
+3. ⛓️ We'll first want to decide if we need to do any joining to get this into the shape we want for our semantic model. The biggest determinants of this are two factors:
+   - 📏 Does this semantic model **contain measures**?
+   - 🕥 Does this semantic model have a **primary timestamp**?
+   - 🫂 If a semantic model **has measures but no timestamp** (for example, supplies in the example project, which has static costs of supplies), you'll likely want to **sacrifice some normalization and join it onto another model** that has a primary timestamp to allow for metric aggregation.
+4. 🔄 If we _don't_ need any joins, we'll just go straight to the staging model for our semantic model's `ref`. Locations does have a `tax_rate` measure, but it also has an `opened_at` timestamp, so we can go **straight to the staging model** here.
+5. 🥇 We specify our **primary entity** (based on `location_id`), dimensions (one categorical, `location_name`, and one **primary time dimension**, `opened_at`), and lastly our measures, in this case just `average_tax_rate`.
+
+   ```YAML
+   semantic_models:
+     - name: locations
+       description: |
+         Location dimension table. The grain of the table is one row per location.
+       model: ref('stg_locations')
+       entities:
+         - name: location
+           type: primary
+           expr: location_id
+       dimensions:
+         - name: location_name
+           type: categorical
+         - name: opened_at
+           expr: date_trunc('day', opened_at)
+           type: time
+           type_params:
+             time_granularity: day
+       measures:
+         - name: average_tax_rate
+           description: Average tax rate.
+ expr: tax_rate + agg: avg + ``` + +## Semantic and logical interaction + +Now, let's tackle a thornier situation. Products and supplies both have dimensions and measures but no time dimension. Products has a one-to-one relationship with `order_items`, enriching that table, which is itself just a mapping table of products to orders. Additionally, products have a one-to-many relationship with supplies. The high-level ERD looks like the diagram below. + + + +So to calculate, for instance, the cost of ingredients and supplies for a given order, we'll need to do some joining and aggregating, but again we **lack a time dimension for products and supplies**. This is the signal to us that we'll **need to build a logical mart** and point our semantic model at that. + +:::tip +**dbt 🧡 MetricFlow.** This is where integrating your semantic definitions into your dbt project really starts to pay dividends. The interaction between the logical and semantic layers is so dynamic, you either need to house them in one codebase or facilitate a lot of cross-project communication and dependency. +::: + +1. 🎯 Let's aim at, to start, building a table at the `order_items` grain. We can aggregate supply costs up, map over the fields we want from products, such as price, and bring the `ordered_at` timestamp we need over from the orders table. We'll write the following code in `models/marts/order_items.sql`. + + ```SQL + {{ + config( + materialized = 'table', + ) + }} + + with + + order_items as ( + + select * from {{ ref('stg_order_items') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders')}} + + ), + + products as ( + + select * from {{ ref('stg_products') }} + + ), + + supplies as ( + + select * from {{ ref('stg_supplies') }} + + ), + + order_supplies_summary as ( + + select + product_id, + sum(supply_cost) as supply_cost + + from supplies + + group by 1 + ), + + joined as ( + + select + order_items.*, + products.product_price, + order_supplies_summary.supply_cost, + products.is_food_item, + products.is_drink_item, + orders.ordered_at + + from order_items + + left join orders on order_items.order_id = orders.order_id + + left join products on order_items.product_id = products.product_id + + left join order_supplies_summary on order_items.product_id = order_supplies_summary.product_id + + ) + + select * from joined + ``` + +2. 🏗️ Now we've got a table that looks more like what we want to feed into MetricFlow. Next, we'll **build a semantic model on top of this new mart** in `models/marts/order_items.yml`. Again, we'll identify our **entities, then dimensions, then measures**. + + ```YAML + semantic_models: + #The name of the semantic model. + - name: order_items + defaults: + agg_time_dimension: ordered_at + description: | + Items contatined in each order. The grain of the table is one row per order item. + model: ref('order_items') + entities: + - name: order_item + type: primary + expr: order_item_id + - name: order_id + type: foreign + expr: order_id + - name: product + type: foreign + expr: product_id + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + type: time + type_params: + time_granularity: day + - name: is_food_item + type: categorical + - name: is_drink_item + type: categorical + measures: + - name: revenue + description: The revenue generated for each order item. Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: product_price + - name: food_revenue + description: The revenue generated for each order item. 
Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: case when is_food_item = 1 then product_price else 0 end + - name: drink_revenue + description: The revenue generated for each order item. Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: case when is_drink_item = 1 then product_price else 0 end + - name: median_revenue + description: The median revenue generated for each order item. + agg: median + expr: product_price + ``` + +3. 📏 Finally, Let's **build a simple revenue metric** on top of our semantic model now. + + ```YAML + metrics: + - name: revenue + description: Sum of the product revenue for each order item. Excludes tax. + type: simple + label: Revenue + type_params: + measure: revenue + ``` + +## Checking our work + +- 🔍 We always will start our **auditing** with a `dbt parse && mf validate-configs` to **ensure our code works** before we examine its output. +- 👯 If we're working there, we'll move to trying out an `mf query` that **replicates the logic of the output** we're trying to refactor. +- 💸 For our example we want to **audit monthly revenue**, to do that we'd run the query below. You can [read more about the MetricFlow CLI](https://docs.getdbt.com/docs/build/metricflow-cli). + +### Example query + +```shell +mf query --metrics revenue --group-by metric_time__month +``` + +### Example query results + +```shell +✔ Success 🦄 - query completed after 1.02 seconds +| METRIC_TIME__MONTH | REVENUE | +|:---------------------|----------:| +| 2016-09-01 00:00:00 | 17032.00 | +| 2016-10-01 00:00:00 | 20684.00 | +| 2016-11-01 00:00:00 | 26338.00 | +| 2016-12-01 00:00:00 | 10685.00 | +``` + +- Try introducing some other dimensions from the semantic models into the `group-by` arguments to get a feel for this command. + +## An alternate approach + +If you **don't have capacity to refactor** some of your marts, they can **still benefit from the Semantic Layer**. The above process is about **maximizing dimensionality** for the long term. In the short term, making your **marts as-is available to MetricFlow** unlocks greatly increased functionality. For an example of this quicker approach check out the `customers` SQL and YAML files on the `main` branch. This displays a **typical denormalized dbt mart** being hooked into MetricFlow. diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md new file mode 100644 index 00000000000..fe7438b5800 --- /dev/null +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md @@ -0,0 +1,79 @@ +--- +title: "More advanced metrics" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## More advanced metric types + +We're not limited to just passing measures through to our metrics, we can also _combine_ measures to model more advanced metrics. + +- 🍊 **Ratio** metrics are, as the name implies, about **comparing two metrics as a numerator and a denominator** to form a new metric, for instance the percentage of order items that are food items instead of drinks. +- 🧱 **Derived** metrics are when we want to **write an expression** that calculates a metric **using multiple metrics**. A classic example here is our gross profit calculated by subtracting costs from revenue. 
+- ➕ **Cumulative** metrics calculate all of a **measure over a given window**, such as the past week, or if no window is supplied, the all-time total of that measure. + +## Ratio metrics + +- 🔢 We need to establish one measure that will be our **numerator**, and one that will be our **denominator**. +- 🥪 Let's calculate the **percentage** of our Jaffle Shop revenue that **comes from food items**. +- 💰 We already have our denominator, revenue, but we'll want to **make a new metric for our numerator** called `food_revenue`. + +```YAML + - name: food_revenue + description: The revenue from food in each order. + label: Food Revenue + type: simple + type_params: + measure: revenue + filter: | + {{ Dimension('order__is_food_order') }} = true +``` + +- 📝 Now we can set up our ratio metric. + +```YAML +- name: food_revenue_pct + description: The % of order revenue from food. + label: Food Revenue % + type: ratio + type_params: + numerator: food_revenue + denominator: revenue +``` + +## Derived metrics + +- 🆙 Now let's really have some fun. One of the most important metrics for any business is not just revenue, but _revenue growth_. Let's use a derived metric to build month-over-month revenue. +- ⚙️ A derived metric has a couple key components: + - 📚 A list of metrics to build on. These can be manipulated and filtered in various way, here we'll use the `offset_window` property to lag by a month. + - 🧮 An expression that performs a calculation with these metrics. +- With these parts we can assemble complex logic that would otherwise need to be 'frozen' in logical models. + +```YAML +- name: revenue_growth_mom + description: "Percentage growth of revenue compared to 1 month ago. Excluded tax" + type: derived + label: Revenue Growth % M/M + type_params: + expr: (current_revenue - revenue_prev_month) * 100 / revenue_prev_month + metrics: + - name: revenue + alias: current_revenue + - name: revenue + offset_window: 1 month + alias: revenue_prev_month +``` + +## Cumulative metrics + +- ➕ Lastly, lets build a **cumulative metric**. In keeping with our theme of business priorities, let's continue with revenue and build an **all-time revenue metric** for any given time window. +- 🪟 All we need to do is indicate the type is `cumulative` and not supply a `window` in the `type_params`, which indicates we want cumulative for the entire time period our end users select. + +```YAML +- name: cumulative_revenue + description: The cumulative revenue for all orders. + label: Cumulative Revenue (All Time) + type: cumulative + type_params: + measure: revenue +``` diff --git a/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md new file mode 100644 index 00000000000..a1062721177 --- /dev/null +++ b/website/docs/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md @@ -0,0 +1,34 @@ +--- +title: "Best practices" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## Putting it all together + +- 📊 We've **created semantic models and metrics** for basic coverage of a key business area. +- 🔁 In doing so we've **refactored a 'static' mart** into a dynamic, flexible new life in the Semantic Layer. 
+- 🗺️ We encourage you to **explore the `main` branch** of the [example project repo](https://github.com/dbt-labs/jaffle-sl-template) to see even more metrics and semantic models in action within a project fully ported to the Semantic Layer. + +## Best practices + +- ✅ **Prefer normalization** when possible to allow MetricFlow to denormalize dynamically for end users. +- ✅ Use **marts to denormalize** when needed, for instance grouping tables together into richer components, or getting measures on dimensional tables attached to a table with a time spine. +- ✅ When source data is **well normalized** you can **build semantic models on top of staging models**. +- ✅ **Prefer** computing values in **measures and metrics** when possible as opposed to in fixed marts. +- ❌ **Don't directly refactor the code you have in production**, build in parallel so you can audit the Semantic Layer output and deprecate old marts gracefully. + +## Key commands + +- 🔑 Use `dbt parse && mf validate-configs` to generate a semantic manifest and ensure it works with your data. +- 🔑 Use `mf list dimensions --metrics [metric name]` to check that you're increasing dimensionality as you progress. +- 🔑 Use `mf query [query options]` to preview the output from your metrics as you develop. + +## Next steps + +- 🗺️ Map out a clear plan for your dbt project to **incrementally adopt the Semantic Layer**. +- 🤗 Get involved in the community and ask questions, **help craft best practices**, and share your progress in building a dbt Semantic Layer. + +The dbt Semantic Layer is the biggest paradigm shift thus far in the young practice of analytics engineering. It's ready to provide value right away, but is most impactful if you move your project towards increasing normalization, and allow MetricFlow to do the denormalization for you with maximum dimensionality. + +We will be releasing more resources soon covering implementation of the Semantic Layer in dbt Cloud with various integrated BI tools. This is just the beginning, hopefully this guide has given you a path forward for building your data platform in this new era. diff --git a/website/docs/best-practices/how-we-mesh/mesh-1-intro.md b/website/docs/best-practices/how-we-mesh/mesh-1-intro.md new file mode 100644 index 00000000000..ba1660a8d82 --- /dev/null +++ b/website/docs/best-practices/how-we-mesh/mesh-1-intro.md @@ -0,0 +1,39 @@ +--- +title: "Intro to dbt Mesh" +description: Getting started with dbt Mesh patterns +hoverSnippet: Learn how to get started with dbt Mesh +--- + +## What is dbt Mesh? + +Organizations of all sizes rely upon dbt to manage their data transformations, from small startups to large enterprises. At scale, it can be challenging to coordinate all the organizational and technical requirements demanded by your stakeholders within the scope of a single dbt project. To date, there also hasn't been a first-class way to effectively manage the dependencies, governance, and workflows between multiple dbt projects. + +Regardless of your organization's size and complexity, dbt should empower data teams to work independently and collaboratively; sharing data, code, and best practices without sacrificing security or autonomy. dbt Mesh provides the tooling for teams to finally achieve this. 
+ +dbt Mesh is not a single product: it is a pattern enabled by a convergence of several features in dbt: + +- **[Cross-project references](/docs/collaborate/govern/project-dependencies#how-to-use-ref)** - this is the foundational feature that enables the multi-project deployments. `{{ ref() }}`s now work across dbt Cloud projects on Enterprise plans. +- **[dbt Explorer](/docs/collaborate/explore-projects)** - dbt Cloud's metadata-powered documentation platform, complete with full, cross-project lineage. +- **Governance** - dbt's new governance features allow you to manage access to your dbt models both within and across projects. + - **[Groups](/docs/collaborate/govern/model-access#groups)** - groups allow you to assign models to subsets within a project. + - **[Access](/docs/collaborate/govern/model-access#access-modifiers)** - access configs allow you to control who can reference models. +- **[Model Versions](/docs/collaborate/govern/model-versions)** - when coordinating across projects and teams, we recommend treating your data models as stable APIs. Model versioning is the mechanism to allow graceful adoption and deprecation of models as they evolve. +- **[Model Contracts](/docs/collaborate/govern/model-contracts)** - data contracts set explicit expectations on the shape of the data to ensure data changes upstream of dbt or within a project's logic don't break downstream consumers' data products. + +## Who is dbt Mesh for? + +The multi-project architecture helps organizations with mature, complex transformation workflows in dbt increase the flexibility and performance of their dbt projects. If you're already using dbt and your project has started to experience any of the following, you're likely ready to start exploring this paradigm: + +- The **number of models** in your project is degrading performance and slowing down development. +- Teams have developed **separate workflows** and need to decouple development from each other. +- **Security and governance** requirements are increasing and would benefit from increased isolation. + +dbt Cloud is designed to coordinate the features above and simplify the complexity to solve for these problems. + +If you're just starting your dbt journey, don't worry about building a multi-project architecture right away. You can _incrementally_ adopt the features in this guide as you scale. The collection of features work effectively as independent tools. Familiarizing yourself with the tooling and features that make up a multi-project architecture, and how they can apply to your organization will help you make better decisions as you grow. + +## Learning goals + +- Understand the **purpose and tradeoffs** of building a multi-project architecture. +- Develop an intuition for various **dbt Mesh patterns** and how to design a multi-project architecture for your organization. +- Establish recommended steps to **incrementally adopt** these patterns in your dbt implementation. diff --git a/website/docs/best-practices/how-we-mesh/mesh-2-structures.md b/website/docs/best-practices/how-we-mesh/mesh-2-structures.md new file mode 100644 index 00000000000..9ab633c50ad --- /dev/null +++ b/website/docs/best-practices/how-we-mesh/mesh-2-structures.md @@ -0,0 +1,56 @@ +--- +title: Deciding how to structure your dbt Mesh +description: Getting started with dbt Mesh patterns +hoverSnippet: Learn how to get started with dbt Mesh +--- +## Exploring mesh patterns + +When adopting a multi-project architecture, where do you draw the lines between projects? 
+ +How should you organize data workflows in a world where instead of having a single dbt DAG, you have multiple projects speaking to each other, each comprised of their own DAG? + +Adopting the dbt Mesh pattern is not a one-size-fits-all process. In fact, it's the opposite! It's about customizing your project structure to fit _your_ team and _your_ data. Now you can mold your organizational knowledge graph to your organizational people graph, bringing people and data closer together rather than compromising one for the other. + +While there is not a single best way to implement this pattern, there are some common decision points that will be helpful for you to consider. + +At a high level, you’ll need to decide: + +- Where to draw the lines between your dbt Projects -- i.e. how do you determine where to split your DAG and which models go in which project? +- How to manage your code -- do you want multiple dbt Projects living in the same repository (mono-repo) or do you want to have multiple repos with one repo per project? + +### Cycle detection + +Like resource dependencies, project dependencies are acyclic, meaning they only move in one direction. This prevents `ref` cycles (or loops), which lead to issues with your data workflows. For example, if project B depends on project A, a new model in project A could not import and use a public model from project B. Refer to [Project dependencies](/docs/collaborate/govern/project-dependencies#how-to-use-ref) for more information. + +## Define your project interfaces by splitting your DAG + +The first (and perhaps most difficult!) decision when migrating to a multi-project architecture is deciding where to draw the line in your DAG to define the interfaces between your projects. Let's explore some language for discussing the design of these patterns. + +### Vertical splits + +Vertical splits separate out layers of transformation in DAG order. Let's look at some examples. + +- **Splitting up staging and mart layers** to create a more tightly-controlled, shared set of components that other projects build on but can't edit. +- **Isolating earlier models for security and governance requirements** to separate out and mask PII data so that downstream consumers can't access it is a common use case for a vertical split. +- **Protecting complex or expensive data** to isolate large or complex models that are expensive to run so that they are safe from accidental selection, independently deployable, and easier to debug when they have issues. + +### Horizontal splits + +Horizontal splits separate your DAG based on source or domain. These splits are often based around the shape and size of the data and how it's used. Let's consider some possibilities for horizontal splitting. + +- **Team consumption patterns.** For example, splitting out the marketing team's data flow into a separate project. +- **Data from different sources.** For example, clickstream event data and transactional ecommerce data may need to be modeled independently of each other. +- **Team workflows.** For example, if two embedded groups operate at different paces, you may want to split the projects up so they can move independently. + +### Combining these strategies + +- **These are not either/or techniques**. You should consider both types of splits, and combine them in any way that makes sense for your organization. +- **Pick one type of split and focus on that first**. 
If you have a hub-and-spoke team topology for example, handle breaking out the central platform project before you split the remainder into domains. Then if you need to break those domains up horizontally you can focus on that after the fact. +- **DRY applies to underlying data, not just code.** Regardless of your strategy, you should not be sourcing the same rows and columns into multiple nodes. When working within a mesh pattern it becomes increasingly important that we don't duplicate logic or data. + +## Determine your git strategy + +A multi-project architecture can exist in a single repo (monorepo) or as multiple projects, with each one being in their own repository (multi-repo). + +- If you're a **smaller team** looking primarily to speed up and simplify development, a **monorepo** is likely the right choice, but can become unwieldy as the number of projects, models and contributors grow. +- If you’re a **larger team with multiple groups**, and need to decouple projects for security and enablement of different development styles and rhythms, a **multi-repo setup** is your best bet. diff --git a/website/docs/best-practices/how-we-mesh/mesh-3-implementation.md b/website/docs/best-practices/how-we-mesh/mesh-3-implementation.md new file mode 100644 index 00000000000..65ed5d7935b --- /dev/null +++ b/website/docs/best-practices/how-we-mesh/mesh-3-implementation.md @@ -0,0 +1,130 @@ +--- +title: "Implementing your mesh plan" +description: Getting started with dbt Mesh patterns +hoverSnippet: Learn how to get started with dbt Mesh +--- + +As mentioned before, the key decision in migrating to a multi-project architecture is understanding how your project is already being grouped, built, and deployed. We can use this information to inform our decision to split our project apart. + +- **Examine your jobs** - which sets of models are most often built together? +- **Look at your lineage graph** - how are models connected? +- **Look at your selectors** defined in `selectors.yml` - how do people already define resource groups? +- **Talk to teams** about what sort of separation naturally exists right now. + - Are there various domains people are focused on? + - Are there various sizes, shapes, and sources of data that get handled separately (such as click event data)? + - Are there people focused on separate levels of transformation, such as landing and staging data or building marts? + +## Add groups and access + +Once you have a sense of some initial groupings, you can first implement **group and access permissions** within a single project. + +- First you can create a [group](/docs/build/groups) to define the owner of a set of models. + +```yml +# in models/__groups.yml + +groups: + - name: marketing + owner: + name: Ben Jaffleck + email: ben.jaffleck@jaffleshop.com +``` + +- Then, we can add models to that group using the `group:` key in the model's YAML entry. + +```yml +# in models/marketing/__models.yml + +models: + - name: fct_marketing_model + group: marketing + - name: stg_marketing_model + group: marketing +``` + +- Once you've added models to the group, you can **add [access](/docs/collaborate/govern/model-access) settings to the models** based on their connections between groups, *opting for the most private access that will maintain current functionality*. 
This means that any model that has *only* relationships to other models in the same group should be `private`, and any model that has cross-group relationships, or is a terminal node in the group DAG, should be `protected` so that other parts of the DAG can continue to reference it. + +```yml +# in models/marketing/__models.yml + +models: + - name: fct_marketing_model + group: marketing + access: protected + - name: stg_marketing_model + group: marketing + access: private +``` + +- **Validate these groups by incrementally migrating your jobs** to execute these groups specifically via selection syntax. We would recommend doing this in parallel to your production jobs until you’re sure about them. This will help you feel out if you’ve drawn the lines in the right place. +- If you find yourself **consistently making changes across multiple groups** when you update logic, that’s a sign that **you may want to rethink your groups**. + +## Split your projects + +1. **Move your grouped models into a subfolder**. This will include any model in the selected group, its associated YAML entry, as well as its parent or child resources as appropriate depending on where this group sits in your DAG. + 1. Note that just like in your dbt project, circular references are not allowed! Project B cannot have parents and children in Project A, for example. +2. **Create a new `dbt_project.yml` file** in the subdirectory. +3. **Copy any macros** used by the resources you moved. +4. **Create a new `packages.yml` file** in your subdirectory with the packages that are used by the resources you moved. +5. **Update `{{ ref }}` functions** — For any model that has a cross-project dependency (this may be in the files you moved, or in the files that remain in your project): + 1. Update the `{{ ref() }}` function to have two arguments, where the first is the name of the source project and the second is the name of the model: e.g. `{{ ref('jaffle_shop', 'my_upstream_model') }}` + 2. Update the upstream, cross-project parents’ `access` configs to `public`, ensuring any project can safely `{{ ref() }}` those models. + 3. We *highly* recommend adding a [model contract](/docs/collaborate/govern/model-contracts) to the upstream models to ensure the data shape is consistent and reliable for your downstream consumers. +6. **Create a `dependencies.yml` file** ([docs](/docs/collaborate/govern/project-dependencies)) for the downstream project, declaring the upstream project as a dependency. + +```yml + +# in dependencies.yml +projects: + - name: jaffle_shop +``` + +### Best practices + +- When you’ve **confirmed the right groups**, it's time to split your projects. + - **Do *one* group at a time**! + - **Do *not* refactor as you migrate**, however tempting that may be. Focus on getting 1-to-1 parity and log any issues you find in doing the migration for later. Once you’ve fully migrated the project, you can start optimizing it for its new life as part of your mesh. +- Start by splitting your project within the same repository for full git tracking and easy reversion if you need to start from scratch. + + +## Connecting existing projects + +Some organizations may already be coordinating across multiple dbt projects. Most often this is via: + +1. Installing parent projects as dbt packages. +2. Using `{{ source() }}` functions to read the outputs of a parent project as inputs to a child project. + +This has a few drawbacks: + +1.
If using packages, each project has to include *all* resources from *all* projects in its manifest, slowing down dbt and the development cycle. +2. If using sources, there are breakages in the lineage, as there's no real connection between the parent and child projects. + +The migration steps here are much simpler than splitting up a monolith! + +1. If using the `package` method: + 1. In the parent project: + 1. mark all models being referenced downstream as `public` and add a model contract. + 2. In the child project: + 1. Remove the package entry from `packages.yml` + 2. Add the upstream project to your `dependencies.yml` + 3. Update the `{{ ref() }}` functions to models from the upstream project to include the project name argument. +1. If using `source` method: + 1. In the parent project: + 1. mark all models being imported downstream as `public` and add a model contract. + 2. In the child project: + 1. Add the upstream project to your `dependencies.yml` + 2. Replace the `{{ source() }}` functions with cross project `{{ ref() }}` functions. + 3. Remove the unnecessary `source` definitions. + +## Additional Resources +### Our example projects + +We've provided a set of example projects you can use to explore the topics covered here. We've split our [Jaffle Shop](https://github.com/dbt-labs/jaffle-shop) project into 3 separate projects in a multi-repo dbt Mesh. Note that you'll need to leverage dbt Cloud to use multi-project architecture, as cross-project references are powered via dbt Cloud's APIs. + +- **[Platform](https://github.com/dbt-labs/jaffle-shop-mesh-platform)** - containing our centralized staging models. +- **[Marketing](https://github.com/dbt-labs/jaffle-shop-mesh-marketing)** - containing our marketing marts. +- **[Finance](https://github.com/dbt-labs/jaffle-shop-mesh-finance)** - containing our finance marts. + +### dbt-meshify + +We recommend using the `dbt-meshify` [command line tool]() to help you do this. This comes with CLI operations to automate most of the above steps. diff --git a/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md b/website/docs/best-practices/how-we-structure/1-guide-overview.md similarity index 90% rename from website/docs/guides/best-practices/how-we-structure/1-guide-overview.md rename to website/docs/best-practices/how-we-structure/1-guide-overview.md index 1bbb628b73d..d1e78231e57 100644 --- a/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md +++ b/website/docs/best-practices/how-we-structure/1-guide-overview.md @@ -14,9 +14,9 @@ Building a great dbt project is an inherently collaborative endeavor, bringing t Famously, Steve Jobs [wore the same outfit everyday](https://images.squarespace-cdn.com/content/v1/5453c539e4b02ab5398ffc8f/1580381503218-E56FQDNFL1P4OBLQWHWW/ke17ZwdGBToddI8pDm48kJKedFpub2aPqa33K4gNUDwUqsxRUqqbr1mOJYKfIPR7LoDQ9mXPOjoJoqy81S2I8N_N4V1vUb5AoIIIbLZhVYxCRW4BPu10St3TBAUQYVKcxb5ZTIyC_D49_DDQq2Sj8YVGtM7O1i4h5tvKa2lazN4nGUQWMS_WcPM-ztWbVr-c/steve_jobs_outfit.jpg) to reduce decision fatigue. You can think of this guide similarly, as a black turtleneck and New Balance sneakers for your company’s dbt project. A dbt project’s power outfit, or more accurately its structure, is composed not of fabric but of files, folders, naming conventions, and programming patterns. 
How you label things, group them, split them up, or bring them together — the system you use to organize the [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) encoded in your dbt project — this is your project’s structure. -This guide is just a starting point. You may decide that you prefer Birkenstocks or a purple hoodie for your project over Jobs-ian minimalism. That's fine. What's important is that you think through the reasoning for those changes in your organization, explicitly declare them in a thorough, accessible way for all contributors, and above all *stay consistent*. +This guide is just a starting point. You may decide that you prefer Birkenstocks or a purple hoodie for your project over Jobs-ian minimalism. That's fine. What's important is that you think through the reasoning for those changes in your organization, explicitly declare them in a thorough, accessible way for all contributors, and above all _stay consistent_. -One foundational principle that applies to all dbt projects though, is the need to establish a cohesive arc moving data from *source-conformed* to *business-conformed*. Source-conformed data is shaped by external systems out of our control, while business-conformed data is shaped by the needs, concepts, and definitions we create. No matter what patterns or conventions you define within your project, this process remains the essential purpose of the transformation layer, and dbt as your tool within it. This guide is an update to a seminal analytics engineering [post of the same name](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) by the great Claire Carroll, and while some of the details have changed over time (as anticipated in that post) this fundamental trajectory holds true. Moving forward, this guide will be iteratively updated as new tools expand our viewpoints, new experiences sharpen our vision, and new voices strengthen our perspectives, but always in service of that aim. +One foundational principle that applies to all dbt projects though, is the need to establish a cohesive arc moving data from _source-conformed_ to _business-conformed_. Source-conformed data is shaped by external systems out of our control, while business-conformed data is shaped by the needs, concepts, and definitions we create. No matter what patterns or conventions you define within your project, this process remains the essential purpose of the transformation layer, and dbt as your tool within it. This guide is an update to a seminal analytics engineering [post of the same name](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) by the great Claire Carroll, and while some of the details have changed over time (as anticipated in that post) this fundamental trajectory holds true. Moving forward, this guide will be iteratively updated as new tools expand our viewpoints, new experiences sharpen our vision, and new voices strengthen our perspectives, but always in service of that aim. 
### Learning goals @@ -24,7 +24,7 @@ This guide has three main goals: - Thoroughly cover our most up-to-date recommendations on how to structure typical dbt projects - Illustrate these recommendations with comprehensive examples -- At each stage, explain *why* we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs +- At each stage, explain _why_ we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs You should walk away from this guide with a deeper mental model of how the components of a dbt project fit together, such that purpose and principles of analytics engineering feel more clear and intuitive. @@ -33,7 +33,7 @@ By approaching our structure intentionally, we’ll gain a better understanding Our hope is that by deepening your sense of the connections between these patterns and the principles they flow from, you'll be able to translate them to fit your specific needs and craft customized documentation for your team to act on. :::info Example project. -This guide walks through our recommendations using a very simple dbt project — similar to the one used for the Getting Started guide and many other demos — from a fictional company called the Jaffle Shop. You can read more about [jaffles](https://en.wiktionary.org/wiki/jaffle) if you want (they *are* a real thing), but that context isn’t important to understand the structure. We encourage you to follow along, try things out, make changes, and take notes on what works or doesn't work for you along the way. +This guide walks through our recommendations using a very simple dbt project — similar to the one used for the Getting Started guide and many other demos — from a fictional company called the Jaffle Shop. You can read more about [jaffles](https://en.wiktionary.org/wiki/jaffle) if you want (they _are_ a real thing), but that context isn’t important to understand the structure. We encourage you to follow along, try things out, make changes, and take notes on what works or doesn't work for you along the way. ::: We'll get a deeper sense of our project as we move through the guide, but for now we just need to know that the Jaffle Shop is a restaurant selling jaffles that has two main data sources: @@ -46,17 +46,17 @@ We'll get a deeper sense of our project as we move through the guide, but for no We'll walk through our topics in the same order that our data would move through transformation: 1. Dig into how we structure the files, folders, and models for our three primary layers in the `models` directory, which build on each other: - 1. **Staging** — creating our atoms, our initial modular building blocks, from source data - 2. **Intermediate** — stacking layers of logic with clear and specific purposes to prepare our staging models to join into the entities we want - 3. **Marts** — bringing together our modular pieces into a wide, rich vision of the entities our organization cares about + 1. **Staging** — creating our atoms, our initial modular building blocks, from source data + 2. **Intermediate** — stacking layers of logic with clear and specific purposes to prepare our staging models to join into the entities we want + 3. **Marts** — bringing together our modular pieces into a wide, rich vision of the entities our organization cares about 2. Explore how these layers fit into the rest of the project: - 1. 
Review the overall structure comprehensively - 2. Expand on YAML configuration in-depth - 3. Discuss how to use the other folders in a dbt project: `tests`, `seeds`, and `analyses` + 1. Review the overall structure comprehensively + 2. Expand on YAML configuration in-depth + 3. Discuss how to use the other folders in a dbt project: `tests`, `seeds`, and `analyses` Below is the complete file tree of the project we’ll be working through. Don’t worry if this looks like a lot of information to take in at once - this is just to give you the full vision of what we’re building towards. We’ll focus in on each of the sections one by one as we break down the project’s structure. -```markdown +```shell jaffle_shop ├── README.md ├── analyses diff --git a/website/docs/guides/best-practices/how-we-structure/2-staging.md b/website/docs/best-practices/how-we-structure/2-staging.md similarity index 86% rename from website/docs/guides/best-practices/how-we-structure/2-staging.md rename to website/docs/best-practices/how-we-structure/2-staging.md index a14c5c8992b..8eb91ff5b7b 100644 --- a/website/docs/guides/best-practices/how-we-structure/2-staging.md +++ b/website/docs/best-practices/how-we-structure/2-staging.md @@ -12,9 +12,9 @@ We'll use an analogy for working with dbt throughout this guide: thinking modula ### Staging: Files and folders -Let's zoom into the staging directory from our `models` file tree [in the overview](/guides/best-practices/how-we-structure/1-guide-overview) and walk through what's going on here. +Let's zoom into the staging directory from our `models` file tree [in the overview](/best-practices/how-we-structure/1-guide-overview) and walk through what's going on here. -```markdown +```shell models/staging ├── jaffle_shop │ ├── _jaffle_shop__docs.md @@ -36,7 +36,7 @@ models/staging - ❌ **Subdirectories based on loader.** Some people attempt to group by how the data is loaded (Fivetran, Stitch, custom syncs), but this is too broad to be useful on a project of any real size. - ❌ **Subdirectories based on business grouping.** Another approach we recommend against is splitting up by business groupings in the staging layer, and creating subdirectories like 'marketing', 'finance', etc. A key goal of any great dbt project should be establishing a single source of truth. By breaking things up too early, we open ourselves up to creating overlap and conflicting definitions (think marketing and financing having different fundamental tables for orders). We want everybody to be building with the same set of atoms, so in our experience, starting our transformations with our staging structure reflecting the source system structures is the best level of grouping for this step. - **File names.** Creating a consistent pattern of file naming is [crucial in dbt](https://docs.getdbt.com/blog/on-the-importance-of-naming). File names must be unique and correspond to the name of the model when selected and created in the warehouse. We recommend putting as much clear information into the file name as possible, including a prefix for the layer the model exists in, important grouping information, and specific information about the entity or transformation in the model. - - ✅ `stg_[source]__[entity]s.sql` - the double underscore between source system and entity helps visually distinguish the separate parts in the case of a source name having multiple words. 
For instance, `google_analytics__campaigns` is always understandable, whereas to somebody unfamiliar `google_analytics_campaigns` could be `analytics_campaigns` from the `google` source system as easily as `campaigns` from the `google_analytics` source system. Think of it like an [oxford comma](https://www.youtube.com/watch?v=P_i1xk07o4g), the extra clarity is very much worth the extra punctuation. + - ✅ `stg_[source]__[entity]s.sql` - the double underscore between source system and entity helps visually distinguish the separate parts in the case of a source name having multiple words. For instance, `google_analytics__campaigns` is always understandable, whereas to somebody unfamiliar `google_analytics_campaigns` could be `analytics_campaigns` from the `google` source system as easily as `campaigns` from the `google_analytics` source system. Think of it like an [oxford comma](https://www.youtube.com/watch?v=P_i1xk07o4g), the extra clarity is very much worth the extra punctuation. - ❌ `stg_[entity].sql` - might be specific enough at first, but will break down in time. Adding the source system into the file name aids in discoverability, and allows understanding where a component model came from even if you aren't looking at the file tree. - ✅ **Plural.** SQL, and particularly SQL in dbt, should read as much like prose as we can achieve. We want to lean into the broad clarity and declarative nature of SQL when possible. As such, unless there’s a single order in your `orders` table, plural is the correct way to describe what is in a table with multiple rows. @@ -77,7 +77,7 @@ renamed as ( -- numerics amount as amount_cents, amount / 100.0 as amount, - + -- booleans case when status = 'successful' then true @@ -102,22 +102,23 @@ select * from renamed - ✅ **Type casting** - ✅ **Basic computations** (e.g. cents to dollars) - ✅ **Categorizing** (using conditional logic to group values into buckets or booleans, such as in the `case when` statements above) - - ❌ **Joins** — the goal of staging models is to clean and prepare individual source conformed concepts for downstream usage. We're creating the most useful version of a source system table, which we can use as a new modular component for our project. In our experience, joins are almost always a bad idea here — they create immediate duplicated computation and confusing relationships that ripple downstream — there are occasionally exceptions though (see [base models](guides/best-practices/how-we-structure/2-staging#staging-other-considerations) below). + - ❌ **Joins** — the goal of staging models is to clean and prepare individual source-conformed concepts for downstream usage. We're creating the most useful version of a source system table, which we can use as a new modular component for our project. In our experience, joins are almost always a bad idea here — they create immediate duplicated computation and confusing relationships that ripple downstream — there are occasionally exceptions though (refer to [base models](#staging-other-considerations) for more info). - ❌ **Aggregations** — aggregations entail grouping, and we're not doing that at this stage. Remember - staging models are your place to create the building blocks you’ll use all throughout the rest of your project — if we start changing the grain of our tables by grouping in this layer, we’ll lose access to source data that we’ll likely need at some point. We just want to get our individual concepts cleaned and ready for use, and will handle aggregating values downstream. 
-- ✅ **Materialized as views.** Looking at a partial view of our `dbt_project.yml` below, we can see that we’ve configured the entire staging directory to be materialized as views. As they’re not intended to be final artifacts themselves, but rather building blocks for later models, staging models should typically be materialized as views for two key reasons: - - Any downstream model (discussed more in [marts](/guides/best-practices/how-we-structure/4-marts)) referencing our staging models will always get the freshest data possible from all of the component views it’s pulling together and materializing +- ✅ **Materialized as views.** Looking at a partial view of our `dbt_project.yml` below, we can see that we’ve configured the entire staging directory to be materialized as views. As they’re not intended to be final artifacts themselves, but rather building blocks for later models, staging models should typically be materialized as views for two key reasons: + + - Any downstream model (discussed more in [marts](/best-practices/how-we-structure/4-marts)) referencing our staging models will always get the freshest data possible from all of the component views it’s pulling together and materializing - It avoids wasting space in the warehouse on models that are not intended to be queried by data consumers, and thus do not need to perform as quickly or efficiently ```yaml # dbt_project.yml - + models: jaffle_shop: staging: +materialized: view ``` -- Staging models are the only place we'll use the [`source` macro](/docs/build/sources), and our staging models should have a 1-to-1 relationship to our source tables. That means for each source system table we’ll have a single staging model referencing it, acting as its entry point — *staging* it — for use downstream. +- Staging models are the only place we'll use the [`source` macro](/docs/build/sources), and our staging models should have a 1-to-1 relationship to our source tables. That means for each source system table we’ll have a single staging model referencing it, acting as its entry point — _staging_ it — for use downstream. :::tip Don’t Repeat Yourself. Staging models help us keep our code DRY. dbt's modular, reusable structure means we can, and should, push any transformations that we’ll always want to use for a given component model as far upstream as possible. This saves us from potentially wasting code, complexity, and compute doing the same transformation more than once. For instance, if we know we always want our monetary values as floats in dollars, but the source system is integers and cents, we want to do the division and type casting as early as possible so that we can reference it rather than redo it repeatedly downstream. @@ -128,94 +129,96 @@ This is a welcome change for many of us who have become used to applying the sam ### Staging: Other considerations - **Base models when joins are necessary to stage concepts.** Sometimes, in order to maintain a clean and DRY staging layer we do need to implement some joins to create a solid concept for our building blocks. In these cases, we recommend creating a sub-directory in the staging directory for the source system in question and building `base` models. These have all the same properties that would normally be in the staging layer, they will directly source the raw data and do the non-joining transformations, then in the staging models we’ll join the requisite base models. 
The most common use cases for building a base layer under a staging folder are: + - ✅ **Joining in separate delete tables**. Sometimes a source system might store deletes in a separate table. Typically we’ll want to make sure we can mark or filter out deleted records for all our component models, so we’ll need to join these delete records up to any of our entities that follow this pattern. This is the example shown below to illustrate. ```sql -- base_jaffle_shop__customers.sql - + with - + source as ( - + select * from {{ source('jaffle_shop','customers') }} - + ), - + customers as ( - + select id as customer_id, first_name, last_name - + from source - + ) - + select * from customers ``` ```sql -- base_jaffle_shop__deleted_customers.sql - + with - + source as ( - + select * from {{ source('jaffle_shop','customer_deletes') }} - + ), - + deleted_customers as ( - + select id as customer_id, deleted as deleted_at - + from source - + ) - + select * from deleted_customers ``` ```sql -- stg_jaffle_shop__customers.sql - + with - + customers as ( - + select * from {{ ref('base_jaffle_shop__customers') }} - + ), - + deleted_customers as ( - + select * from {{ ref('base_jaffle_shop__deleted_customers') }} - + ), - + join_and_mark_deleted_customers as ( - + select customers.*, case when deleted_customers.deleted_at is not null then true else false end as is_deleted - + from customers - + left join deleted_customers on customers.customer_id = deleted_customers.customer_id - + ) - + select * from join_and_mark_deleted_customers ``` - - ✅ **Unioning disparate but symmetrical sources**. A typical example here would be if you operate multiple ecommerce platforms in various territories via a SaaS platform like Shopify. You would have perfectly identical schemas, but all loaded separately into your warehouse. In this case, it’s easier to reason about our orders if *all* of our shops are unioned together, so we’d want to handle the unioning in a base model before we carry on with our usual staging model transformations on the (now complete) set — you can dig into [more detail on this use case here](https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921). -- **[Codegen](https://github.com/dbt-labs/dbt-codegen) to automate staging table generation.** It’s very good practice to learn to write staging models by hand, they’re straightforward and numerous, so they can be an excellent way to absorb the dbt style of writing SQL. Also, we’ll invariably find ourselves needing to add special elements to specific models at times — for instance, in one of the situations above that require base models — so it’s helpful to deeply understand how they work. Once that understanding is established though, because staging models are built largely following the same rote patterns and need to be built 1-to-1 for each source table in a source system, it’s preferable to start automating their creation. For this, we have the [codegen](https://github.com/dbt-labs/dbt-codegen) package. This will let you automatically generate all the source YAML and staging model boilerplate to speed up this step, and we recommend using it in every project. + - ✅ **Unioning disparate but symmetrical sources**. A typical example here would be if you operate multiple ecommerce platforms in various territories via a SaaS platform like Shopify. You would have perfectly identical schemas, but all loaded separately into your warehouse. 
In this case, it’s easier to reason about our orders if _all_ of our shops are unioned together, so we’d want to handle the unioning in a base model before we carry on with our usual staging model transformations on the (now complete) set — you can dig into [more detail on this use case here](https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921). + +- **[Codegen](https://github.com/dbt-labs/dbt-codegen) to automate staging table generation.** It’s very good practice to learn to write staging models by hand, they’re straightforward and numerous, so they can be an excellent way to absorb the dbt style of writing SQL. Also, we’ll invariably find ourselves needing to add special elements to specific models at times — for instance, in one of the situations above that require base models — so it’s helpful to deeply understand how they work. Once that understanding is established though, because staging models are built largely following the same rote patterns and need to be built 1-to-1 for each source table in a source system, it’s preferable to start automating their creation. For this, we have the [codegen](https://github.com/dbt-labs/dbt-codegen) package. This will let you automatically generate all the source YAML and staging model boilerplate to speed up this step, and we recommend using it in every project. - **Utilities folder.** While this is not in the `staging` folder, it’s useful to consider as part of our fundamental building blocks. The `models/utilities` directory is where we can keep any general purpose models that we generate from macros or based on seeds that provide tools to help us do our modeling, rather than data to model itself. The most common use case is a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source) generated with [the dbt utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). :::info Development flow versus DAG order. diff --git a/website/docs/guides/best-practices/how-we-structure/3-intermediate.md b/website/docs/best-practices/how-we-structure/3-intermediate.md similarity index 83% rename from website/docs/guides/best-practices/how-we-structure/3-intermediate.md rename to website/docs/best-practices/how-we-structure/3-intermediate.md index 5e1db61c49f..0cf44d3cccc 100644 --- a/website/docs/guides/best-practices/how-we-structure/3-intermediate.md +++ b/website/docs/best-practices/how-we-structure/3-intermediate.md @@ -12,7 +12,7 @@ Once we’ve got our atoms ready to work with, we’ll set about bringing them t Let’s take a look at the intermediate layer of our project to understand the purpose of this stage more concretely. -```markdown +```shell models/intermediate └── finance ├── _int_finance__models.yml @@ -22,10 +22,10 @@ models/intermediate - **Folders** - ✅ **Subdirectories based on business groupings.** Much like the staging layer, we’ll house this layer of models inside their own `intermediate` subfolder. Unlike the staging layer, here we shift towards being business-conformed, splitting our models up into subdirectories not by their source system, but by their area of business concern. - **File names** - - `✅ int_[entity]s_[verb]s.sql` - the variety of transformations that can happen inside of the intermediate layer makes it harder to dictate strictly how to name them. The best guiding principle is to think about *verbs* (e.g. `pivoted`, `aggregated_to_user`, `joined`, `fanned_out_by_quantity`, `funnel_created`, etc.) in the intermediate layer. 
In our example project, we use an intermediate model to pivot payments out to the order grain, so we name our model `int_payments_pivoted_to_orders`. It’s easy for anybody to quickly understand what’s happening in that model, even if they don’t know [SQL](https://mode.com/sql-tutorial/). That clarity is worth the long file name. It’s important to note that we’ve dropped the double underscores at this layer. In moving towards business-conformed concepts, we no longer need to separate a system and an entity and simply reference the unified entity if possible. In cases where you need intermediate models to operate at the source system level (e.g. `int_shopify__orders_summed`, `int_core__orders_summed` which you would later union), you’d preserve the double underscores. Some people like to separate the entity and verbs with double underscores as well. That’s a matter of preference, but in our experience, there is often an intrinsic connection between entities and verbs in this layer that make that difficult to maintain. + - `✅ int_[entity]s_[verb]s.sql` - the variety of transformations that can happen inside of the intermediate layer makes it harder to dictate strictly how to name them. The best guiding principle is to think about _verbs_ (e.g. `pivoted`, `aggregated_to_user`, `joined`, `fanned_out_by_quantity`, `funnel_created`, etc.) in the intermediate layer. In our example project, we use an intermediate model to pivot payments out to the order grain, so we name our model `int_payments_pivoted_to_orders`. It’s easy for anybody to quickly understand what’s happening in that model, even if they don’t know [SQL](https://mode.com/sql-tutorial/). That clarity is worth the long file name. It’s important to note that we’ve dropped the double underscores at this layer. In moving towards business-conformed concepts, we no longer need to separate a system and an entity and simply reference the unified entity if possible. In cases where you need intermediate models to operate at the source system level (e.g. `int_shopify__orders_summed`, `int_core__orders_summed` which you would later union), you’d preserve the double underscores. Some people like to separate the entity and verbs with double underscores as well. That’s a matter of preference, but in our experience, there is often an intrinsic connection between entities and verbs in this layer that make that difficult to maintain. :::tip Don’t over-optimize too early! -The example project is very simple for illustrative purposes. This level of division in our post-staging layers is probably unnecessary when dealing with these few models. Remember, our goal is a *single* *source of truth.* We don’t want finance and marketing operating on separate `orders` models, we want to use our dbt project as a means to bring those definitions together! As such, don’t split and optimize too early. If you have less than 10 marts models and aren’t having problems developing and using them, feel free to forego subdirectories completely (except in the staging layer, where you should always implement them as you add new source systems to your project) until the project has grown to really need them. Using dbt is always about bringing simplicity to complexity. +The example project is very simple for illustrative purposes. This level of division in our post-staging layers is probably unnecessary when dealing with these few models. 
Remember, our goal is a _single_ _source of truth._ We don’t want finance and marketing operating on separate `orders` models, we want to use our dbt project as a means to bring those definitions together! As such, don’t split and optimize too early. If you have less than 10 marts models and aren’t having problems developing and using them, feel free to forego subdirectories completely (except in the staging layer, where you should always implement them as you add new source systems to your project) until the project has grown to really need them. Using dbt is always about bringing simplicity to complexity. ::: ### Intermediate: Models @@ -36,27 +36,27 @@ Below is the lone intermediate model from our small example project. This repres -- int_payments_pivoted_to_orders.sql {%- set payment_methods = ['bank_transfer','credit_card','coupon','gift_card'] -%} - -with + +with payments as ( select * from {{ ref('stg_stripe__payments') }} ), - + pivot_and_aggregate_payments_to_order_grain as ( - + select - order_id, + order_id, {% for payment_method in payment_methods -%} - + sum( case when payment_method = '{{ payment_method }}' and - status = 'success' - then amount - else 0 + status = 'success' + then amount + else 0 end ) as {{ payment_method }}_amount, @@ -68,7 +68,7 @@ pivot_and_aggregate_payments_to_order_grain as ( group by 1 ) - + select * from pivot_and_aggregate_payments_to_order_grain ``` @@ -77,15 +77,15 @@ select * from pivot_and_aggregate_payments_to_order_grain - ✅ **Materialized as views in a custom schema with special permissions.** A more robust option is to materialize your intermediate models as views in a specific [custom schema](/docs/build/custom-schemas), outside of your main production schema. This gives you added insight into development and easier troubleshooting as the number and complexity of your models grows, while remaining easy to implement and taking up negligible space. :::tip Keep your warehouse tidy! -There are three interfaces to the organizational knowledge graph we’re encoding into dbt: the DAG, the files and folder structure of our codebase, and the output into the warehouse. As such, it’s really important that we consider that output intentionally! Think of the schemas, tables, and views we’re creating in the warehouse as *part of the UX,* in addition to the dashboards, ML, apps, and other use cases you may be targeting for the data. Ensuring that our output is named and grouped well, and that models not intended for broad use are either not materialized or built into special areas with specific permissions is crucial to achieving this. +There are three interfaces to the organizational knowledge graph we’re encoding into dbt: the DAG, the files and folder structure of our codebase, and the output into the warehouse. As such, it’s really important that we consider that output intentionally! Think of the schemas, tables, and views we’re creating in the warehouse as _part of the UX,_ in addition to the dashboards, ML, apps, and other use cases you may be targeting for the data. Ensuring that our output is named and grouped well, and that models not intended for broad use are either not materialized or built into special areas with specific permissions is crucial to achieving this. ::: - Intermediate models’ purposes, as these serve to break up complexity from our marts models, can take as many forms as [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) might require. 
Some of the most common use cases of intermediate models include: - + - ✅ **Structural simplification.** Bringing together a reasonable number (typically 4 to 6) of entities or concepts (staging models, or perhaps other intermediate models) that will be joined with another similarly purposed intermediate model to generate a mart — rather than have 10 joins in our mart, we can join two intermediate models that each house a piece of the complexity, giving us increased readability, flexibility, testing surface area, and insight into our components. - ✅ **Re-graining.** Intermediate models are often used to fan out or collapse models to the right composite grain — if we’re building a mart for `order_items` that requires us to fan out our `orders` based on the `quantity` column, creating a new single row for each item, this would be ideal to do in a specific intermediate model to maintain clarity in our mart and more easily view that our grain is correct before we mix it with other components. - ✅ **Isolating complex operations.** It’s helpful to move any particularly complex or difficult to understand pieces of logic into their own intermediate models. This not only makes them easier to refine and troubleshoot, but simplifies later models that can reference this concept in a more clearly readable way. For example, in the `quantity` fan out example above, we benefit by isolating this complex piece of logic so we can quickly debug and thoroughly test that transformation, and downstream models can reference `order_items` in a way that’s intuitively easy to grasp. :::tip Narrow the DAG, widen the tables. -Until we get to the marts layer and start building our various outputs, we ideally want our DAG to look like an arrowhead pointed right. As we move from source-conformed to business-conformed, we’re also moving from numerous, narrow, isolated concepts to fewer, wider, joined concepts. We’re bringing our components together into wider, richer concepts, and that creates this shape in our DAG. This way when we get to the marts layer we have a robust set of components that can quickly and easily be put into any configuration to answer a variety of questions and serve specific needs. One rule of thumb to ensure you’re following this pattern on an individual model level is allowing multiple *inputs* to a model, but **not** multiple *outputs*. Several arrows going *into* our post-staging models is great and expected, several arrows coming *out* is a red flag. There are absolutely situations where you need to break this rule, but it’s something to be aware of, careful about, and avoid when possible. +Until we get to the marts layer and start building our various outputs, we ideally want our DAG to look like an arrowhead pointed right. As we move from source-conformed to business-conformed, we’re also moving from numerous, narrow, isolated concepts to fewer, wider, joined concepts. We’re bringing our components together into wider, richer concepts, and that creates this shape in our DAG. This way when we get to the marts layer we have a robust set of components that can quickly and easily be put into any configuration to answer a variety of questions and serve specific needs. One rule of thumb to ensure you’re following this pattern on an individual model level is allowing multiple _inputs_ to a model, but **not** multiple _outputs_. Several arrows going _into_ our post-staging models is great and expected, several arrows coming _out_ is a red flag. 
There are absolutely situations where you need to break this rule, but it’s something to be aware of, careful about, and avoid when possible. ::: diff --git a/website/docs/guides/best-practices/how-we-structure/4-marts.md b/website/docs/best-practices/how-we-structure/4-marts.md similarity index 88% rename from website/docs/guides/best-practices/how-we-structure/4-marts.md rename to website/docs/best-practices/how-we-structure/4-marts.md index 0e22d036e58..e7a0d35c342 100644 --- a/website/docs/guides/best-practices/how-we-structure/4-marts.md +++ b/website/docs/best-practices/how-we-structure/4-marts.md @@ -3,13 +3,17 @@ title: "Marts: Business-defined entities" id: "4-marts" --- -This is the layer where everything comes together and we start to arrange all of our atoms (staging models) and molecules (intermediate models) into full-fledged cells that have identity and purpose. We sometimes like to call this the *entity* *layer* or *concept layer*, to emphasize that all our marts are meant to represent a specific entity or concept at its unique grain. For instance, an order, a customer, a territory, a click event, a payment — each of these would be represented with a distinct mart, and each row would represent a discrete instance of these concepts. Unlike in a traditional Kimball star schema though, in modern data warehousing — where storage is cheap and compute is expensive — we’ll happily borrow and add any and all data from other concepts that are relevant to answering questions about the mart’s core entity. Building the same data in multiple places, as we do with `orders` in our `customers` mart example below, is more efficient in this paradigm than repeatedly rejoining these concepts (this is a basic definition of denormalization in this context). Let’s take a look at how we approach this first layer intended expressly for exposure to end users. +:::info +Our guidance here diverges if you use the dbt Semantic Layer. In a project without the Semantic Layer we recommend you denormalize heavily, per the best practices below. On the other hand, if you're using the Semantic Layer, we want to stay as normalized as possible to allow MetricFlow the most flexibility. Guidance for marts in a Semantic Layer context is on the next page. +::: + +This is the layer where everything comes together and we start to arrange all of our atoms (staging models) and molecules (intermediate models) into full-fledged cells that have identity and purpose. We sometimes like to call this the _entity_ _layer_ or _concept layer_, to emphasize that all our marts are meant to represent a specific entity or concept at its unique grain. For instance, an order, a customer, a territory, a click event, a payment — each of these would be represented with a distinct mart, and each row would represent a discrete instance of these concepts. Unlike in a traditional Kimball star schema though, in modern data warehousing — where storage is cheap and compute is expensive — we’ll happily borrow and add any and all data from other concepts that are relevant to answering questions about the mart’s core entity. Building the same data in multiple places, as we do with `orders` in our `customers` mart example below, is more efficient in this paradigm than repeatedly rejoining these concepts (this is a basic definition of denormalization in this context). Let’s take a look at how we approach this first layer intended expressly for exposure to end users. 
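Before digging into files and folders, here is a compressed, illustrative sketch of that denormalization pattern. It is not the project's actual `customers.sql` (the full version lives in the example repo); it assumes a `stg_jaffle_shop__customers` staging model and an `orders` mart that carries hypothetical `customer_id`, `ordered_at`, and `order_total` columns.

```sql
-- Illustrative sketch only, not the project's full customers.sql.
-- Assumes an orders mart with customer_id, ordered_at, and order_total columns.

with

customers as (

    select * from {{ ref('stg_jaffle_shop__customers') }}

),

orders as (

    select * from {{ ref('orders') }}

),

customer_orders as (

    select
        customer_id,

        min(ordered_at) as first_ordered_at,
        max(ordered_at) as last_ordered_at,
        count(*) as count_lifetime_orders,
        sum(order_total) as lifetime_spend

    from orders

    group by 1

),

customers_and_customer_orders_joined as (

    select
        customers.*,
        customer_orders.first_ordered_at,
        customer_orders.last_ordered_at,
        customer_orders.count_lifetime_orders,
        customer_orders.lifetime_spend

    from customers

    left join customer_orders
        on customers.customer_id = customer_orders.customer_id

)

select * from customers_and_customer_orders_joined
```

Each row stays at the customer grain; the order data is simply aggregated up and attached, rather than re-joined in every downstream query.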
### Marts: Files and folders The last layer of our core transformations is below, providing models for both `finance` and `marketing` departments. -```markdown +```shell models/marts ├── finance │ ├── _finance__models.yml @@ -24,7 +28,7 @@ models/marts ✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart `customers`, `orders`. Note that for pure marts, there should not be a time dimension (`orders_per_day`) here, that is typically best captured via metrics. -❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as *separate* concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. +❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as _separate_ concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. ### Marts: Models @@ -33,7 +37,7 @@ Finally we’ll take a look at the best practices for models within the marts di ```sql -- orders.sql -with +with orders as ( @@ -68,7 +72,7 @@ select * from orders_and_payments_joined ```sql -- customers.sql -with +with customers as ( @@ -117,21 +121,15 @@ customers_and_customer_orders_joined as ( select * from customers_and_customer_orders_joined ``` -- ✅ **Materialized as tables or incremental models.** Once we reach the marts layer, it’s time to start building not just our logic into the warehouse, but the data itself. This gives end users much faster performance for these later models that are actually designed for their use, and saves us costs recomputing these entire chains of models every time somebody refreshes a dashboard or runs a regression in python. A good general rule of thumb regarding materialization is to always start with a view (as it takes up essentially no storage and always gives you up-to-date results), once that view takes too long to practically *query*, build it into a table, and finally once that table takes too long to *build* and is slowing down your runs, [configure it as an incremental model](https://docs.getdbt.com/docs/build/incremental-models/). As always, start simple and only add complexity as necessary. The models with the most data and compute-intensive transformations should absolutely take advantage of dbt’s excellent incremental materialization options, but rushing to make all your marts models incremental by default will introduce superfluous difficulty. We recommend reading this [classic post from Tristan on the limits of incremental modeling](https://discourse.getdbt.com/t/on-the-limits-of-incrementality/303). 
+- ✅ **Materialized as tables or incremental models.** Once we reach the marts layer, it’s time to start building not just our logic into the warehouse, but the data itself. This gives end users much faster performance for these later models that are actually designed for their use, and saves us costs recomputing these entire chains of models every time somebody refreshes a dashboard or runs a regression in Python. A good general rule of thumb regarding materialization is to always start with a view (as it takes up essentially no storage and always gives you up-to-date results); once that view takes too long to practically _query_, build it into a table; and finally, once that table takes too long to _build_ and is slowing down your runs, [configure it as an incremental model](https://docs.getdbt.com/docs/build/incremental-models/). As always, start simple and only add complexity as necessary. The models with the most data and compute-intensive transformations should absolutely take advantage of dbt’s excellent incremental materialization options, but rushing to make all your marts models incremental by default will introduce superfluous difficulty. We recommend reading this [classic post from Tristan on the limits of incremental modeling](https://discourse.getdbt.com/t/on-the-limits-of-incrementality/303). - ✅ **Wide and denormalized.** Unlike old school warehousing, in the modern data stack storage is cheap and compute is expensive, so we prioritize compute by packing data into very wide, denormalized concepts that can provide everything somebody needs to know about a given entity. - ❌ **Too many joins in one mart.** One good rule of thumb when building dbt transformations is to avoid bringing together too many concepts in a single mart. What constitutes ‘too many’ can vary. If you need to bring 8 staging models together with nothing but simple joins, that might be fine. Conversely, if you have 4 concepts you’re weaving together with some complex and computationally heavy window functions, that could be too much. You need to weigh the number of models you’re joining against the complexity of the logic within the mart, and if it’s too much to read through and build a clear mental model of, then look to modularize. While this isn’t a hard rule, if you’re bringing together more than 4 or 5 concepts to create your mart, you may benefit from adding some intermediate models for added clarity. Two intermediate models that bring together three concepts each, and a mart that brings together those two intermediate models, will typically result in a much more readable chain of logic than a single mart with six joins. - ✅ **Build on separate marts thoughtfully.** While we strive to preserve a narrowing DAG up to the marts layer, once here things may start to get a little less strict. A common example is passing information between marts at different grains, as we saw above, where we bring our `orders` mart into our `customers` mart to aggregate critical order data into a `customer` grain. Now that we’re really ‘spending’ compute and storage by actually building the data in our outputs, it’s sensible to leverage previously built resources to speed up and save costs on outputs that require similar data, versus recomputing the same views and CTEs from scratch.
The right approach here is heavily dependent on your unique DAG, models, and goals — it’s just important to note that using a mart in building another, later mart is okay, but requires careful consideration to avoid wasted resources or circular dependencies. :::tip Marts are entity-grained. -The most important aspect of marts is that they contain all of the useful data about a *particular entity* at a granular level. That doesn’t mean we don’t bring in lots of other entities and concepts, like tons of `user` data into our `orders` mart, we do! It just means that individual `orders` remain the core grain of our table. If we start grouping `users` and `orders` along a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source), into something like `user_orders_per_day`, we’re moving past marts into *metrics*. +The most important aspect of marts is that they contain all of the useful data about a _particular entity_ at a granular level. That doesn’t mean we don’t bring in lots of other entities and concepts, like tons of `user` data into our `orders` mart, we do! It just means that individual `orders` remain the core grain of our table. If we start grouping `users` and `orders` along a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source), into something like `user_orders_per_day`, we’re moving past marts into _metrics_. ::: ### Marts: Other considerations - **Troubleshoot via tables.** While stacking views and ephemeral models up until our marts — only building data into the warehouse at the end of a chain when we have the models we really want end users to work with — is ideal in production, it can present some difficulties in development. Particularly, certain errors may seem to be surfacing in our later models that actually stem from much earlier dependencies in our model chain (ancestor models in our DAG that are built before the model throws the errors). If you’re having trouble pinning down where or what a database error is telling you, it can be helpful to temporarily build a specific chain of models as tables so that the warehouse will throw the error where it’s actually occurring. -- **After marts: the activation layer.** In the same way that our staging models are building blocks for our marts, that also offer us direct views into specific source data, our marts are building blocks for our final outputs that also offer direct views into specific ideas. You can use marts directly, but they are equally important as components for building models in the *activation layer* after marts. This is a deep and fast-evolving topic, so we’ll cover this in a separate forthcoming guide that dives into: - - Metrics - - Reverse ETL - - Reporting and dashboards - - Data science and ML - - [Exposures](https://docs.getdbt.com/docs/build/exposures) (how we tie our dbt DAG into all of the above) diff --git a/website/docs/best-practices/how-we-structure/5-semantic-layer-marts.md b/website/docs/best-practices/how-we-structure/5-semantic-layer-marts.md new file mode 100644 index 00000000000..62e07a72e36 --- /dev/null +++ b/website/docs/best-practices/how-we-structure/5-semantic-layer-marts.md @@ -0,0 +1,48 @@ +--- +title: "Marts for the Semantic Layer" +id: "5-semantic-layer-marts" +--- + +The Semantic Layer alters some fundamental principles of how you organize your project. Using dbt without the Semantic Layer necessitates creating the most useful combinations of your building block components into wide, denormalized marts. 
On the other hand, the Semantic Layer leverages MetricFlow to denormalize every possible combination of components we've encoded dynamically. As such, we're better served to bring more normalized models through from the logical layer into the Semantic Layer to maximize flexibility. This section will assume familiarity with the best practices laid out in the [How we build our metrics](/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) guide, so check that out first for a more hands-on introduction to the Semantic Layer. + +## Semantic Layer: Files and folders + +- 2️⃣ There are two major factors that alter our recommendations for the Semantic Layer: + - 📝 There is **more YAML** in the form of **semantic models and metrics**. + - ⏫ We may **use a staging model directly** if it forms a complete normalized component, in which case it will not have a mart at all. +- 💪 This combination means models at **both the staging and marts layer** may participate in the Semantic Layer and use **more powerful, expansive YAML configuration**. +- 🔁 Given this, for projects using the Semantic Layer we recommend a **YAML-file-per-model approach**, as below. + +```shell +models +├── marts +│   ├── customers.sql +│   ├── customers.yml +│   ├── orders.sql +│   └── orders.yml +└── staging + ├── __sources.yml + ├── stg_customers.sql + ├── stg_customers.yml + ├── stg_locations.sql + ├── stg_locations.yml + ├── stg_order_items.sql + ├── stg_order_items.yml + ├── stg_orders.sql + ├── stg_orders.yml + ├── stg_products.sql + ├── stg_products.yml + ├── stg_supplies.sql + └── stg_supplies.yml +``` + +## When to make a mart + +- ❓ If we can go directly to staging models and it's better to serve normalized models to the Semantic Layer, then when, where, and why would we make a mart? + - 🕰️ We have models that have measures but no time dimension to aggregate against. The details of this are laid out in the [Semantic Layer guide](/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) but in short, we need a time dimension to aggregate against in MetricFlow. Dimensional tables that carry measures but no time dimension of their own are a common example. + - 🧱 We want to **materialize** our model in various ways. + - 👯 We want to **version** our model. + - 🛒 We have various related models that make more sense as **one wider component**. + - 1️⃣ We have similar models across multiple data sources that make more sense **unioned together**. + - ⌚ We have models in our project we **need time to refactor** but want to serve up to the Semantic Layer quickly. +- 🌍 Any of the above and more are great reasons to build a mart. Analytics engineering is about **creativity and problem solving**, so these are not prescriptive rules; **there are many reasons to build marts** in any project. The most important takeaway is that you don't **_have to_** if you're using the Semantic Layer.
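To make the YAML-file-per-model pattern concrete, below is a minimal sketch of what an `orders.yml` paired with `orders.sql` might contain. The entity, dimension, and measure names (an `order_id` primary key, a `customer_id` foreign key, an `ordered_at` timestamp, and an `order_total` column) are illustrative assumptions, not a prescription for your project.

```yml
# in models/marts/orders.yml (illustrative sketch; column names are assumptions)

semantic_models:
  - name: orders
    description: One row per order, with its customer and total amount.
    model: ref('orders')
    defaults:
      agg_time_dimension: ordered_at
    entities:
      - name: order_id
        type: primary
      - name: customer_id
        type: foreign
    dimensions:
      - name: ordered_at
        type: time
        type_params:
          time_granularity: day
    measures:
      - name: order_total
        agg: sum

metrics:
  - name: order_total
    label: Order Total
    type: simple
    type_params:
      measure: order_total
```

Because the semantic model and its metrics sit alongside the model they describe, fuzzy-finding `orders` surfaces the SQL, the configuration, and the Semantic Layer definitions together.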
diff --git a/website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md b/website/docs/best-practices/how-we-structure/6-the-rest-of-the-project.md similarity index 93% rename from website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md rename to website/docs/best-practices/how-we-structure/6-the-rest-of-the-project.md index 2a6c7399adb..4082f92b932 100644 --- a/website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md +++ b/website/docs/best-practices/how-we-structure/6-the-rest-of-the-project.md @@ -1,6 +1,6 @@ --- title: "The rest of the project" -id: "5-the-rest-of-the-project" +id: "6-the-rest-of-the-project" description: The rest of the project. displayText: The rest of the project. hoverSnippet: The rest of the project. @@ -10,7 +10,7 @@ hoverSnippet: The rest of the project. So far we’ve focused on the `models` folder, the primary directory of our dbt project. Next, we’ll zoom out and look at how the rest of our project files and folders fit in with this structure, starting with how we approach YAML configuration files. -```markdown +```shell models ├── intermediate │ └── finance @@ -51,7 +51,7 @@ When structuring your YAML configuration files in a dbt project, you want to bal - YAML files don’t need unique names in the way that SQL model files do, but including the directory (instead of simply `_sources.yml` in each folder), means you can fuzzy find the right file more quickly. - We’ve recommended several different naming conventions over the years, most recently calling these `schema.yml` files. We’ve simplified to recommend that these simply be labelled based on the YAML dictionary that they contain. - If you utilize [doc blocks](https://docs.getdbt.com/docs/collaborate/documentation#using-docs-blocks) in your project, we recommend following the same pattern, and creating a `_[directory]__docs.md` markdown file per directory containing all your doc blocks for that folder of models. -- ❌ **Config per project.** Some people put *all* of their source and model YAML into one file. While you can technically do this, and while it certainly simplifies knowing what file the config you’re looking for will be in (as there is only one file), it makes it much harder to find specific configurations within that file. We recommend balancing those two concerns. +- ❌ **Config per project.** Some people put _all_ of their source and model YAML into one file. While you can technically do this, and while it certainly simplifies knowing what file the config you’re looking for will be in (as there is only one file), it makes it much harder to find specific configurations within that file. We recommend balancing those two concerns. - ⚠️ **Config per model.** On the other end of the spectrum, some people prefer to create one YAML file per model. This presents less of an issue than a single monolith file, as you can quickly search for files, know exactly where specific configurations exist, spot models without configs (and thus without tests) by looking at the file tree, and various other advantages. In our opinion, the extra files, tabs, and windows this requires creating, copying from, pasting to, closing, opening, and managing creates a somewhat slower development experience that outweighs the benefits. Defining config per directory is the most balanced approach for most projects, but if you have compelling reasons to use config per model, there are definitely some great projects that follow this paradigm. 
- ✅ **Cascade configs.** Leverage your `dbt_project.yml` to set default configurations at the directory level. Use the well-organized folder structure we’ve created thus far to define the baseline schemas and materializations, and use dbt’s cascading scope priority to define variations to this. For example, as below, define your marts to be materialized as tables by default, define separate schemas for our separate subfolders, and any models that need to use incremental materialization can be defined at the model level.
@@ -73,12 +73,12 @@ models:
```

:::tip Define your defaults.
-One of the many benefits this consistent approach to project structure confers to us is this ability to cascade default behavior. Carefully organizing our folders and defining configuration at that level whenever possible frees us from configuring things like schema and materialization in every single model (not very DRY!) — we only need to configure exceptions to our general rules. Tagging is another area this principle comes into play. Many people new to dbt will rely on tags rather than a rigorous folder structure, and quickly find themselves in a place where every model *requires* a tag. This creates unnecessary complexity. We want to lean on our folders as our primary selectors and grouping mechanism, and use tags to define groups that are *exceptions.* A folder-based selection like **`dbt build --select marts.marketing` is much simpler than trying to tag every marketing-related model, hoping all developers remember to add that tag for new models, and using `dbt build --select tag:marketing`.
+One of the many benefits this consistent approach to project structure confers to us is this ability to cascade default behavior. Carefully organizing our folders and defining configuration at that level whenever possible frees us from configuring things like schema and materialization in every single model (not very DRY!) — we only need to configure exceptions to our general rules. Tagging is another area where this principle comes into play. Many people new to dbt will rely on tags rather than a rigorous folder structure, and quickly find themselves in a place where every model _requires_ a tag. This creates unnecessary complexity. We want to lean on our folders as our primary selectors and grouping mechanism, and use tags to define groups that are _exceptions._ A folder-based selection like `dbt build --select marts.marketing` is much simpler than trying to tag every marketing-related model, hoping all developers remember to add that tag for new models, and using `dbt build --select tag:marketing`.
:::

### How we use the other folders

-```yaml
+```shell
jaffle_shop
├── analyses
├── seeds
│   └── employees.csv
├── macros
│   └── cents_to_dollars.sql
├── snapshots
└── tests
-    └── assert_positive_value_for_total_amount.sql
+    └── assert_positive_value_for_total_amount.sql
```

We’ve focused heavily thus far on the primary area of action in our dbt project, the `models` folder. As you’ve probably observed though, there are several other folders in our project. While these are, by design, very flexible to your needs, we’ll discuss the most common use cases for these other folders to help get you started.
@@ -111,6 +111,6 @@ One important, growing consideration in the analytics engineering ecosystem is h

## Final considerations

-Overall, consistency is more important than any of these specific conventions.
As your project grows and your experience with dbt deepens, you will undoubtedly find aspects of the above structure you want to change. While we recommend this approach for the majority of projects, every organization is unique! The only dogmatic advice we’ll put forward here is that when you find aspects of the above structure you wish to change, think intently about your reasoning and document for your team *how* and *why* you are deviating from these conventions. To that end, we highly encourage you to fork this guide and add it to your project’s README, wiki, or docs so you can quickly create and customize those artifacts. +Overall, consistency is more important than any of these specific conventions. As your project grows and your experience with dbt deepens, you will undoubtedly find aspects of the above structure you want to change. While we recommend this approach for the majority of projects, every organization is unique! The only dogmatic advice we’ll put forward here is that when you find aspects of the above structure you wish to change, think intently about your reasoning and document for your team _how_ and _why_ you are deviating from these conventions. To that end, we highly encourage you to fork this guide and add it to your project’s README, wiki, or docs so you can quickly create and customize those artifacts. Finally, we emphasize that this guide is a living document! It will certainly change and grow as dbt and dbt Labs evolve. We invite you to join in — discuss, comment, and contribute regarding suggested changes or new elements to cover. diff --git a/website/docs/guides/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md b/website/docs/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md similarity index 100% rename from website/docs/guides/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md rename to website/docs/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md diff --git a/website/docs/guides/best-practices/how-we-style/1-how-we-style-our-dbt-models.md b/website/docs/best-practices/how-we-style/1-how-we-style-our-dbt-models.md similarity index 100% rename from website/docs/guides/best-practices/how-we-style/1-how-we-style-our-dbt-models.md rename to website/docs/best-practices/how-we-style/1-how-we-style-our-dbt-models.md diff --git a/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md b/website/docs/best-practices/how-we-style/2-how-we-style-our-sql.md similarity index 89% rename from website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md rename to website/docs/best-practices/how-we-style/2-how-we-style-our-sql.md index 1ea9c064d74..8c61e63b888 100644 --- a/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md +++ b/website/docs/best-practices/how-we-style/2-how-we-style-our-sql.md @@ -6,7 +6,10 @@ id: 2-how-we-style-our-sql ## Basics - ☁️ Use [SQLFluff](https://sqlfluff.com/) to maintain these style rules automatically. - - Reference this [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) for the rules we use. + - Customize `.sqlfluff` configuration files to your needs. + - Refer to our [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) for the rules we use in our own projects. + + - Exclude files and directories by using a standard `.sqlfluffignore` file. 
Learn more about the syntax in the [.sqlfluffignore syntax docs](https://docs.sqlfluff.com/en/stable/configuration.html#id2). - 👻 Use Jinja comments (`{# #}`) for comments that should not be included in the compiled SQL. - ⏭️ Use trailing commas. - 4️⃣ Indents should be four spaces. @@ -22,7 +25,7 @@ id: 2-how-we-style-our-sql - 🔙 Fields should be stated before aggregates and window functions. - 🤏🏻 Aggregations should be executed as early as possible (on the smallest data set possible) before joining to another table to improve performance. -- 🔢 Ordering and grouping by a number (eg. group by 1, 2) is preferred over listing the column names (see [this classic rant](https://blog.getdbt.com/write-better-sql-a-defense-of-group-by-1/) for why). Note that if you are grouping by more than a few columns, it may be worth revisiting your model design. +- 🔢 Ordering and grouping by a number (eg. group by 1, 2) is preferred over listing the column names (see [this classic rant](https://www.getdbt.com/blog/write-better-sql-a-defense-of-group-by-1) for why). Note that if you are grouping by more than a few columns, it may be worth revisiting your model design. ## Joins diff --git a/website/docs/guides/best-practices/how-we-style/3-how-we-style-our-python.md b/website/docs/best-practices/how-we-style/3-how-we-style-our-python.md similarity index 100% rename from website/docs/guides/best-practices/how-we-style/3-how-we-style-our-python.md rename to website/docs/best-practices/how-we-style/3-how-we-style-our-python.md diff --git a/website/docs/guides/best-practices/how-we-style/4-how-we-style-our-jinja.md b/website/docs/best-practices/how-we-style/4-how-we-style-our-jinja.md similarity index 100% rename from website/docs/guides/best-practices/how-we-style/4-how-we-style-our-jinja.md rename to website/docs/best-practices/how-we-style/4-how-we-style-our-jinja.md diff --git a/website/docs/guides/best-practices/how-we-style/5-how-we-style-our-yaml.md b/website/docs/best-practices/how-we-style/5-how-we-style-our-yaml.md similarity index 100% rename from website/docs/guides/best-practices/how-we-style/5-how-we-style-our-yaml.md rename to website/docs/best-practices/how-we-style/5-how-we-style-our-yaml.md diff --git a/website/docs/best-practices/how-we-style/6-how-we-style-conclusion.md b/website/docs/best-practices/how-we-style/6-how-we-style-conclusion.md new file mode 100644 index 00000000000..24103861b97 --- /dev/null +++ b/website/docs/best-practices/how-we-style/6-how-we-style-conclusion.md @@ -0,0 +1,107 @@ +--- +title: Now it's your turn +id: 6-how-we-style-conclusion +--- + +## BYO Styles + +Now that you've seen how we style our dbt projects, it's time to build your own. Feel free to copy this guide and use it as a template for your own project. If you do, we'd love to hear about it! Reach out to us on [the Community Forum](https://discourse.getdbt.com/c/show-and-tell/22) or [Slack](https://www.getdbt.com/community) to share your style guide. We recommend co-locating your style guide with your code to make sure contributors can easily follow it. If you're using GitHub, you can add your style guide to your repository's wiki, or include it in your README. + +## Pre-commit hooks + +Lastly, to ensure your style guide's automated rules are being followed without additional mental overhead to your team, you can use [pre-commit hooks](https://pre-commit.com/) to automatically check your code for style violations (and often fix them automagically) before it's committed. 
This is a great way to make sure your style guide is followed by all contributors. We recommend implementing this once you’ve settled on and published your style guide, and your codebase is conforming to it. This will ensure that all future commits follow the style guide. You can find an excellent set of open source pre-commit hooks for dbt from the community [here in the dbt-checkpoint project](https://github.com/dbt-checkpoint/dbt-checkpoint).
+
+## Style guide template
+
+```markdown
+# dbt Example Style Guide
+
+## SQL Style
+
+- Use lowercase keywords.
+- Use trailing commas.
+
+## Model Organization
+
+Our models (typically) fit into two main categories:
+
+- Staging — Contains models that clean and standardize data.
+- Marts — Contains models which combine or heavily transform data.
+
+Things to note:
+
+- There are different types of models that typically exist in each of the above categories. See [Model Layers](#model-layers) for more information.
+- Read [How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) for an example and more details around organization.
+
+## Model Layers
+
+- Only models in `staging` should select from [sources](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources).
+- Models not in the `staging` folder should select from [refs](https://docs.getdbt.com/reference/dbt-jinja-functions/ref).
+
+## Model File Naming and Coding
+
+- All objects should be plural.
+  Example: `stg_stripe__invoices.sql` vs. `stg_stripe__invoice.sql`
+
+- All models should use the naming convention `<type>_<source>__<additional_context>`. See [this article](https://docs.getdbt.com/blog/stakeholder-friendly-model-names) for more information.
+
+  - Models in the **staging** folder should use the source’s name as the `<source>` and the entity name as the `additional_context`.
+
+    Examples:
+
+    - seed_snowflake_spend.csv
+    - base_stripe__invoices.sql
+    - stg_stripe__customers.sql
+    - stg_salesforce__customers.sql
+    - int_customers__unioned.sql
+    - fct_orders.sql
+
+- Schema, table, and column names should be in `snake_case`.
+
+- Limit the use of abbreviations that are related to domain knowledge. An onboarding employee will understand `current_order_status` better than `current_os`.
+
+- Use names based on the _business_ rather than the source terminology.
+
+- Each model should have a primary key to identify the unique row and should be named `<object>_id`. For example, `account_id`. This makes it easier to know what `id` is referenced in downstream joined models.
+
+- For `base` or `staging` models, columns should be ordered in categories, where identifiers are first and date/time fields are at the end.
+- Date/time columns should be named according to these conventions:
+
+  - Timestamps: `<event>_at`
+    Format: UTC
+    Example: `created_at`
+
+  - Dates: `<event>_date`
+    Format: Date
+    Example: `created_date`
+
+- Booleans should be prefixed with `is_` or `has_`.
+  Example: `is_active_customer` and `has_admin_access`
+
+- Price/revenue fields should be in decimal currency (for example, `19.99` for $19.99; many app databases store prices as integers in cents). If a non-decimal currency is used, indicate this with suffixes. For example, `price_in_cents`.
+
+- Avoid using reserved words (such as [these](https://docs.snowflake.com/en/sql-reference/reserved-keywords.html) for Snowflake) as column names.
+
+- Consistency is key! Use the same field names across models where possible. For example, a key to the `customers` table should be named `customer_id` rather than `user_id`.
+
+## Model Configurations
+
+- Model configurations at the [folder level](https://docs.getdbt.com/reference/model-configs#configuring-directories-of-models-in-dbt_projectyml) should be considered (and if applicable, applied) first.
+- More specific configurations should be applied at the model level [using one of these methods](https://docs.getdbt.com/reference/model-configs#apply-configurations-to-one-model-only).
+- Models within the `marts` folder should be materialized as `table` or `incremental`.
+  - By default, `marts` should be materialized as `table` within `dbt_project.yml`.
+  - If switching to `incremental`, this should be specified in the model's configuration.
+
+## Testing
+
+- At a minimum, `unique` and `not_null` tests should be applied to the expected primary key of each model.
+
+## CTEs
+
+For more information about why we use so many CTEs, read [this glossary entry](https://docs.getdbt.com/terms/cte).
+
+- Where performance permits, CTEs should perform a single, logical unit of work.
+- CTE names should be as verbose as needed to convey what they do.
+- CTEs with confusing or notable logic should be commented with SQL comments, as you would with any complex function, and the comments should be placed above the CTE.
+- CTEs duplicated across models should be pulled out and created as their own models.
+```
diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-1-guide-overview.md b/website/docs/best-practices/materializations/materializations-guide-1-guide-overview.md
similarity index 89%
rename from website/docs/guides/best-practices/materializations/materializations-guide-1-guide-overview.md
rename to website/docs/best-practices/materializations/materializations-guide-1-guide-overview.md
index 209041b1df5..248b4c4749b 100644
--- a/website/docs/guides/best-practices/materializations/materializations-guide-1-guide-overview.md
+++ b/website/docs/best-practices/materializations/materializations-guide-1-guide-overview.md
@@ -26,9 +26,9 @@ By the end of this guide you should have a solid understanding of:

### Prerequisites

-- 📒 You’ll want to have worked through the [quickstart guide](/quickstarts) and have a project setup to work through these concepts.
+- 📒 You’ll want to have worked through the [quickstart guide](/guides) and have a project setup to work through these concepts.
- 🏃🏻‍♀️ Concepts like dbt runs, `ref()` statements, and models should be familiar to you.
-- 🔧 [**Optional**] Reading through the [How we structure our dbt projects](guides/best-practices/how-we-structure/1-guide-overview) Guide will be beneficial for the last section of this guide, when we review best practices for materializations using the dbt project approach of staging models and marts.
+- 🔧 [**Optional**] Reading through the [How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) Guide will be beneficial for the last section of this guide, when we review best practices for materializations using the dbt project approach of staging models and marts.
### Guiding principle diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-2-available-materializations.md b/website/docs/best-practices/materializations/materializations-guide-2-available-materializations.md similarity index 98% rename from website/docs/guides/best-practices/materializations/materializations-guide-2-available-materializations.md rename to website/docs/best-practices/materializations/materializations-guide-2-available-materializations.md index 54110b46385..9910e5f8269 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-2-available-materializations.md +++ b/website/docs/best-practices/materializations/materializations-guide-2-available-materializations.md @@ -19,7 +19,7 @@ Views and tables and incremental models, oh my! In this section we’ll start ge **Views and Tables are the two basic categories** of object that we can create across warehouses. They exist natively as types of objects in the warehouse, as you can see from this screenshot of Snowflake (depending on your warehouse the interface will look a little different). **Incremental models** and other materializations types are a little bit different. They tell dbt to **construct tables in a special way**. -![Tables and views in the browser on Snowflake.](/img/guides/best-practices/materializations/tables-and-views.png) +![Tables and views in the browser on Snowflake.](/img/best-practices/materializations/tables-and-views.png) ### Views diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md b/website/docs/best-practices/materializations/materializations-guide-3-configuring-materializations.md similarity index 95% rename from website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md rename to website/docs/best-practices/materializations/materializations-guide-3-configuring-materializations.md index 2f6c04bd35d..54f4443b600 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md +++ b/website/docs/best-practices/materializations/materializations-guide-3-configuring-materializations.md @@ -53,7 +53,7 @@ def model(dbt, session): :::info -🐍 **Not all adapters support python yet**, check the [docs here to be sure](docs/build/python-models#specific-data-platforms) before spending time writing python models. +🐍 **Not all adapters support python yet**, check the [docs here to be sure](/docs/build/python-models#specific-data-platforms) before spending time writing python models. ::: - Configuring a model to materialize as a `table` is simple, and the same as a `view` for both SQL and python models. 
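As a side note on the configuration discussed above: the same materialization choice can also be set at the directory level rather than model by model. The snippet below is a minimal `dbt_project.yml` sketch, assuming the `jaffle_shop` project name and the staging/marts folder layout used throughout these guides; a model-level `config()` block still takes precedence over these defaults.

```yaml
# dbt_project.yml (sketch): directory-level materialization defaults.
models:
  jaffle_shop:
    staging:
      +materialized: view   # staging models stay as lightweight views
    marts:
      +materialized: table  # marts are built as tables by default
```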
diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md b/website/docs/best-practices/materializations/materializations-guide-4-incremental-models.md similarity index 97% rename from website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md rename to website/docs/best-practices/materializations/materializations-guide-4-incremental-models.md index c1a4cb3eb0e..71b24ef58f2 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md +++ b/website/docs/best-practices/materializations/materializations-guide-4-incremental-models.md @@ -29,7 +29,7 @@ We did our last `dbt build` job on `2022-01-31`, so any new orders since that ru - 🏔️ build the table from the **beginning of time again — a _table materialization_** - Simple and solid, if we can afford to do it (in terms of time, compute, and money — which are all directly correlated in a cloud warehouse). It’s the easiest and most accurate option. - 🤏 find a way to run **just new and updated rows since our previous run — _an_ _incremental materialization_** - - If we _can’t_ realistically afford to run the whole table — due to complex transformations or big source data, it takes too long — then we want to build incrementally. We want to just transform and add the row with id 567 below, _not_ the previous two with ids 123 and 456 that are already in the table. + - If we _can’t_ realistically afford to run the whole table — due to complex transformations or big source data, it takes too long — then we want to build incrementally. We want to just transform and add the row with id 567 below, _not_ the previous two with ids 123 and 234 that are already in the table. | order_id | order_status | customer_id | order_item_id | ordered_at | updated_at | | -------- | ------------ | ----------- | ------------- | ---------- | ---------- | @@ -76,7 +76,7 @@ So we’ve found a way to isolate the new rows we need to process. How then do w - 🌍  Lastly, if we’re building into a new environment and there’s **no previous run to reference**, or we need to **build the model from scratch.** Put another way, we’ll want a means to skip the incremental logic and transform all of our input data like a regular table if needed. - 😎 **Visualized below**, we’ve figured out how to get the red ‘new records’ portion selected, but we need to sort out the step to the right, where we stick those on to our model. -![Diagram visualizing how incremental models work](/img/guides/best-practices/materializations/incremental-diagram.png) +![Diagram visualizing how incremental models work](/img/best-practices/materializations/incremental-diagram.png) :::info 😌 Incremental models can be confusing at first, **take your time reviewing** this visual and the previous steps until you have a **clear mental model.** Be patient with yourself. This materialization will become second nature soon, but it’s tough at first. If you’re feeling confused the [dbt Community is here for you on the Forum and Slack](community/join). @@ -115,7 +115,7 @@ So we’re going to use an **if statement** to apply our cutoff filter **only wh Thankfully, we don’t have to dig into the guts of dbt to sort out each of these conditions individually. -- ⚙️  dbt provides us with a **macro [`is_incremental`](docs/build/incremental-models#understanding-the-is_incremental-macro)** that checks all of these conditions for this exact use case. 
+- ⚙️  dbt provides us with a **macro [`is_incremental`](/docs/build/incremental-models#understanding-the-is_incremental-macro)** that checks all of these conditions for this exact use case. - 🔀  By **wrapping our cutoff logic** in this macro, it will only get applied when the macro returns true for all of the above conditions. Let’s take a look at all these pieces together: diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-5-best-practices.md b/website/docs/best-practices/materializations/materializations-guide-5-best-practices.md similarity index 98% rename from website/docs/guides/best-practices/materializations/materializations-guide-5-best-practices.md rename to website/docs/best-practices/materializations/materializations-guide-5-best-practices.md index a2cb22d5755..268a326eed0 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-5-best-practices.md +++ b/website/docs/best-practices/materializations/materializations-guide-5-best-practices.md @@ -58,7 +58,7 @@ models: As we’ve learned, views store only the logic of the transformation in the warehouse, so our runs take only a couple seconds per model (or less). What happens when we go to query the data though? -![Long query time from Snowflake](/img/guides/best-practices/materializations/snowflake-query-timing.png) +![Long query time from Snowflake](/img/best-practices/materializations/snowflake-query-timing.png) Our marts are slow to query! diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-6-examining-builds.md b/website/docs/best-practices/materializations/materializations-guide-6-examining-builds.md similarity index 86% rename from website/docs/guides/best-practices/materializations/materializations-guide-6-examining-builds.md rename to website/docs/best-practices/materializations/materializations-guide-6-examining-builds.md index 07811b42594..0b18518d0bd 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-6-examining-builds.md +++ b/website/docs/best-practices/materializations/materializations-guide-6-examining-builds.md @@ -12,24 +12,24 @@ hoverSnippet: Read this guide to understand how to examine your builds in dbt. - ⌚ dbt keeps track of how **long each model took to build**, when it started, when it finished, its completion status (error, warn, or success), its materialization type, and _much_ more. - 🖼️ This information is stored in a couple files which dbt calls **artifacts**. - 📊 Artifacts contain a ton of information in JSON format, so aren’t easy to read, but **dbt Cloud** packages the most useful bits of information into a tidy **visualization** for you. -- ☁️ If you’re not using Cloud, we can still use the output of the **dbt CLI to understand our runs**. +- ☁️ If you’re not using Cloud, we can still use the output of the **dbt Core CLI to understand our runs**. ### Model Timing -That’s where dbt Cloud’s Model Timing visualization comes in extremely handy. If we’ve set up a [Job](/quickstarts/bigquery) in dbt Cloud to run our models, we can use the Model Timing tab to pinpoint our longest-running models. +That’s where dbt Cloud’s Model Timing visualization comes in extremely handy. If we’ve set up a [Job](/guides/bigquery) in dbt Cloud to run our models, we can use the Model Timing tab to pinpoint our longest-running models. 
-![dbt Cloud's Model Timing diagram](/img/guides/best-practices/materializations/model-timing-diagram.png)
+![dbt Cloud's Model Timing diagram](/img/best-practices/materializations/model-timing-diagram.png)

- 🧵 This view lets us see our models **mapped out in threads** (up to 64 threads, we’re currently running with 4, so we get 4 tracks) over time. You can think of **each thread as a lane on a highway**.
- ⌛ We can see above that `customer_status_histories` is **taking by far the most time**, so we may want to go ahead and **make that incremental**.

-If you aren’t using dbt Cloud, that’s okay! We don’t get a fancy visualization out of the box, but we can use the output from the dbt CLI to check our model times, and it’s a great opportunity to become familiar with that output.
+If you aren’t using dbt Cloud, that’s okay! We don’t get a fancy visualization out of the box, but we can use the output from the dbt Core CLI to check our model times, and it’s a great opportunity to become familiar with that output.

-### dbt CLI output
+### dbt Core CLI output

If you’ve ever run dbt, whether `build`, `test`, `run` or something else, you’ve seen some output like below. Let’s take a closer look at how to read this.

-![CLI output from a dbt build command](/img/guides/best-practices/materializations/dbt-build-output.png)
+![CLI output from a dbt build command](/img/best-practices/materializations/dbt-build-output.png)

- There are two entries per model, the **start** of a model’s build and the **completion**, which will include **how long** the model took to run. The **type** of model is included as well. For example:
diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-7-conclusion.md b/website/docs/best-practices/materializations/materializations-guide-7-conclusion.md
similarity index 89%
rename from website/docs/guides/best-practices/materializations/materializations-guide-7-conclusion.md
rename to website/docs/best-practices/materializations/materializations-guide-7-conclusion.md
index 119563b9a50..cd561716fe4 100644
--- a/website/docs/guides/best-practices/materializations/materializations-guide-7-conclusion.md
+++ b/website/docs/best-practices/materializations/materializations-guide-7-conclusion.md
@@ -9,6 +9,6 @@ hoverSnippet: Read this conclusion to our guide on using materializations in dbt

You're now following best practices in your project, and have optimized the materializations of your DAG. You’re equipped with the 3 main materializations that cover almost any analytics engineering situation!

-There are more configs and materializations available, as well as specific materializations for certain platforms and adapters — and like everything with dbt, materializations are extensible, meaning you can create your own [custom materializations](/guides/advanced/creating-new-materializations) for your needs. So this is just the beginning of what you can do with these powerful configurations.
+There are more configs and materializations available, as well as specific materializations for certain platforms and adapters — and like everything with dbt, materializations are extensible, meaning you can create your own [custom materializations](/guides/create-new-materializations) for your needs. So this is just the beginning of what you can do with these powerful configurations.

For the vast majority of users and companies though, tables, views, and incremental models will handle everything you can throw at them.
Develop your intuition and expertise for these materializations, and you’ll be well on your way to tackling advanced analytics engineering problems.
diff --git a/website/docs/community/resources/getting-help.md b/website/docs/community/resources/getting-help.md
index 5f423683014..2f30644186e 100644
--- a/website/docs/community/resources/getting-help.md
+++ b/website/docs/community/resources/getting-help.md
@@ -7,9 +7,9 @@ dbt is open source, and has a generous community behind it. Asking questions wel
### 1. Try to solve your problem first before asking for help

#### Search the existing documentation
-The docs site you’re on is highly searchable, make sure to explore for the answer here as a first step. If you’re new to dbt, try working through the [quickstart guide](/quickstarts) first to get a firm foundation on the essential concepts.
+The docs site you’re on is highly searchable, make sure to explore for the answer here as a first step. If you’re new to dbt, try working through the [quickstart guide](/guides) first to get a firm foundation on the essential concepts.

#### Try to debug the issue yourself
-We have a handy guide on [debugging errors](/guides/best-practices/debugging-errors) to help out! This guide also helps explain why errors occur, and which docs you might need to search for help.
+We have a handy guide on [debugging errors](/guides/debug-errors) to help out! This guide also helps explain why errors occur, and which docs you might need to search for help.

#### Search for answers using your favorite search engine
We’re committed to making more errors searchable, so it’s worth checking if there’s a solution already out there! Further, some errors related to installing dbt, the SQL in your models, or getting YAML right, are errors that are not-specific to dbt, so there may be other resources to check.
@@ -60,4 +60,4 @@ If you want to receive dbt training, check out our [dbt Learn](https://learn.get
- Billing
- Bug reports related to the web interface

-As a rule of thumb, if you are using dbt Cloud, but your problem is related to code within your dbt project, then please follow the above process rather than reaching out to support.
\ No newline at end of file
+As a rule of thumb, if you are using dbt Cloud, but your problem is related to code within your dbt project, then please follow the above process rather than reaching out to support.
diff --git a/website/docs/community/resources/jobs-terms-and-conditions.md b/website/docs/community/resources/jobs-terms-and-conditions.md
new file mode 100644
index 00000000000..f2f2134f847
--- /dev/null
+++ b/website/docs/community/resources/jobs-terms-and-conditions.md
@@ -0,0 +1,16 @@
+---
+title: "dbt Labs Community #jobs Channels Terms and Conditions"
+id: "jobs-terms-and-conditions"
+description: "Before posting a job in the dbt Community or submitting an application, review these terms and conditions."
+---
+
+I agree to abide by the [dbt Community Code of Conduct](/community/resources/code-of-conduct) and all laws applicable to me in my use of the dbt Community's #jobs channels. I further agree:
+
+- dbt Labs is not responsible for, nor does it warrant or guarantee, the validity, accuracy, completeness, legality, or reliability of any functionality of any #jobs channel, any posting's content, or any application and/or solicitation of any kind of employment.
+- dbt Labs does not review and approve job-related content.
+- dbt Labs disclaims liability of any kind whatsoever for any type of damage that occurs while using the community Slack for job-related reasons, and I waive any type of claim (including actual, special or consequential damages) to the maximum extent permitted by law. +- Without limitation, dbt Labs disclaims liability for quality, performance, merchantability, and fitness for a particular purpose, express or implied, that may arise out of my use of the community Slack for job-related content, my reliance on such information, and/or my provision/receipt of job-related information. +- I understand that no internet-based site is without risk, and my use is at my own risk. +- My use of any job-posting template (or other forum for providing job-related information) confirms my consent to provide the data posted, confirms that I have permission to post such data, and is subject to the terms of the [dbt Labs privacy policy](https://www.getdbt.com/cloud/privacy-policy). + +For further information, please contact [legal@dbtlabs.com](mailto:legal@dbtlabs.com). diff --git a/website/docs/community/resources/oss-expectations.md b/website/docs/community/resources/oss-expectations.md index 7bcc79cac9e..9c916de1240 100644 --- a/website/docs/community/resources/oss-expectations.md +++ b/website/docs/community/resources/oss-expectations.md @@ -4,7 +4,7 @@ title: "Expectations for OSS contributors" Whether it's a dbt package, a plugin, `dbt-core`, or this very documentation site, contributing to the open source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and to give back to the community. The goal of this page is to help you understand what to expect when contributing to dbt open source software (OSS). While we can only speak for our own experience as open source maintainers, many of these guidelines apply when contributing to other open source projects, too. -Have you seen things in other OSS projects that you quite like, and think we could learn from? [Open a discussion on the Developer Hub](https://github.com/dbt-labs/docs.getdbt.com/discussions/new), or start a conversation in the dbt Community Slack (for example: `#community-strategy`, `#dbt-core-development`, `#package-ecosystem`, `#adapter-ecosystem`). We always appreciate hearing from you! +Have you seen things in other OSS projects that you quite like, and think we could learn from? [Open a discussion on the dbt Community Forum](https://discourse.getdbt.com), or start a conversation in the dbt Community Slack (for example: `#community-strategy`, `#dbt-core-development`, `#package-ecosystem`, `#adapter-ecosystem`). We always appreciate hearing from you! ## Principles @@ -51,7 +51,7 @@ An issue could be a bug you’ve identified while using the product or reading t ### Best practices for issues -- Issues are **not** for support / troubleshooting / debugging help. Please [open a discussion on the Developer Hub](https://github.com/dbt-labs/docs.getdbt.com/discussions/new), so other future users can find and read proposed solutions. If you need help formulating your question, you can post in the `#advice-dbt-help` channel in the [dbt Community Slack](https://www.getdbt.com/community/). +- Issues are **not** for support / troubleshooting / debugging help. Please [open a discussion on the dbt Community Forum](https://discourse.getdbt.com), so other future users can find and read proposed solutions. 
If you need help formulating your question, you can post in the `#advice-dbt-help` channel in the [dbt Community Slack](https://www.getdbt.com/community/). - Always search existing issues first, to see if someone else had the same idea / found the same bug you did. - Many repositories offer templates for creating issues, such as when reporting a bug or requesting a new feature. If available, please select the relevant template and fill it out to the best of your ability. This will help other people understand your issue and respond. @@ -82,8 +82,8 @@ In some cases, the right resolution to an open issue might be tangential to the | `triage` | This is a new issue which has not yet been reviewed by a maintainer. This label is removed when a maintainer reviews and responds to the issue. | | `bug` | This issue represents a defect or regression from the behavior that's documented, or that you reasonably expect | | `enhancement` | This issue represents net-new functionality, including an extension of an existing capability | -| `good first issue` | This issue does not require deep knowledge of the codebase to implement. This issue is appropriate for a first-time contributor. | -| `help wanted` | This issue is trickier than a "good first issue." The required changes are scattered across the codebase, or more difficult to test. The maintainers are happy to help an experienced community contributor; they aren't planning to prioritize this issue themselves. | +| `good_first_issue` | This issue does not require deep knowledge of the codebase to implement. This issue is appropriate for a first-time contributor. | +| `help_wanted` | This issue is trickier than a "good first issue." The required changes are scattered across the codebase, or more difficult to test. The maintainers are happy to help an experienced community contributor; they aren't planning to prioritize this issue themselves. | | `duplicate` | This issue is functionally identical to another open issue. The maintainers will close this issue and encourage community members to focus conversation on the other one. | | `stale` | This is an old issue which has not recently been updated. In repositories with a lot of activity, stale issues will periodically be closed. | | `wontfix` | This issue does not require a code change in the repository, or the maintainers are unwilling to merge a change which implements the proposed behavior. | diff --git a/website/docs/community/resources/viewpoint.md b/website/docs/community/resources/viewpoint.md index e159c6178a3..5c3f80555c5 100644 --- a/website/docs/community/resources/viewpoint.md +++ b/website/docs/community/resources/viewpoint.md @@ -7,7 +7,7 @@ id: "viewpoint" In 2015-2016, a team of folks at RJMetrics had the opportunity to observe, and participate in, a significant evolution of the analytics ecosystem. The seeds of dbt were conceived in this environment, and the viewpoint below was written to reflect what we had learned and how we believed the world should be different. **dbt is our attempt to address the workflow challenges we observed, and as such, this viewpoint is the most foundational statement of the dbt project's goals.** -The remainder of this document is largely unedited from [the original post](https://blog.getdbt.com/building-a-mature-analytics-workflow/). +The remainder of this document is largely unedited from [the original post](https://getdbt.com/blog/building-a-mature-analytics-workflow). 
::: diff --git a/website/docs/community/spotlight/alan-cruickshank.md b/website/docs/community/spotlight/alan-cruickshank.md new file mode 100644 index 00000000000..74ef95a2b61 --- /dev/null +++ b/website/docs/community/spotlight/alan-cruickshank.md @@ -0,0 +1,43 @@ +--- +id: alan-cruickshank +title: Alan Cruickshank +description: | + I've been around in the dbt community, especially the London dbt Meetup, since early 2019—around the time that we started using dbt at tails.com. My background is the startup/scaleup space and building data teams in a context where there is a lot of growth going on but there isn't a lot of money around to support that. That's a topic that I've written and spoken about on several occasions on podcasts, blogposts and even at Coalesce 2020 and 2021! + + Aside from my work at tails.com, my other main focus at the moment is SQLFluff, the open source SQL linter which I started developing as part of a hackday at tails.com in late 2019 and now is the most starred SQL linter on Github with almost 1M downloads a month. +image: /img/community/spotlight/alan-cruickshank.jpg +pronouns: he/him +location: London, UK +jobTitle: Insights Director +companyName: tails.com +organization: Author & Maintainer of SQLFluff +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/amcruickshank/ + - name: SQLFluff + link: https://sqlfluff.com +dateCreated: 2023-06-30 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I [joined the community](https://www.getdbt.com/community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) in 2019 and it's been an invaluable source of advice and wisdom, especially operating on the bleeding edge of open source data tooling. It's been a place to meet like-minded people, even find new colleagues and certainly one of the places I look to when thinking about how to approach hairy data problems. + +In London it's also been one of the most vibrant meetup groups in person, compared to many others which are either very, very specialized or more focussed on larger organisations. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I just want to be useful 😁. I've learned a lot from the community over the years, and now I want to be able to give back to it. My primary vehicle for that is SQLFluff - both as a tool for the community to use, but also as a way of encouraging a wider group of people to feel welcome and able to contribute to open source software and build the tools of the future. + +I also see SQLFluff as a vehicle to drive more consistency in the way we write SQL, and through that drive better communication and lower the barrier for new people to enter this field and find their own success. + +## What have you learned from community members? What do you hope others can learn from you? + +For better or worse, I spend most of my day job on people and organisational things, less on how to solve individual problems, and more on how to enable and support groups of people in being able to make great decisions themselves. In some ways, if I have to touch the keyboard too much, it's a sign that I've failed in that calling. dbt itself is a tool which enables better collaboration—and the community is full of people with great ideas on how to better enable other people around us. 
I hope that I’m able to pass some of that knowledge and the experience of applying it in a scaleup environment back to others also treading this path.
+
+More specifically from the dbt community, if I were to pick one recommendation, it would be Emilie Schario’s talk from Coalesce 2022 on [“Data Led is Dumb”](https://www.youtube.com/watch?v=WsMHPALc8Vg&t=1s). I think it should be essential watching for anyone who’s hearing “Data Led” a lot, and wants to turn that excitement into practical action.
+
+## Anything else interesting you want to tell us?
+
+If you're not using SQLFluff on your dbt project, you probably should be: https://github.com/sqlfluff/sqlfluff
diff --git a/website/docs/community/spotlight/alison-stanton.md b/website/docs/community/spotlight/alison-stanton.md
new file mode 100644
index 00000000000..ffa5e8499c7
--- /dev/null
+++ b/website/docs/community/spotlight/alison-stanton.md
@@ -0,0 +1,84 @@
+---
+id: alison-stanton
+title: Alison Stanton
+description: |
+  I started programming 20+ years ago. I moved from web applications into transforming data and business intelligence reporting because it's both hard and useful. The majority of my career has been engineering for SaaS companies. For my last few positions I've been brought in to transition larger, older companies to a modern data platform and ways of thinking.
+
+  I am dbt Certified. I attend Coalesce and other dbt events virtually. I speak up in dbt Slack and on the dbt-core, dbt-redshift, and dbt-sqlserver repositories. dbt Slack is my happy place, especially #advice-for-dbt-power-users. I care a lot about the dbt documentation and dbt doc.
+image: /img/community/spotlight/alison.jpg
+pronouns: she/her
+location: Chicago, IL, USA
+jobTitle: AVP, Analytics Engineering Lead
+organization: Advocates for SOGIE Data Collection
+socialLinks:
+  - name: LinkedIn
+    link: https://www.linkedin.com/in/alisonstanton/
+  - name: Github
+    link: https://github.com/alison985/
+dateCreated: 2023-11-07
+hide_table_of_contents: true
+---
+
+## When did you join the dbt community and in what way has it impacted your career?
+
+I joined the dbt community when I joined an employer in mid-2020. To summarize the important things that dbt has given me: it allowed me to focus on the next set of data challenges instead of staying in toil. Data folks joke that we're plumbers, but we're digital plumbers and that distinction should enable us to be DRY. That means not only writing DRY code like dbt allows, but also having tooling automation to DRY up repetitive tasks like dbt provides.
+
+dbt's existence flipped the experience of data testing on its head for me. I went from a) years of instigating tech discussions on how to systematize data quality checks and b) building my own SQL tests and design patterns, to having built-in mechanisms for data testing.
+
+dbt and the dbt community materials are assets I can use in order to provide validation for things I have, do, and will say about data. Having outside voices to point to when requesting investment in data up-front - to avoid problems later - is an under-appreciated tool for data leaders' toolboxes.
+
+dbt's community has given me access to both a) high-quality, seasoned SMEs in my field to learn from and b) newer folks I can help. Both are gifts that I cherish.
+
+## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community?
+
+Who I want to be when I grow up:
+
+MJ, who was the first person to ever say “data build tool” to me.
If I'd listened to her then I could have been part of the dbt community years sooner.
+
+Christine Dixon who presented “Could You Defend Your Data in Court?” at Coalesce 2023. In your entire data career, that is the most important piece of education you'll get.
+
+The dbt community team in general. Hands-down the most important work they do is the dbt Slack community, which gives me and others the accessibility we need to participate. Gwen Windflower (Winnie) for her extraordinary ability to bridge technical nuance with business needs on-the-fly. Dave Connors for being the first voice for “a node is a node is a node”. Joel Labes for creating the ability to emoji-react with :sparkles: to post to the #best-of-slack channel. And so on. The decision to foster a space for data instead of just for their product because that enhances their product. The extremely impressive ability to maintain a problem-solving-is-cool, participate-as-you-can, chorus-of-voices, international, not-only-cis-men, and we're-all-in-this-together community.
+
+Other (all?) dbt labs employees who engage with the community, instead of having a false separation with it - like most software companies. Welcoming feedback, listening to it, and actioning or filtering it out (ex. Mirna Wong, account reps). Thinking holistically about the eco-system not just one feature at a time (ex. Anders). Responsiveness and ability to translate diverse items into technical clarity and focused actions (ex. Doug Beatty, the dbt support team). I've been in software and open source and online communities for a long time - these are rare things we should not take for granted.
+
+Josh Devlin for prolificness that demonstrates expertise and dedication to helping.
+
+The maintainers of dbt packages like dbt-utils, dbt-expectations, dbt-date, etc.
+
+Everyone who gets over their fear to ask a question, propose an answer that may not work, or otherwise take a risk by sharing their voice.
+
+I hope I can support my employer and my professional development and my dbt community through the following:
+
+- Elevate dbt understanding of and support for Enterprise-size company use cases through dialogue, requests, and examples.
+- Emphasize rigor with defensive coding and comprehensive testing practices.
+- Improve the onboarding and up-skilling of dbt engineers through feedback and edits on docs.getdbt.com.
+- Contribute to the maintenance of a collaborative and helpful dbt community as the number of dbt practitioners reaches various growth stages and tipping points.
+- Engage in dialogue. Providing feedback. Champion developer experience as a priority. Be a good open source citizen on Github.
+
+## What have you learned from community members? What do you hope others can learn from you?
+
+I have learned:
+
+Details on DAG sequencing.
+How to make an engineering proposal a community conversation.
+The dbt semantic layer.
+So many things that are now so engrained in me that I can't remember not knowing them.
+
+I can teach and share about:
+
+Naming new concepts and how to choose those names.
+Reproducibility, reconciliation, and audits.
+Data ethics.
+Demographic questions for sexual orientation and/or gender identity on a form. I'm happy to be your shortcut to the most complicated data and most-engrained tech debt in history.
+I also geek out talking about: reusing functionality in creative ways, balancing trade-offs in data schema modeling, dealing with all of an organization's data holistically, tracking instrumentation, and philosophy on prioritization.
+
+The next things on my agenda to learn about:
+
+Successes and failures in data literacy work. The best I've found so far is 1:1 interactions and that doesn't scale.
+How to reduce the amount of time running dbt test takes while maintaining coverage.
+Data ethics.
+The things you think are most important by giving them a :sparkles: emoji reaction in Slack.
+
+## Anything else interesting you want to tell us?
+
+My gratitude to each community member for this community.
diff --git a/website/docs/community/spotlight/bruno-de-lima.md b/website/docs/community/spotlight/bruno-de-lima.md
index 7f40f66859c..0365ee6c6a8 100644
--- a/website/docs/community/spotlight/bruno-de-lima.md
+++ b/website/docs/community/spotlight/bruno-de-lima.md
@@ -2,11 +2,11 @@
id: bruno-de-lima
title: Bruno de Lima
description: |
-  I am an Analytics Engineer and aspiring tech writer coming from an academic engineering background.
+  Hi all! I'm a Data Engineer, deeply fascinated by the awesomeness of dbt. I love talking about dbt, creating content from daily tips to blogposts and engaging with this vibrant community!

-  I worked at Indicium as an Analytics Engineer for more than a year, having worked with dbt (of course, every day) for transformation; BigQuery, Snowflake, and Databricks as data warehouses; Power BI and Tableau for BI; and Airflow for orchestration.
+  Started my career at the beginning of 2022 at Indicium as an Analytics Engineer, working with dbt from day 1. By 2023, my path took a global trajectory as I joined phData as a Data Engineer, expanding my experiences and forging connections beyond Brazil. While dbt is at the heart of my expertise, I've also delved into data warehouses such as Snowflake, Databricks, and BigQuery; visualization tools like Power BI and Tableau; and several minor modern data stack tools.

-  I actively participate in the dbt community, having attended two dbt meetups in Brazil organized by Indicium; writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the code; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around!
+  I actively participate in the dbt community, having attended two dbt Meetups in Brazil organized by Indicium; writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the code; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around!
image: /img/community/spotlight/bruno-de-lima.jpg
pronouns: he/him
location: Florianópolis, Brazil
@@ -18,7 +18,7 @@ socialLinks:
    link: https://www.linkedin.com/in/brunoszdl/
  - name: Medium
    link: https://medium.com/@bruno.szdl
-dateCreated: 2023-03-28
+dateCreated: 2023-11-05
hide_table_of_contents: true
---
@@ -30,7 +30,7 @@ It took me some time to become an active member of the dbt community. I started

Inspired by other members, especially Josh Devlin and Owen Prough, I began answering questions on Slack and Discourse. For questions I couldn't answer, I would try engaging in discussions about possible solutions or provide useful links. I also started posting dbt tips on LinkedIn to help practitioners learn about new features or to refresh their memories about existing ones.

-By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too.
I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers. +By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too. I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, Discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers, including the one for working at phData. Thanks to the dbt community, I went from feeling uncertain about my career prospects to having a solid career and being surrounded by incredible people. diff --git a/website/docs/community/spotlight/dakota-kelley.md b/website/docs/community/spotlight/dakota-kelley.md new file mode 100644 index 00000000000..57834d9cdff --- /dev/null +++ b/website/docs/community/spotlight/dakota-kelley.md @@ -0,0 +1,30 @@ +--- +id: dakota-kelley +title: Dakota Kelley +description: | + For the last ~2 years I've worked at phData. Before that I spent 8 years working as a Software Developer in the public sector. Currently I'm a Solution Architect, helping our customers and clients implement dbt on Snowflake, working across multiple cloud providers. + + I first started reading about dbt when I was in grad school about 3 years ago. When I began with phData I had a fantastic opportunity to work with dbt. From there I fell in love with the Engineering practices and structure that I always felt were missing from Data Work. Since then, I've been fortunate enough to speak at Coalesce 2022 and at Coalesce 2023. On top of this, I've written numerous blogs about dbt as well. +image: /img/community/spotlight/dakota.jpg +pronouns: he/him +location: Edmond, USA +jobTitle: Solution Architect +companyName: phData +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/dakota-kelley/ +dateCreated: 2023-11-08 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt Community not too long after my first working experience. One of my passions is giving back and helping others, and being a part of the community allows me to help others with problems I've tackled before. Along the way it helps me learn new ways and see different methods to solve a wide variety of problems. Every time I interact with the community I learn something new, and that energizes me. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +This is a tough one. I know there are several, but the main qualities I resonate with are from those who dig in and help each other. There are always nuances to others' situations, and it's good to dig in together, understand those, and seek a solution. The other quality I look for is someone who is trying to pull others up with them. At the end of the day we should all be striving to make all things better than they were when we arrived, regardless of whether that's the dbt Community or the local park we visit for rest and relaxation. + +## What have you learned from community members? What do you hope others can learn from you?
+ +The thing I hope others take away from me is to genuinely support others and tackle problems with curiosity. There used to be a time when I was always worried about being wrong, so I wouldn't get too involved. It's okay to be wrong; that's how we learn new ways to handle problems and find new ways to grow. We just all have to be open to learning and trying our best to help and support each other. diff --git a/website/docs/community/spotlight/fabiyi-opeyemi.md b/website/docs/community/spotlight/fabiyi-opeyemi.md new file mode 100644 index 00000000000..f67ff4aaefc --- /dev/null +++ b/website/docs/community/spotlight/fabiyi-opeyemi.md @@ -0,0 +1,41 @@ +--- +id: fabiyi-opeyemi +title: Opeyemi Fabiyi +description: | + I'm an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. I've also got a background in financial services and supply chain. I'm passionate about helping organizations to become data-driven and I majorly use dbt for data modeling, while the other aspect of the stack is largely dependent on the client infrastructure I'm working with, so I often say I'm tool-agnostic. 😀 + + I'm the founder of Nigeria's Young Data Professional Community. I'm also the organizer of the Lagos dbt Meetup, which I started, and one of the organizers of the DataFest Africa Conference. I became an active member of the dbt Community in 2021 & spoke at Coalesce 2022. +image: /img/community/spotlight/fabiyi-opeyemi.jpg +pronouns: he/him +location: Lagos, Nigeria +jobTitle: Senior Analytics Engineer +companyName: Data Culture +organization: Young Data Professionals (YDP) +socialLinks: + - name: Twitter + link: https://twitter.com/Opiano_1 + - name: LinkedIn + link: https://www.linkedin.com/in/opeyemifabiyi/ +dateCreated: 2023-11-06 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the [dbt Slack community](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) in 2021, and it has been an experience getting to learn from thought leaders in the space and stay in touch with cutting-edge innovation in the data space. The community has helped me become a better engineer by reading different responses to questions on Slack, and seeing genuine support from community members help other members tackle and solve their difficult problems is inspiring and has allowed me to model my community (YDP & the Lagos dbt Meetup) through that lens. I randomly enter the dbt Slack daily to read and learn from different channels. I love the sense of community that resonates in the dbt Slack channel, and the good news is that I got my current role from the #jobs channel from a post from Data Culture's Co-Founder. So you can stay glued to that page if you are looking for a job role. + +The dbt community greatly impacted my previous role as a one-person data team. The community became the team I didn't have, providing all the necessary support and guidance I needed to deliver great value for the company excellently, and my experience with the community was the inspiration for my Coalesce talk in 2022 on how to leverage the dbt community as a data team of one. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community?
+ +Many great leaders inspire me in the dbt community; Joel Labes for constantly interacting with new folks and providing that safe space for everyone to ask any question, no matter how dumb you may think your question may be. He will give a response that will solve your problem; Benn Stancil for his vast experience and how he communicates it well with humour in his Friday night Substack, a newsletter I look forward to, which helps me stay current with recent trends in the global data space. + +Both of them resonate with the kind of leader I want to grow into in the dbt Community; to be vast, experienced and readily available to provide support and guidance and help people solve problems and grow their careers. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned how to show empathy as a data professional and be a great engineer from various best practices around working with data. I also want others to know that irrespective of their current level of expertise or maturity in their data career, they can make an impact by getting involved in the community and helping others grow. + +## Anything else interesting you want to tell us? + +Maybe I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I'm a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss😃 your community engagement efforts, please feel free to reach out. diff --git a/website/docs/community/spotlight/faith-lierheimer.md b/website/docs/community/spotlight/faith-lierheimer.md new file mode 100644 index 00000000000..3edb839bb1d --- /dev/null +++ b/website/docs/community/spotlight/faith-lierheimer.md @@ -0,0 +1,47 @@ +--- +id: faith-lierheimer +title: Faith Lierheimer +description: | + I've been a dbt Community member for around a year and a half. I come to the data world from teaching and academic research. Working in data fuses the aspects of those careers that I like the most, which are technical problem solving, and helping non-technical audiences understand data and what they can do with it. I have a dream stack with Databricks, dbt, and Looker. + + Professionally, I help shippers of perishable goods (everything from blueberries to childhood vaccinations) understand the risks their goods face in transit and how to mitigate them. This reduces food and medical waste worldwide. + + You can read more about these interests at faithfacts.substack.com. +image: /img/community/spotlight/faith-lierheimer.jpg +pronouns: she/her +location: Denver, CO, USA +jobTitle: Data Analyst II +companyName: Parsyl +organization: Data Angels +socialLinks: + - name: Twitter + link: https://twitter.com/FaithLierheimer + - name: LinkedIn + link: https://www.linkedin.com/in/faithlierheimer/ + - name: Substack + link: https://faithfacts.substack.com/ + - name: Data Folks + link: https://data-folks.masto.host/@faithlierheimer +dateCreated: 2023-06-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community early in 2022 hoping to find technical help with dbt, and instead found a wide support network of career-minded data professionals. Being in the dbt community has helped me find my niche in the data world, and has helped me discover ways I can grow my career and technical acumen.
Being in this community has been huge in easing my career transition from teaching into data. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I primarily conceptualize of leadership as raising the floor beneath everyone, rather than enabling a few to touch its vaulted ceiling. As I gain more experience, I'd be delighted to be a resource for fellow career changers and teachers in transition. + +And, I love to goof in #roast-my-graph in the dbt Slack. [Come join](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) that channel, it's a hoot and a holler. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned a lot from community members, but most notably and concretely, I've actually gotten excellent visualization advice in #roast-my-graph. I've taken graphs there several times where I felt stuck on the presentation and have learned a lot about effective vizzes from my peers there. + +As I continue to gain experience, I hope others can learn from me what a successful career change looks like. And, ultimately, to take the work seriously but to not take ourselves that seriously. + +## Anything else interesting you want to tell us? + +I have a black cat with one eye named Gus and my purpose is now to give him the best existence possible. diff --git a/website/docs/community/spotlight/jing-yu-lim.md b/website/docs/community/spotlight/jing-yu-lim.md new file mode 100644 index 00000000000..a3d1784293f --- /dev/null +++ b/website/docs/community/spotlight/jing-yu-lim.md @@ -0,0 +1,41 @@ +--- +id: jing-yu-lim +title: Jing Yu Lim +description: | + For ~3 years, I was a Product Analyst at Grab, a ride-hailing and food delivery app in Southeast Asia, before taking on an Analytics Engineering role in Spenmo, a B2B Fintech startup. I joined a tech company as an analyst in June 2023, but was recently impacted by a layoff. I'm also one of the co-organisers of the Singapore dbt Meetup! + + My story with dbt started in Jan 2022, when I joined Spenmo where I taught myself dbt, mainly via dbt's documentation and Slack community. We used Snowflake as our data warehouse, and Holistics for BI. I spoke about data self-serve and Spenmo's journey with dbt at multiple meetups. +image: /img/community/spotlight/jing-lim.jpg +pronouns: she/her +location: Singapore, Singapore +jobTitle: I'm open to work! +companyName: "" +organization: "" +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/limjingyu/ +dateCreated: 2023-07-01 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community late January 2022, while setting up Spenmo's first dbt project. I was completely new to dbt, and relied heavily on the #advice-dbt-help channel in dbt Slack whenever I got stuck. I have learnt so much from reading discussions in other channels as well (e.g. #leading-data-teams, #advice-mock-interviews, #db-snowflake, #tools-holistics). + +The dbt community also helped me expand my professional network, where I met so many amazing individuals! It all started with #local-singapore which was created by community member Jolanda Zwagemaker sometime in April 2022. 
We organised dinners to connect with one another, which eventually led to an opportunity to run Singapore dbt Meetup (HUGE thank you to dbt) - it is heartwarming to see connections forged between many attendees of the meetup, where we also learn from one another. It really does feel like a community! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Claire Carroll and Mila Page! My very first touchpoint with dbt was their articles in [The Analytics Engineering Guide](https://www.getdbt.com/analytics-engineering/). I remember relating to it so much that I was saying "YES" to every other line I read, and sending text snippets to my friends. + +To me, Analytics Engineering could help overcome certain challenges I face as an analyst, and make the job feel less like a "hamster wheel." As the concept of analytics engineering is fairly new in Singapore, I feel the need to spread the word and bring about a mindset shift among not just data teams, but anyone who needs to work with a data team. + +## What have you learned from community members? What do you hope others can learn from you? + +One of my favourite presentations from the Singapore dbt Meetup was ["How would the ideal Semantic Layer look like?"](https://docs.google.com/presentation/d/1t1ts04b7qA-BVlV3qbNZ4fI-MSZn0iL6_FhsaWhJk_0/edit?usp=sharing) by fellow community member Thanh Dinh from Holistics. It taught me a new perspective on metrics: they could be like dbt models, where dependencies can be set up between metric models. + +I definitely have so much more to learn as an individual, but I hope to share some of my tips and lessons in terms of data modelling with others. + +## Anything else interesting you want to tell us? + +Thank you, dbt, for enabling us to run meetups! It has been critical for ensuring a great experience for the Singapore community. Also a huge shoutout to Amada, the Global Community Development Lead, for always being super helpful and supportive despite the 12-hour time difference! diff --git a/website/docs/community/spotlight/josh-devlin.md b/website/docs/community/spotlight/josh-devlin.md new file mode 100644 index 00000000000..d8a9b91c282 --- /dev/null +++ b/website/docs/community/spotlight/josh-devlin.md @@ -0,0 +1,42 @@ +--- +id: josh-devlin +title: Josh Devlin +description: | + Josh Devlin has a rich history of community involvement and technical expertise in both the dbt and wider analytics communities. + + Discovering dbt in early 2020, he quickly became an integral member of its community, leveraging the platform as a learning tool and aiding others along their dbt journey. Josh has helped thousands of dbt users with his advice and near-encyclopaedic knowledge of dbt. + + Beyond the online community, he transitioned from being an attendee at the first virtual Coalesce conference in December 2020 to a presenter at the first in-person Coalesce event in New Orleans in 2022. He has also contributed to the dbt-core and dbt-snowflake codebases, helping improve the product in the most direct way. + + His continuous contributions echo his philosophy of learning through teaching, a principle that has not only enriched the dbt community but also significantly bolstered his proficiency with the tool, making him a valuable community member. + + Aside from his technical endeavors, Josh carries a heart for communal growth and an individual's ability to contribute to a larger whole, a trait mirrored in his earlier pursuits as an orchestral musician.
His story is a blend of technical acumen, communal involvement, and a nuanced appreciation for the symbiotic relationship between teaching and learning, making him a notable figure in the analytics engineering space. +image: /img/community/spotlight/josh-devlin.jpg +pronouns: he/him +location: Melbourne, Australia (but spent most of the last decade in Houston, USA) +jobTitle: Senior Analytics Engineer +companyName: Canva +socialLinks: + - name: Twitter + link: https://twitter.com/JayPeeDevlin + - name: LinkedIn + link: https://www.linkedin.com/in/josh-devlin/ +dateCreated: 2023-11-10 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I have been a subscriber to 'The Data Science Roundup' (now ['The Analytics Engineering Roundup'](https://roundup.getdbt.com/)) since its inception, so I knew that dbt existed from the very beginning, since the time that dbt Labs was still called Fishtown Analytics. Despite that, I never really understood what the tool was or how it fit in until early 2020 when I first started experimenting with it. I immediately joined the community and found it warm and welcoming, so I started to help people where I could and never stopped! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I like to think I represent the warm, helpful vibes of the early days of the Community, where folks like Claire Carroll warmly welcomed myself and others! + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned that the more you give, the more you get. I've put hundreds of hours into helping other people in the community, but I've gotten all that back and much more. I hope I can encourage others to give of themselves and reap the rewards later! + +## Anything else interesting you want to tell us? + +In a previous life I was an orchestral musician! diff --git a/website/docs/community/spotlight/karen-hsieh.md b/website/docs/community/spotlight/karen-hsieh.md index 1a5cc8c4788..5147f39ce59 100644 --- a/website/docs/community/spotlight/karen-hsieh.md +++ b/website/docs/community/spotlight/karen-hsieh.md @@ -12,7 +12,7 @@ description: | image: /img/community/spotlight/karen-hsieh.jpg pronouns: she/her location: Taipei, Taiwan -jobTitle: Director of Product & Data +jobTitle: Director of Tech & Data companyName: ALPHA Camp organization: "" socialLinks: @@ -22,7 +22,7 @@ socialLinks: link: https://www.linkedin.com/in/karenhsieh/ - name: Medium link: https://medium.com/@ijacwei -dateCreated: 2023-03-24 +dateCreated: 2023-11-04 hide_table_of_contents: true --- diff --git a/website/docs/community/spotlight/oliver-cramer.md b/website/docs/community/spotlight/oliver-cramer.md new file mode 100644 index 00000000000..bfd62db0908 --- /dev/null +++ b/website/docs/community/spotlight/oliver-cramer.md @@ -0,0 +1,35 @@ +--- +id: oliver-cramer +title: Oliver Cramer +description: | + When I joined Aquila Capital in early 2022, I had a modern data stack with SqlDBM, dbt & Snowflake available. During the first half year I joined the dbt community. I have been working in the business intelligence field for many years. In 2006 I founded the first TDWI Roundtable in the DACH region. I often speak at conferences, such as the Snowflake Summit and the German TDWI conference.
+ I have been very involved in the data vault community for over 20 years and I do a lot of work with dbt Labs’ Sean McIntyre and Victoria Mola to promote Data Vault in EMEA. I have even travelled to Canada and China to meet data vault community members! Currently I have a group looking at the Data Vault dbt packages. The German Data Vault User Group (DDVUG) has published a sample database to test Data Warehouse Automation tools. + In addition, I founded the Analytics Engineering Northern Germany Meetup Group, which will transition into an official dbt Meetup, the Northern Germany dbt Meetup. +image: /img/community/spotlight/oliver.jpg +pronouns: he/him +location: Celle, Germany +jobTitle: Lead Data Warehouse Architect +companyName: Aquila Capital +organization: TDWI Germany +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/oliver-cramer/ +dateCreated: 2023-11-02 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community in 2022. My current focus is on building modern data teams. There is no magic formula for structuring your analytics function. Given the pace of technological change in our industry, the structure of a data team must evolve over time. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I like working with dbt Labs' Sean McIntyre to promote Data Vault in Europe and Victoria Perez Mola, also from dbt Labs, is always a great help when I have questions about dbt. + +## What have you learned from community members? What do you hope others can learn from you? + +I just think it's good to have a community, to be able to ask questions and get good answers. + +## Anything else interesting you want to tell us? + +Data Vault is actively looking forward to supporting the messaging that dbt Cloud (+packages) is a real alternative that works. diff --git a/website/docs/community/spotlight/owen-prough.md b/website/docs/community/spotlight/owen-prough.md new file mode 100644 index 00000000000..cc8ce37221e --- /dev/null +++ b/website/docs/community/spotlight/owen-prough.md @@ -0,0 +1,41 @@ +--- +id: owen-prough +title: Owen Prough +description: | + Well met, data adventurer! My professional data history is mostly USA healthcare-related (shout out to ANSI X12 claim files) while working with large (10k+ employee) software companies and small (but growing!) startups. My constant companion for the last decade has been SQL of various flavors https://xkcd.com/927/, and these days I mostly work with PostgreSQL, AWS Athena, and Snowflake. I think SQL is a great tool to solve interesting problems. + + Oh and also dbt. I haven't done anything too fancy with dbt, but I have contributed to the dbt-athena adapter and a few different packages. Mostly I lurk on Slack, cleverly disguised as a duck. It's a professional goal of mine to someday attend Coalesce. +image: /img/community/spotlight/owen-prough.jpg +pronouns: he/him +location: Milwaukee, USA +jobTitle: Data Engineer +companyName: Sift Healthcare +organization: "" +socialLinks: + - name: LinkedIn + link: https://linkedin.com/in/owen-prough +dateCreated: 2023-06-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I committed dbt_project.yml to the company git repo in July 2021 so I've been hanging out with all of you for about 2 years. 
What I love the most about dbt is how easy it is to write data tests. Writing data tests without dbt was painful, but now with all the tests we have in dbt I have a dramatically improved confidence in our data quality. + +The wider dbt community is also a reliable and constant source of education. I only interact in a few Slack channels, but I read *many* Slack channels to see what others are doing in the Analytics Engineering space and to get ideas about how to improve the processes/pipelines at my company. Y'all are great. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +This is an interesting question. I think I most identify with or am inspired by [Josh Devlin](./josh-devlin), who seems to be everywhere on Slack and very knowledgeable/helpful. I also want to know things and pay it forward. + +Also shout out to [Faith Lierheimer](./faith-lierheimer), whose contributions to [#roast-my-graph](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) always make me laugh and/or weep. + +## What have you learned from community members? What do you hope others can learn from you? + +The [public documentation for dbt](https://docs.getdbt.com/docs/introduction) is quite good. You should bookmark it and make it a personal goal to read through it all. There are a lot of cool things that dbt can do. + +Also I think it's really cool to see newcomers asking questions on Slack/[Discourse](https://discourse.getdbt.com/) and then see those same people answering others' questions. It speaks to the value we all get from dbt that folks want to give back to the community. + +## Anything else interesting you want to tell us? + +Did you notice how I avoided starting a sentence with "dbt"? That's because I know the standard is lowercase, but starting a sentence with a lowercase word looks weird to my eyes. diff --git a/website/docs/community/spotlight/sam-debruyn.md b/website/docs/community/spotlight/sam-debruyn.md new file mode 100644 index 00000000000..166adf58b09 --- /dev/null +++ b/website/docs/community/spotlight/sam-debruyn.md @@ -0,0 +1,37 @@ +--- +id: sam-debruyn +title: Sam Debruyn +description: | + I have a background of about 10 years in software engineering and moved to data engineering in 2020. Today, I lead dataroots's data & cloud unit on a technical level, allowing me to share knowledge and help multiple teams and customers, while still being hands-on every day. In 2021 and 2022, I did a lot of work on dbt-core and the dbt adapters for Microsoft SQL Server, Azure SQL, Azure Synapse, and now also Microsoft Fabric. I spoke at a few meetups and conferences about dbt and other technologies which I'm passionate about. Sharing knowledge is what drives me, so in 2023 I founded the Belgium dbt Meetup. Every meetup reached its maximum capacity ever since. +image: /img/community/spotlight/sam.jpg +pronouns: he/him +location: Heist-op-den-Berg, Belgium +jobTitle: Tech Lead Data & Cloud +companyName: dataroots +organization: "" +socialLinks: + - name: Twitter + link: https://twitter.com/s_debruyn + - name: LinkedIn + link: https://www.linkedin.com/in/samueldebruyn/ + - name: Blog + link: https://debruyn.dev/ +dateCreated: 2023-11-03 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt Community at the end of 2020, when we had dbt 0.18. 
At first, I was a bit suspicious. I thought to myself, how could a tool this simple make such a big difference? But after giving it a try, I was convinced: this is what we'll all be using for our data transformations in the future. dbt shines in its simplicity and very low learning curve. Thanks to dbt, a lot more people can become proficient in data analytics. I became a dbt evangelist, both at my job as well as in local and online data communities. I think that data holds the truth. And I think that the more people we can give access to work with data, so that they don't have to depend on others to work with complex tooling, the more we can achieve together. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +It's hard to pick one person. There are lots of folks who inspired me along the way. There is Anders Swanson (known as dataders on GitHub), with whom I've spent countless hours discussing how we can bring two things I like together: dbt and the Microsoft SQL products. It's amazing to look back on what we achieved now that dbt Labs and Microsoft are working together to bring dbt support for Fabric and Synapse. There is also Jeremy Cohen (jerco) whose lengthy GitHub discussions bring inspiration to how you can do even more with dbt and what the future might hold. Cor Zuurmond (JCZuurmond) inspired me to start contributing to dbt-core, adapters, and related packages. He did an impressive amount of work by making dbt-spark even better, building pytest integration for dbt, and of course by bringing dbt to the world's most used database: dbt-excel. + +## What have you learned from community members? What do you hope others can learn from you? + +dbt doesn't only shine when you're using it, but also under the hood. dbt's codebase is very approachable and consistently well written with code that is clean, elegant, and easy to understand. When you're thinking about a potential feature, a bugfix, or building integrations with dbt, just go to Slack or GitHub and see what you can do to make that happen. You can contribute by discussing potential features, adding documentation, writing code, and more. You don't need to be a Python expert to get started. + +## Anything else interesting you want to tell us? + +The dbt community is one of the biggest data communities globally, but also the most welcoming one. It's amazing how nice, friendly, and approachable everyone is. It's great to be part of this community. diff --git a/website/docs/community/spotlight/stacy-lo.md b/website/docs/community/spotlight/stacy-lo.md new file mode 100644 index 00000000000..f0b70fcc225 --- /dev/null +++ b/website/docs/community/spotlight/stacy-lo.md @@ -0,0 +1,40 @@ +--- +id: stacy-lo +title: Stacy Lo +description: | + I began my career as a data analyst, then transitioned to a few different roles in data and software development. Analytics Engineer is the best title to describe my expertise in data. + + I’ve been in the dbt Community for almost a year. In April, I shared my experience adopting dbt at the Taipei dbt Meetup, which inspired me to write technical articles. + + In Taiwan, the annual "iThome Iron Man Contest" happens in September, where participants post a technical article written in Mandarin every day for 30 consecutive days. Since no one has ever written about dbt in the contest, I'd like to be the first person, and that’s what I have been busy with for the past couple of months.
+image: /img/community/spotlight/stacy.jpg +pronouns: she/her +location: Taipei, Taiwan +jobTitle: Senior IT Developer +companyName: Teamson +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/olycats/ +dateCreated: 2023-11-01 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined dbt Slack in November 2022. It was the time our company decided to use dbt in our data architecture, so I joined the #local-taipei channel in dbt Slack and introduced myself. To my surprise, I was immediately invited to share my experience at a Taipei dbt Meetup. I had just joined the community, never attended any other meetups, did not know anyone there, and was very new to dbt. + +The biggest impact on my career is that I gained a lot of visibility! I got to know a lot of great data people, and now I have one meetup presentation recorded on YouTube, 30 technical articles from the iThome Iron Man Contest, and now I am featured in the dbt Community Spotlight! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Karen Hsieh is the best! She not only brought me into the dbt Community by way of the #local-taipei channel in dbt Slack, but she also encouraged me to contribute to the community in many ways, without making me feel pressured. With her passion and leadership style, Karen successfully built a friendly and diverse group of people in #local-taipei. + +I’d also like to recommend Bruno de Lima's LinkedIn posts. His 'dbt Tips of the Day' effectively delivers knowledge in a user-friendly way. In addition, I really enjoyed the dbt exam practice polls. Learning dbt can be a challenge, but Bruno makes it both easy and fun! + +## What have you learned from community members? What do you hope others can learn from you? + +I learned that there are many ways to contribute to the community, regardless of our background or skill level. Everyone has something valuable to offer, and we should never be afraid to share. Let's find our own ways to make an impact! + +## Anything else interesting you want to tell us? + +Although the #local-taipei channel in dbt Slack is not made up of many, many people, we still managed to assemble a team of 7 people to join the Iron Man Contest. We produced a total of 200 articles in 30 days on topics around dbt and data. I don’t know how many people will find them useful, but it's definitely a great start to raising awareness of dbt in Taiwan. diff --git a/website/docs/community/spotlight/sydney-burns.md b/website/docs/community/spotlight/sydney-burns.md new file mode 100644 index 00000000000..ecebd6cdec0 --- /dev/null +++ b/website/docs/community/spotlight/sydney-burns.md @@ -0,0 +1,34 @@ +--- +id: sydney-burns +title: Sydney Burns +description: | + In 2019, I started as an analytics intern at a healthcare tech startup. I learned about dbt in 2020 and joined the community to self-teach. The following year, I started using dbt professionally as a consultant, and was able to pick up various parts of the stack and dive into different implementations. That experience empowered me to strike a better balance between "best practices" and what suits a specific team best. I also spoke at Coalesce 2022, a highlight of my career! + + Now, I collaborate with other data professionals at Webflow, where I'm focused on enhancing and scaling our data operations.
I strive to share the same enthusiasm, support, and knowledge with my team that I've gained from the broader community! +image: /img/community/spotlight/sydney.jpg +pronouns: she/her +location: Panama City, FL, USA +jobTitle: Senior Analytics Engineer +companyName: Webflow +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/sydneyeburns/ +dateCreated: 2023-11-09 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +The stack I used in my first data role was outdated and highly manual. Where I live, modern tech companies are few and far between, and I didn't have many in-person resources nor enough knowledge to realize that another world was possible at my skill level. I was thrilled to find a pocket of the Internet where similarly frustrated but creative data folks were sharing thoughtful solutions to problems I'd been struggling with! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Christine Berger was my first ever (best ever!) data colleague, and the one who first introduced me to dbt. + +There are certain qualities I've always valued in her, that I've found in many others across the community, and strive to cultivate in myself — earnestness, curiosity, creativity, and consistently doing good work with deep care. + +## What have you learned from community members? What do you hope others can learn from you? + +I spent too much time in my early career feeling scared to ask for help because I didn't want others to think I was incompetent. I'd spin my wheels on something for hours before finally asking someone to help me. + +The community has proven one thing to me time and time again: there are people here who will not only help you, but will be palpably *excited* to help you and share what they know, especially if it's clear you've made efforts to use your resources and try things on your own first. I'm one of those people now! diff --git a/website/docs/dbt-cli/cli-overview.md b/website/docs/dbt-cli/cli-overview.md deleted file mode 100644 index 3b96d4637bd..00000000000 --- a/website/docs/dbt-cli/cli-overview.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: "CLI overview" -description: "Run your dbt project from the command line." ---- - -dbt Core ships with a command-line interface (CLI) for running your dbt project. The dbt CLI is free to use and available as an [open source project](https://github.com/dbt-labs/dbt-core). - -When using the command line, you can run commands and do other work from the current or _working directory_ on your computer. Before running the dbt project from the command line, make sure the working directory is your dbt project directory. For more details, see "[Creating a dbt project](/docs/build/projects)." - - - - -Once you verify your dbt project is your working directory, you can execute dbt commands. A full list of dbt commands can be found in the [reference section](/reference/dbt-commands). - - - -:::tip Pro tip: Using the --help flag - -Most command-line tools, including dbt, have a `--help` flag that you can use to show available commands and arguments. 
For example, you can use the `--help` flag with dbt in two ways: -• `dbt --help`: Lists the commands available for dbt -• `dbt run --help`: Lists the flags available for the `run` command - -::: - diff --git a/website/docs/docs/about-setup.md b/website/docs/docs/about-setup.md index 3fb868b8448..ceb34a5ccbb 100644 --- a/website/docs/docs/about-setup.md +++ b/website/docs/docs/about-setup.md @@ -3,11 +3,13 @@ title: About dbt setup id: about-setup description: "About setup of dbt Core and Cloud" sidebar_label: "About dbt setup" +pagination_next: "docs/environments-in-dbt" +pagination_prev: null --- dbt compiles and runs your analytics code against your data platform, enabling you and your team to collaborate on a single source of truth for metrics, insights, and business definitions. There are two options for deploying dbt: -**dbt Cloud** runs dbt Core in a hosted (single or multi-tenant) environment with a browser-based interface. The intuitive UI will aid you in setting up the various components. dbt Cloud comes equipped with turnkey support for scheduling jobs, CI/CD, hosting documentation, monitoring & alerting, and an integrated developer environment (IDE). +**dbt Cloud** runs dbt Core in a hosted (single or multi-tenant) environment with a browser-based interface. The intuitive user interface aids you in setting up the various components. dbt Cloud comes equipped with turnkey support for scheduling jobs, CI/CD, hosting documentation, monitoring, and alerting. It also offers an integrated development environment (IDE) and allows you to develop and run dbt commands from your local command line (CLI) or code editor. **dbt Core** is an open-source command line tool that can be installed locally in your environment, and communication with databases is facilitated through adapters. @@ -19,7 +21,7 @@ To begin configuring dbt now, select the option that is right for you. diff --git a/website/docs/docs/about/overview.md b/website/docs/docs/about/overview.md deleted file mode 100644 index e34866fa3fe..00000000000 --- a/website/docs/docs/about/overview.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: "What is dbt? " -id: "overview" ---- - -dbt is a productivity tool that helps analysts get more done and produce higher quality results. - -Analysts commonly spend 50-80% of their time modeling raw data—cleaning, reshaping, and applying fundamental business logic to it. dbt empowers analysts to do this work better and faster. - -dbt's primary interface is its CLI. Using dbt is a combination of editing code in a text editor and running that code using dbt from the command line using `dbt [command] [options]`. - -# How does dbt work? - -dbt has two core workflows: building data models and testing data models. (We call any transformed of raw data a data model.) - -To create a data model, an analyst simply writes a SQL `SELECT` statement. dbt then takes that statement and builds it in the database, materializing it as either a view or a . This model can then be queried by other models or by other analytics tools. - -To test a data model, an analyst asserts something to be true about the underlying data. For example, an analyst can assert that a certain field should never be null, should always hold unique values, or should always map to a field in another table. Analysts can also write assertions that express much more customized logic, such as “debits and credits should always be equal within a given journal entry”. 
dbt then tests all assertions against the database and returns success or failure responses. - -# Does dbt really help me get more done? - -One dbt user has this to say: *“At this point when I have a new question, I can answer it 10-100x faster than I could before.”* Here’s how: - -- dbt allows analysts to avoid writing boilerplate and : managing transactions, dropping tables, and managing schema changes. All business logic is expressed in SQL `SELECT` statements, and dbt takes care of . -- dbt creates leverage. Instead of starting at the raw data with every analysis, analysts instead build up reusable data models that can be referenced in subsequent work. -- dbt includes optimizations for data model materialization, allowing analysts to dramatically reduce the time their queries take to run. - -There are many other optimizations in the dbt to help you work quickly: macros, hooks, and package management are all accelerators. - -# Does dbt really help me produce more reliable analysis? - -It does. Here’s how: - -- Writing SQL frequently involves a lot of copy-paste, which leads to errors when logic changes. With dbt, analysts don’t need to copy-paste. Instead, they build reusable data models that then get pulled into subsequent models and analysis. Change a model once and everything that relies on it reflects that change. -- dbt allows subject matter experts to publish the canonical version of a particular data model, encapsulating all complex business logic. All analysis on top of this model will incorporate the same business logic without needing to understand it. -- dbt plays nicely with source control. Using dbt, analysts can use mature source control processes like branching, pull requests, and code reviews. -- dbt makes it easy and fast to write functional tests on the underlying data. Many analytic errors are caused by edge cases in the data: testing helps analysts find and handle those edge cases. - -# Why SQL? - -While there are a large number of great languages for manipulating data, we’ve chosen SQL as the primary [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) language at the heart of dbt. There are three reasons for this: - -1. SQL is a very widely-known language for working with data. Using SQL gives the largest-possible group of users access. -2. Modern analytic databases are extremely performant and have sophisticated optimizers. Writing data transformations in SQL allows users to describe transformations on their data but leave the execution plan to the underlying database technology. In practice, this provides excellent results with far less work on the part of the author. -3. SQL `SELECT` statements enjoy a built-in structure for describing dependencies: `FROM X` and `JOIN Y`. This results in less setup and maintenance overhead in ensuring that transforms execute in the correct order, compared to other languages and tools. - -# What databases does dbt currently support? - -See [Supported Data Platforms](/docs/supported-data-platforms) to view the full list of supported databases, warehouses, and query engines. - -# How do I get started? - -dbt is open source and completely free to download and use. See our [Getting Started guide](/docs/introduction) for more. 
diff --git a/website/docs/docs/build/about-metricflow.md b/website/docs/docs/build/about-metricflow.md index 2a5e750aea3..d76715c46a1 100644 --- a/website/docs/docs/build/about-metricflow.md +++ b/website/docs/docs/build/about-metricflow.md @@ -4,38 +4,38 @@ id: about-metricflow description: "Learn more about MetricFlow and its key concepts" sidebar_label: About MetricFlow tags: [Metrics, Semantic Layer] +pagination_next: "docs/build/join-logic" +pagination_prev: null --- -This guide introduces MetricFlow's fundamental ideas for new users. MetricFlow, which powers the dbt Semantic Layer, helps you define and manage the logic for your company's metrics. It's an opinionated set of abstractions and helps data consumers retrieve metric datasets from a data platform quickly and efficiently. +This guide introduces MetricFlow's fundamental ideas for people new to this feature. MetricFlow, which powers the dbt Semantic Layer, helps you define and manage the logic for your company's metrics. It's an opinionated set of abstractions and helps data consumers retrieve metric datasets from a data platform quickly and efficiently. -:::info +MetricFlow handles SQL query construction and defines the specification for dbt semantic models and metrics. It allows you to define metrics in your dbt project and query them with [MetricFlow commands](/docs/build/metricflow-commands) whether in dbt Cloud or dbt Core. -MetricFlow is a new way to define metrics in dbt and one of the key components of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer). It handles SQL query construction and defines the specification for dbt semantic models and metrics. +Before you start, consider the following guidelines: -To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/). - -::: - -There are a few key principles: - -- **Flexible, but complete** — Ability to create any metric on any data model by defining logic in flexible abstractions. -- **Don't Repeat Yourself (DRY)** — Avoid repetition by allowing metric definitions to be enabled whenever possible. -- **Simple with progressive complexity** — Make MetricFlow approachable by relying on known concepts and structures in data modeling. -- **Performant and efficient** — Allow for performance optimizations in centralized data engineering while still enabling distributed definition and ownership of logic. +- Define metrics in YAML and query them using these [new metric specifications](https://github.com/dbt-labs/dbt-core/discussions/7456). +- You must be on [dbt version](/docs/dbt-versions/upgrade-core-in-cloud) 1.6 or higher to use MetricFlow. +- Use MetricFlow with Snowflake, BigQuery, Databricks, Postgres (dbt Core only), or Redshift. +- Discover insights and query your metrics using the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and its diverse range of [available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations). You must have a dbt Cloud account on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). ## MetricFlow -- MetricFlow is a SQL query generation engine that helps you create metrics by constructing appropriate queries for different granularities and dimensions that are useful for various business applications. +MetricFlow is a SQL query generation tool designed to streamline metric creation across different data dimensions for diverse business needs. 
+- It operates through YAML files, where a semantic graph links language to data. This graph comprises [semantic models](/docs/build/semantic-models) (data entry points) and [metrics](/docs/build/metrics-overview) (functions for creating quantitative indicators). +- MetricFlow is a [BSL package](https://github.com/dbt-labs/metricflow) with source code available, and compatible with dbt version 1.6 and higher. Data practitioners and enthusiasts are highly encouraged to contribute. +- As a part of the dbt Semantic Layer, MetricFlow empowers organizations to define metrics using YAML abstractions. +- To query metric dimensions, dimension values, and validate configurations, use [MetricFlow commands](/docs/build/metricflow-commands). -- It uses YAML files to define a semantic graph, which maps language to data. This graph consists of [semantic models](/docs/build/semantic-models), which serve as data entry points, and [metrics](/docs/build/metrics-overview), which are functions used to create new quantitative indicators. -- MetricFlow is a [BSL package](https://github.com/dbt-labs/metricflow) (code is source available) and available on dbt versions 1.6 and higher. Data practitioners and enthusiasts are highly encouraged to contribute. + +**Note** — MetricFlow doesn't support dbt [builtin functions or packages](/reference/dbt-jinja-functions/builtins) at this time; however, support is planned for the future. -- MetricFlow, as a part of the dbt Semantic Layer, allows organizations to define company metrics logic through YAML abstractions, as described in the following sections. + +MetricFlow abides by these principles: -- You can install MetricFlow using PyPI as an extension of your [dbt adapter](/docs/supported-data-platforms) in the CLI. To install the adapter, run `pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `pip install "dbt-metricflow[snowflake]"`. - -- To query metrics dimensions, dimension values, and validate your configurations; install the [MetricFlow CLI](/docs/build/metricflow-cli). +- **Flexibility with completeness**: Define metric logic using flexible abstractions on any data model. +- **DRY (Don't Repeat Yourself)**: Minimize redundancy by enabling metric definitions whenever possible. +- **Simplicity with gradual complexity**: Approach MetricFlow using familiar data modeling concepts. +- **Performance and efficiency**: Optimize performance while supporting centralized data engineering and distributed logic ownership. ### Semantic graph @@ -55,6 +55,7 @@ For a semantic model, there are three main pieces of metadata: * [Dimensions](/docs/build/dimensions) — These are the ways you want to group or slice/dice your metrics. * [Measures](/docs/build/measures) — The aggregation functions that give you a numeric result and can be used to create your metrics. + ### Metrics @@ -66,27 +67,24 @@ MetricFlow supports different metric types: - [Derived](/docs/build/derived) — An expression of other metrics, which allows you to do calculations on top of metrics. - [Ratio](/docs/build/ratio) — Create a ratio out of two measures, like revenue per customer. - [Simple](/docs/build/simple) — Metrics that refer directly to one measure. + ## Use case In the upcoming sections, we'll show how data practitioners currently calculate metrics and compare it to how MetricFlow makes defining metrics easier and more flexible.
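+To make the metric types above concrete before walking through the full Jaffle Shop example that follows, here is a minimal sketch of what a `simple` metric defined on top of a single measure might look like in YAML. The `order_total` measure name, description, and label are illustrative and mirror the example models later in this guide.
+
+```yaml
+metrics:
+  - name: order_total
+    description: Sum of the order totals across all orders.
+    label: "Order Total"
+    type: simple
+    type_params:
+      measure: order_total
+```
+
+Ratio and derived metrics follow the same pattern, swapping the `type` and the corresponding `type_params`, as the use case below demonstrates with a ratio metric.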
-The following example data schema image shows a number of different types of data tables: +The following example data is based on the Jaffle Shop repo. You can view the complete [dbt project](https://github.com/dbt-labs/jaffle-sl-template). The tables we're using in our example model are: -- `transactions` is a production data platform export that has been cleaned up and organized for analytical consumption -- `visits` is a raw event log -- `stores` is a cleaned-up and fully normalized dimensional table from a daily production database export -- `products` is a dimensional table that came from an external source such as a wholesale vendor of the goods this store sells. -- `customers` is a partially denormalized table in this case with a column derived from the transactions table through some upstream process +- `orders` is a production data platform export that has been cleaned up and organized for analytical consumption +- `customers` is a partially denormalized table in this case with a column derived from the orders table through some upstream process -![MetricFlow-SchemaExample](/img/docs/building-a-dbt-project/MetricFlow-SchemaExample.jpeg) - -To make this more concrete, consider the metric `revenue`, which is defined using the SQL expression: +To make this more concrete, consider the metric `order_total`, which is defined using the SQL expression: -`select sum(price * quantity) as revenue from transactions` - -This expression calculates the total revenue by multiplying the price and quantity for each transaction and then adding up all the results. In business settings, the metric `revenue` is often calculated according to different categories, such as: - Time, for example `date_trunc(created_at, 'day')` - Product, using `product_category` from the `product` table. +`select sum(order_total) as order_total from orders` +This expression calculates the revenue from each order by summing the `order_total` column in the `orders` table. In a business setting, the metric `order_total` is often calculated according to different categories, such as: +- Time, for example `date_trunc(ordered_at, 'day')` +- Order Type, using the `is_food_order` dimension from the `orders` table. ### Calculate metrics Next, we'll compare how data practitioners currently calculate metrics with multiple queries. -The following example displays how data practitioners typically would calculate the revenue metric aggregated. It's also likely that analysts are asked for more details on a metric, like how much revenue came from bulk purchases. +The following example displays how data practitioners would typically calculate the aggregated `order_total` metric. It's also likely that analysts are asked for more details on a metric, like how much revenue came from new customers. Using the following query creates a situation where multiple analysts work on the same data, each using their own query method — this can lead to confusion, inconsistencies, and a headache for data management.
```sql select - date_trunc(transactions.created_at, 'day') as day - , products.category as product_category - , sum(transactions.price * transactions.quantity) as revenue + date_trunc('day',orders.ordered_at) as day, + case when customers.first_ordered_at is not null then true else false end as is_new_customer, + sum(orders.order_total) as order_total from - transactions + orders left join - products + customers on - transactions.product_id = products.product_id + orders.customer_id = customers.customer_id group by 1, 2 ``` -> Introducing MetricFlow, a key component of the dbt Semantic Layer 🤩 - simplifying data collaboration and governance. - -In the following three example tabs, use MetricFlow to define a semantic model that uses revenue as a metric and a sample schema to create consistent and accurate results — eliminating confusion, code duplication, and streamlining your workflow. +In the following three example tabs, use MetricFlow to define a semantic model that uses order_total as a metric and a sample schema to create consistent and accurate results — eliminating confusion, code duplication, and streamlining your workflow. -In this example, a measure named revenue is defined based on two columns in the `schema.transactions` table. The time dimension `ds` provides daily granularity and can be aggregated to weekly or monthly time periods. Additionally, a categorical dimension called `is_bulk_transaction` is specified using a case statement to capture bulk purchases. +In this example, a measure named `order_total` is defined based on the order_total column in the `orders` table. + +The time dimension `metric_time` provides daily granularity and can be aggregated into weekly or monthly time periods. Additionally, a categorical dimension called `is_new_customer` is specified in the `customers` semantic model. ```yaml semantic_models: - - name: transactions - description: "A record for every transaction that takes place. Carts are considered multiple transactions for each SKU." - owners: support@getdbt.com - model: (ref('transactions')) + - name: orders # The name of the semantic model + description: | + A model containing order data. The grain of the table is the order id. + model: ref('orders') #The name of the dbt model and schema defaults: - agg_time_dimension: metric_time - - # --- entities --- - entities: - - name: transaction_id + agg_time_dimension: metric_time + entities: # Entities, which usually correspond to keys in the table. + - name: order_id type: primary - - name: customer_id + - name: customer type: foreign - - name: store_id - type: foreign - - name: product_id - type: foreign - - # --- measures --- - measures: - - name: revenue - description: - expr: price * quantity - agg: sum - - name: quantity - description: Quantity of products sold - expr: quantity - agg: sum - - name: active_customers - description: A count of distinct customers completing transactions expr: customer_id - agg: count_distinct - - # --- dimensions --- - dimensions: + measures: # Measures, which are the aggregations on the columns in the table. + - name: order_total + agg: sum + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. - name: metric_time + expr: cast(ordered_at as date) type: time - expr: date_trunc('day', ts) type_params: time_granularity: day - - name: is_bulk_transaction + - name: customers # The name of the second semantic model + description: > + Customer dimension table. 
The grain of the table is one row per
+      customer.
+    model: ref('customers') #The name of the dbt model and schema
+    defaults:
+      agg_time_dimension: first_ordered_at
+    entities: # Entities, which usually correspond to keys in the table.
+      - name: customer
+        type: primary
+        expr: customer_id
+    dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension.
+      - name: is_new_customer
        type: categorical
-        expr: case when quantity > 10 then true else false end
+        expr: case when first_ordered_at is not null then true else false end
+      - name: first_ordered_at
+        type: time
+        type_params:
+          time_granularity: day
+
```
-
+
-Similarly, you could then add a `products` semantic model on top of the `products` model to incorporate even more dimensions to slice and dice your revenue metric.
-
-Notice the identifiers present in the semantic models `products` and `transactions`. MetricFlow does the heavy-lifting for you by traversing the appropriate join keys to identify the available dimensions to slice and dice your `revenue` metric.
+Similarly, you could then add dimensions like `is_food_order` to your semantic models to give you even more ways to slice and dice your `order_total` metric.
```yaml
semantic_models:
-  - name: products
-    description: A record for every product available through our retail stores.
-    owners: support@getdbt.com
-    model: ref('products')
-
-    # --- identifiers ---
-    entities:
-      - name: product_id
+  - name: orders
+    description: |
+      A model containing order data. The grain of the table is the order id.
+    model: ref('orders') #The name of the dbt model and schema
+    defaults:
+      agg_time_dimension: metric_time
+    entities: # Entities, which usually correspond to keys in the table
+      - name: order_id
        type: primary
-
-    # --- dimensions ---
-    dimensions:
-      - name: category
-        type: categorical
-      - name: brand
-        type: categorical
-      - name: is_perishable
+      - name: customer
+        type: foreign
+        expr: customer_id
+    measures: # Measures, which are the aggregations on the columns in the table.
+      - name: order_total
+        agg: sum
+    dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension.
+      - name: metric_time
+        expr: cast(ordered_at as date)
+        type: time
+        type_params:
+          time_granularity: day
+      - name: is_food_order
        type: categorical
-        expr: |
-          category in ("vegetables", "fruits", "dairy", "deli")
```
-Imagine an even more difficult metric is needed, like the amount of money earned each day by selling perishable goods per active customer. Without MetricFlow the data practitioner's original SQL might look like this:
+Imagine an even more complex metric is needed, like the amount of money earned each day from food orders placed by new customers.
Without MetricFlow the data practitioner's original SQL might look like this:
```sql
select
-    date_trunc(transactions.created_at, 'day') as day
-    , products.category as product_category
-    , sum(transactions.price * transactions.quantity) as revenue
-    , count(distinct customer_id) as active_customers
-    , sum(transactions.price * transactions.quantity)/count(distinct customer_id) as perishable_revenues_per_active_customer
+    date_trunc('day',orders.ordered_at) as day,
+    sum(case when is_food_order = true then order_total else null end) as food_order,
+    sum(orders.order_total) as sum_order_total,
+    sum(case when is_food_order = true then order_total else null end) / sum(orders.order_total) as food_order_pct_of_order_total
from
-    transactions
+    orders
left join
-    products
+    customers
on
-    transactions.product_id = products.product_id
-where
-    products.category in ("vegetables", "fruits", "dairy", "deli")
-group by 1, 2
+    orders.customer_id = customers.customer_id
+where
+    case when customers.first_ordered_at is not null then true else false end = true
+group by 1
```
MetricFlow simplifies the SQL process via metric YAML configurations as seen below. You can also commit them to your git repository to ensure everyone on the data and business teams can see and approve them as the true and only source of information.
```yaml
metrics:
-  - name: perishables_revenue_per_active_customer
-    description: Revenue from perishable goods (vegetables, fruits, dairy, deli) for each active store.
+  - name: food_order_pct_of_order_total
+    description: Food order revenue as a percentage of total order revenue from new customers
+    label: "Food % of Order Total"
    type: ratio
    type_params:
-      numerator: revenue
-      denominator: active_customers
+      numerator: food_order
+      denominator: order_total
      filter: |
-        {{dimension('perishable_goods')}} in ('vegetables',' fruits', 'dairy', 'deli')
+        {{ Dimension('customer__is_new_customer')}} = true
```
@@ -268,7 +265,7 @@ metrics:
How does the Semantic Layer handle joins?
-
MetricFlow builds joins based on the types of keys and parameters that are passed to entities. To better understand how joins are constructed see our documentations on join types.

Rather than capturing arbitrary join logic, MetricFlow captures the types of each identifier and then helps the user to navigate to appropriate joins. This allows us to avoid the construction of fan out and chasm joins as well as generate legible SQL.
+
MetricFlow builds joins based on the types of keys and parameters that are passed to entities. To better understand how joins are constructed, see our documentation on join types.

Rather than capturing arbitrary join logic, MetricFlow captures the types of each identifier and then helps the user to navigate to appropriate joins. This allows us to avoid the construction of fan out and chasm joins as well as generate legible SQL.
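+
+For example, here is a trimmed-down sketch showing only the entity definitions from the `orders` and `customers` semantic models used earlier on this page:
+
+```yaml
+semantic_models:
+  - name: orders
+    entities:
+      - name: order_id   # primary key of the orders model
+        type: primary
+      - name: customer   # foreign key pointing at the customers model
+        type: foreign
+        expr: customer_id
+  - name: customers
+    entities:
+      - name: customer   # primary entity the foreign key above resolves to
+        type: primary
+        expr: customer_id
+```
+
+Because `customer` is a primary entity on `customers` and a foreign entity on `orders`, MetricFlow can join the two models whenever you group a metric such as `order_total` by a dimension like `customer__is_new_customer`, without any join logic being written in the configuration.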
diff --git a/website/docs/docs/build/analyses.md b/website/docs/docs/build/analyses.md index cd74c2e052d..af6fb0320f0 100644 --- a/website/docs/docs/build/analyses.md +++ b/website/docs/docs/build/analyses.md @@ -2,11 +2,12 @@ title: "Analyses" description: "Read this tutorial to learn how to use custom analyses when building in dbt." id: "analyses" +pagination_next: null --- ## Overview -dbt's notion of `models` makes it easy for data teams to version control and collaborate on data transformations. Sometimes though, a certain sql statement doesn't quite fit into the mold of a dbt model. These more "analytical" sql files can be versioned inside of your dbt project using the `analysis` functionality of dbt. +dbt's notion of `models` makes it easy for data teams to version control and collaborate on data transformations. Sometimes though, a certain SQL statement doesn't quite fit into the mold of a dbt model. These more "analytical" SQL files can be versioned inside of your dbt project using the `analysis` functionality of dbt. Any `.sql` files found in the `analyses/` directory of a dbt project will be compiled, but not executed. This means that analysts can use dbt functionality like `{{ ref(...) }}` to select from models in an environment-agnostic way. diff --git a/website/docs/docs/build/build-metrics-intro.md b/website/docs/docs/build/build-metrics-intro.md index a87d4567a2b..cdac51224ed 100644 --- a/website/docs/docs/build/build-metrics-intro.md +++ b/website/docs/docs/build/build-metrics-intro.md @@ -5,51 +5,58 @@ description: "Learn about MetricFlow and build your metrics with semantic models sidebar_label: Build your metrics tags: [Metrics, Semantic Layer, Governance] hide_table_of_contents: true +pagination_next: "docs/build/sl-getting-started" +pagination_prev: null --- -Use MetricFlow in dbt to centrally define your metrics. MetricFlow is a key component of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer) and is responsible for SQL query construction and defining specifications for dbt semantic models and metrics. +Use MetricFlow in dbt to centrally define your metrics. As a key component of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), MetricFlow is responsible for SQL query construction and defining specifications for dbt semantic models and metrics. It uses familiar constructs like semantic models and metrics to avoid duplicative coding, optimize your development workflow, ensure data governance for company metrics, and guarantee consistency for data consumers. -Use familiar constructs like semantic models and metrics to avoid duplicative coding, optimize your development workflow, ensure data governance for company metrics, and guarantee consistency for data consumers. -:::info -MetricFlow is currently available on dbt Core v1.6 beta for [command line (CLI)](/docs/core/about-the-cli) users, with support for dbt Cloud and integrations coming soon. MetricFlow, a BSL package (code is source available), is a new way to define metrics in dbt and will replace the dbt_metrics package. 
+MetricFlow allows you to: +- Intuitively define metrics in your dbt project +- Develop from your preferred environment, whether that's the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), or [dbt Core](/docs/core/installation) +- Use [MetricFlow commands](/docs/build/metricflow-commands) to query and test those metrics in your development environment +- Harness the true magic of the universal dbt Semantic Layer and dynamically query these metrics in downstream tools (Available for dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) accounts only). -To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/). -::: -Before you start, keep the following considerations in mind: -- Define metrics in YAML and query them using the [MetricFlow CLI](/docs/build/metricflow-cli). -- You must be on dbt Core v1.6 beta or higher to use MetricFlow. [Upgrade your dbt version](/docs/core/pip-install#change-dbt-core-versions) to get started. - * Note: Support for dbt Cloud and querying via external integrations coming soon. -- MetricFlow currently only supports Snowflake and Postgres. - * Note: Support for BigQuery, Databricks, and Redshift coming soon. -- dbt Labs is working with [integration partners](https://www.getdbt.com/product/semantic-layer-integrations) to develop updated integrations for the new Semantic Layer, powered by MetricFlow, in addition to introducing other consumption methods like Python and JDBC.

+
-
+ - - + icon="dbt-bit"/> + icon="dbt-bit"/> + icon="dbt-bit"/> + + + + +

diff --git a/website/docs/docs/build/cumulative-metrics.md b/website/docs/docs/build/cumulative-metrics.md
index efdde600635..708045c1f3e 100644
--- a/website/docs/docs/build/cumulative-metrics.md
+++ b/website/docs/docs/build/cumulative-metrics.md
@@ -6,26 +6,77 @@ sidebar_label: Cumulative
tags: [Metrics, Semantic Layer]
---
-Cumulative metrics aggregate a measure over a given window. If no window is specified, the window is considered infinite and accumulates values over all time.
+Cumulative metrics aggregate a measure over a given accumulation window. If no window is specified, the window is considered infinite and accumulates values over all time. You will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics.
-:::info MetricFlow time spine required
+This metric is common for calculating things like weekly active users or month-to-date revenue. The parameters, description, and type for cumulative metrics are:
+
+| Parameter | Description | Type |
+| --------- | ----------- | ---- |
+| `name` | The name of the metric. | Required |
+| `description` | The description of the metric. | Optional |
+| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required |
+| `label` | The value that will be displayed in downstream tools. | Required |
+| `type_params` | The type parameters of the metric. | Required |
+| `measure` | The measure you are referencing. | Required |
+| `window` | The accumulation window, such as 1 month, 7 days, 1 year. This can't be used with `grain_to_date`. | Optional |
+| `grain_to_date` | Sets the accumulation grain, such as `month`, which will accumulate data for one month and then restart at the beginning of the next. This can't be used with `window`. | Optional |
+
+The following displays the complete specification for cumulative metrics, along with an example:
+
+```yaml
+metrics:
+  - name: The metric name # Required
+    description: The metric description # Optional
+    type: cumulative # Required
+    label: The value that will be displayed in downstream tools # Required
+    type_params: # Required
+      measure: The measure you are referencing # Required
+      window: The accumulation window, such as 1 month, 7 days, 1 year. # Optional. Cannot be used with grain_to_date.
+      grain_to_date: Sets the accumulation grain, such as month, which will accumulate data for one month and then restart at the beginning of the next. # Optional. Cannot be used with window.
+
+```
+
+## Limitations
+Cumulative metrics are currently under active development and have the following limitations:
+
+1. You can only use the [`metric_time` dimension](/docs/build/dimensions#time) to query cumulative metrics. If you don't use `metric_time` in the query, the cumulative metric will return incorrect results because it won't perform the time spine join. This means you cannot reference time dimensions other than `metric_time` in the query.
+2. If you use `metric_time` in your query filter but don't include "start_time" and "end_time," cumulative metrics will left-censor the input data. For example, if you query a cumulative metric with a 7-day window with the filter `{{ TimeDimension('metric_time') }} BETWEEN '2023-08-15' AND '2023-08-30'`, the values for `2023-08-15` to `2023-08-20` return missing or incomplete data. This is because we apply the `metric_time` filter to the aggregation input. To avoid this, you must use `start_time` and `end_time` in the query filter, as shown in the example query below.
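+
+For example, for the 7-day `weekly_customers` metric defined later on this page, you can pass explicit bounds so that the full accumulation window is available. This is a sketch of the query pattern; the exact flag spellings depend on the CLI version you're running:
+
+```bash
+# dbt Cloud users
+dbt sl query --metrics weekly_customers --group-by metric_time --start-time '2023-08-22' --end-time '2023-08-30'
+
+# dbt Core users
+mf query --metrics weekly_customers --group-by metric_time --start-time '2023-08-22' --end-time '2023-08-30'
+```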
+ + +## Cumulative metrics example + + +:::tip MetricFlow time spine required You will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics. ::: +Cumulative metrics measure data over a given window and consider the window infinite when no window parameter is passed, accumulating the data over all time. + ```yaml -# Cumulative metrics aggregate a measure over a given window. The window is considered infinite if no window parameter is passed (accumulate the measure over all time) + metrics: -- name: wau_rolling_7 - owners: - - support@getdbt.com - type: cumulative - type_params: - measures: - - distinct_users - #Omitting window will accumulate the measure over all time - window: 7 days + - name: cumulative_order_total + label: Cumulative Order total (All Time) + description: The cumulative value of all orders + type: cumulative + type_params: + measure: order_total + - name: cumulative_order_total_l1m + label: Cumulative Order total (L1M) + description: Trailing 1 month cumulative order amount + type: cumulative + type_params: + measure: order_total + window: 1 month + - name: cumulative_order_total_mtd + label: Cumulative Order total (MTD) + description: The month to date value of all orders + type: cumulative + type_params: + measure: order_total + grain_to_date: month ``` ### Window options @@ -38,28 +89,25 @@ This section details examples of when you specify and don't specify window optio If a window option is specified, the MetricFlow framework applies a sliding window to the underlying measure. -Suppose the underlying measure `distinct_users` is configured as such to reflect a count of distinct users by user_id and user_status. +Suppose the underlying measure `customers` is configured to count the unique customers making orders at the Jaffle shop. ```yaml measures: - - name: distinct_users - description: The number of distinct users creating mql queries - expr: case when user_status in ('PENDING','ACTIVE') then user_id else null end - agg: count_distinct + - name: customers + expr: customer_id + agg: count_distinct + ``` -We can write a cumulative metric `wau_rolling_7` as such: +We can write a cumulative metric `weekly_customers` as such: ``` yaml metrics: - name: wau_rolling_7 - # Define the measure and the window. + - name: weekly_customers # Define the measure and the window. type: cumulative type_params: - measures: - - distinct_users - # the default window is infinity - omitting window will accumulate the measure over all time - window: 7 days + measure: customers + window: 7 days # Setting the window to 7 days since we want to track weekly active ``` From the sample YAML above, note the following: @@ -67,7 +115,7 @@ From the sample YAML above, note the following: * `type`: Specify cumulative to indicate the type of metric. * `type_params`: Specify the measure you want to aggregate as a cumulative metric. You have the option of specifying a `window`, or a `grain to date`. -For example, in the `wau_rolling_7` cumulative metric, MetricFlow takes a sliding 7-day window of relevant users and applies a count distinct function. +For example, in the `weekly_customers` cumulative metric, MetricFlow takes a sliding 7-day window of relevant customers and applies a count distinct function. If you omit the `window`, the measure will accumulate over all time. Otherwise, you can choose from granularities like day, week, quarter, or month, and describe the window using phrases like "7 days" or "1 month." 
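+
+To see the effect of the window, you could query the 7-day metric alongside the all-time `cumulative_order_total` metric defined earlier (a sketch; both metrics need to exist in your project):
+
+```bash
+# dbt Cloud users
+dbt sl query --metrics weekly_customers,cumulative_order_total --group-by metric_time --order metric_time
+
+# dbt Core users
+mf query --metrics weekly_customers,cumulative_order_total --group-by metric_time --order metric_time
+```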
@@ -86,32 +134,32 @@ Suppose you (a subscription-based company for the sake of this example) have an * `event_type`: (integer) a column that populates with +1 to indicate an added subscription, or -1 to indicate a deleted subscription. * `revenue`: (integer) a column that multiplies `event_type` and `subscription_revenue` to depict the amount of revenue added or lost for a specific date. -Using cumulative metrics without specifying a window, you can calculate running totals for metrics like the count of active subscriptions and revenue at any point in time. The following configuration YAML displays creating such cumulative metrics to obtain current revenue or total number of active subscriptions as a cumulative sum: +Using cumulative metrics without specifying a window, you can calculate running totals for metrics like the count of active subscriptions and revenue at any point in time. The following configuration YAML displays creating such cumulative metrics to obtain current revenue or the total number of active subscriptions as a cumulative sum: ```yaml -measures: - - name: revenue - description: Total revenue - agg: sum - expr: revenue - - name: subscription_count - description: Count of active subscriptions - agg: sum +measures: + - name: revenue + description: Total revenue + agg: sum + expr: revenue + - name: subscription_count + description: Count of active subscriptions + agg: sum expr: event_type +metrics: + - name: current_revenue + description: Current revenue + label: Current Revenue + type: cumulative + type_params: + measure: revenue + - name: active_subscriptions + description: Count of active subscriptions + label: Active Subscriptions + type: cumulative + type_params: + measure: subscription_count -metrics: -- name: current_revenue - description: Current revenue - type: cumulative - type_params: - measures: - - revenue -- name: active_subscriptions - description: Count of active subscriptions - type: cumulative - type_params: - measures: - - subscription_count ``` @@ -122,38 +170,32 @@ metrics: You can choose to specify a grain to date in your cumulative metric configuration to accumulate a metric from the start of a grain (such as week, month, or year). When using a window, such as a month, MetricFlow will go back one full calendar month. However, grain to date will always start accumulating from the beginning of the grain, regardless of the latest date of data. -For example, let's consider an underlying measure of `total_revenue.` +For example, let's consider an underlying measure of `order_total.` ```yaml -measures: - - name: total_revenue - description: Total revenue (summed) - agg: sum - expr: revenue + measures: + - name: order_total + agg: sum ``` We can compare the difference between a 1-month window and a monthly grain to date. The cumulative metric in a window approach applies a sliding window of 1 month, whereas the grain to date by month resets at the beginning of each month. 
```yaml
-metrics:
-  - name: revenue_monthly_window #For this metric, we use a window of 1 month
-    description: Monthly revenue using a window of 1 month (think of this as a sliding window of 30 days)
-    type: cumulative
-    type_params:
-      measures:
-        - total_revenue
-      window: 1 month
-```
-
-```yaml
-metrics:
-  - name: revenue_monthly_grain_to_date #For this metric, we use a monthly grain to date
-    description: Monthly revenue using grain to date of 1 month (think of this as a monthly resetting point)
-    type: cumulative
-    type_params:
-      measures:
-        - total_revenue
-      grain_to_date: month
+metrics:
+  - name: cumulative_order_total_l1m #For this metric, we use a window of 1 month
+    label: Cumulative Order total (L1M)
+    description: Trailing 1 month cumulative order amount
+    type: cumulative
+    type_params:
+      measure: order_total
+      window: 1 month
+  - name: cumulative_order_total_mtd #For this metric, we use a monthly grain to date
+    label: Cumulative Order total (MTD)
+    description: The month to date value of all orders
+    type: cumulative
+    type_params:
+      measure: order_total
+      grain_to_date: month
```
### Implementation
diff --git a/website/docs/docs/build/custom-aliases.md b/website/docs/docs/build/custom-aliases.md
index 589d64f8510..b4962aad00a 100644
--- a/website/docs/docs/build/custom-aliases.md
+++ b/website/docs/docs/build/custom-aliases.md
@@ -34,6 +34,19 @@ select * from ...
+Or in a `schema.yml` file.
+
+
+
+```yaml
+models:
+  - name: ga_sessions
+    config:
+      alias: sessions
+```
+
+
+
When referencing the `ga_sessions` model above from a different model, use the `ref()` function with the model's _filename_ as usual. For example:
@@ -114,13 +127,11 @@ The default implementation of `generate_alias_name` simply uses the supplied `al
-
-### Managing different behaviors across packages
+### Dispatch macro - SQL alias management for databases and dbt packages
-See docs on macro `dispatch`: ["Managing different global overrides across packages"](/reference/dbt-jinja-functions/dispatch)
+See docs on macro `dispatch`: ["Managing different global overrides across packages"](/reference/dbt-jinja-functions/dispatch#managing-different-global-overrides-across-packages)
-
### Caveats
@@ -128,20 +139,23 @@ See docs on macro `dispatch`: ["Managing different global overrides across packa
Using aliases, it's possible to accidentally create models with ambiguous identifiers. Given the following two models, dbt would attempt to create two views with _exactly_ the same names in the database (ie. `sessions`):
-```sql
--- models/snowplow_sessions.sql
+
+```sql
{{ config(alias='sessions') }}
select * from ...
```
+
-```sql
--- models/sessions.sql
+
+```sql
select * from ...
```
+
+
Whichever one of these models runs second would "win", and generally, the output of dbt would not be what you would expect. To avoid this failure mode, dbt will check if your model names and aliases are ambiguous in nature. If they are, you will be presented with an error message like this:
```
@@ -168,8 +182,8 @@ New in v1.5
**Related documentation:**
-- [Model versions](govern/model-versions)
-- [`versions`](resource-properties/versions#alias)
+- [Model versions](/docs/collaborate/govern/model-versions)
+- [`versions`](/reference/resource-properties/versions#alias)
By default, dbt will create versioned models with the alias `<model_name>_v<v>`, where `<v>` is that version's unique identifier. You can customize this behavior just like for non-versioned models by configuring a custom `alias` or re-implementing the `generate_alias_name` macro.
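+
+For example, here is a sketch of pinning a custom alias to one version of a hypothetical versioned model (the model and alias names are illustrative):
+
+```yaml
+models:
+  - name: dim_customers
+    latest_version: 2
+    versions:
+      - v: 2                        # built as dim_customers_v2 by default
+      - v: 1
+        config:
+          alias: dim_customers_legacy   # overrides the default dim_customers_v1 alias
+```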
diff --git a/website/docs/docs/build/custom-databases.md b/website/docs/docs/build/custom-databases.md index 300fd3147f1..dd54d6998e8 100644 --- a/website/docs/docs/build/custom-databases.md +++ b/website/docs/docs/build/custom-databases.md @@ -54,8 +54,6 @@ select * from ... ### generate_database_name -New in v0.16.0 - The database name generated for a model is controlled by a macro called `generate_database_name`. This macro can be overridden in a dbt project to change how dbt generates model database names. This macro works similarly to the [generate_schema_name](/docs/build/custom-schemas#advanced-custom-schema-configuration) macro. To override dbt's database name generation, create a macro named `generate_database_name` in your own dbt project. The `generate_database_name` macro accepts two arguments: diff --git a/website/docs/docs/build/custom-schemas.md b/website/docs/docs/build/custom-schemas.md index b8dbb9a0846..b20d4130725 100644 --- a/website/docs/docs/build/custom-schemas.md +++ b/website/docs/docs/build/custom-schemas.md @@ -1,6 +1,7 @@ --- title: "Custom schemas" id: "custom-schemas" +pagination_next: "docs/build/custom-databases" --- By default, all dbt models are built in the schema specified in your target. In dbt projects with lots of models, it may be useful to instead build some models in schemas other than your target schema – this can help logically group models together. @@ -103,7 +104,7 @@ To modify how dbt generates schema names, you should add a macro named `generate If you're modifying how dbt generates schema names, don't just replace ```{{ default_schema }}_{{ custom_schema_name | trim }}``` with ```{{ custom_schema_name | trim }}``` in the ```generate_schema_name``` macro. -Removing ```{{ default_schema }}``` causes developers to overriding each other's models when custom schemas are defined. This can also cause issues during development and continuous integration (CI). +If you remove ```{{ default_schema }}```, it causes developers to override each other's models if they create their own custom schemas. This can also cause issues during development and continuous integration (CI). ❌ The following code block is an example of what your code _should not_ look like: ```sql @@ -180,13 +181,6 @@ The following context methods _are_ available in the `generate_schema_name` macr ### Which vars are available in generate_schema_name? - - -Variable semantics have changed in dbt v0.17.0. See the [migration guide](/guides/migration/versions) -for more information on these changes. - - - Globally-scoped variables and variables defined on the command line with [--vars](/docs/build/project-variables) are accessible in the `generate_schema_name` context. diff --git a/website/docs/docs/build/custom-target-names.md b/website/docs/docs/build/custom-target-names.md index 4e14f36b784..ac7036de572 100644 --- a/website/docs/docs/build/custom-target-names.md +++ b/website/docs/docs/build/custom-target-names.md @@ -2,7 +2,7 @@ title: "Custom target names" id: "custom-target-names" description: "You can define a custom target name for any dbt Cloud job to correspond to settings in your dbt project." 
-
+pagination_next: null
---
## dbt Cloud Scheduler
diff --git a/website/docs/docs/build/derived-metrics.md b/website/docs/docs/build/derived-metrics.md
index 0ca14d1c6f2..fc7961bbe7f 100644
--- a/website/docs/docs/build/derived-metrics.md
+++ b/website/docs/docs/build/derived-metrics.md
@@ -6,35 +6,154 @@ sidebar_label: Derived
tags: [Metrics, Semantic Layer]
---
-Derived metrics in MetricFlow refer to metrics that are created by defining an expression using other metrics. Derived metrics allow for calculations on top of metrics. For example, you can define a metric called "Net Sales Per User" by using other metrics in the calculation.
+In MetricFlow, derived metrics are metrics created by defining an expression using other metrics. They enable you to perform calculations with existing metrics. This is helpful for combining metrics and doing math functions on aggregated columns, like creating a profit metric.
+
+ The parameters, description, and type for derived metrics are:
+
+| Parameter | Description | Type |
+| --------- | ----------- | ---- |
+| `name` | The name of the metric. | Required |
+| `description` | The description of the metric. | Optional |
+| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required |
+| `label` | The value that will be displayed in downstream tools. | Required |
+| `type_params` | The type parameters of the metric. | Required |
+| `expr` | The derived expression. | Required |
+| `metrics` | The list of metrics used in the derived metric. | Required |
+| `alias` | Optional alias for the metric that you can use in the expr. | Optional |
+| `filter` | Optional filter to apply to the metric. | Optional |
+| `offset_window` | Set the period for the offset window, such as 1 month. This will return the value of the metric one month from the metric time. | Optional |
+
+The following displays the complete specification for derived metrics, along with an example.
```yaml
metrics:
-  - name: net_sales_per_user
+  - name: the metric name # Required
+    description: the metric description # Optional
+    type: derived # Required
+    label: The value that will be displayed in downstream tools # Required
+    type_params: # Required
+      expr: the derived expression # Required
+      metrics: # The list of metrics used in the derived metric # Required
+        - name: the name of the metric. Must reference a metric you have already defined # Required
+          alias: optional alias for the metric that you can use in the expr # Optional
+          filter: optional filter to apply to the metric # Optional
+          offset_window: set the period for the offset window, such as 1 month. This will return the value of the metric one month from the metric time. # Optional
+```
+
+## Derived metrics example
+
+```yaml
+metrics:
+  - name: order_gross_profit
+    description: Gross profit from each order.
+    type: derived
+    label: Order Gross Profit
+    type_params:
+      expr: revenue - cost
+      metrics:
+        - name: order_total
+          alias: revenue
+        - name: order_cost
+          alias: cost
+  - name: food_order_gross_profit
+    label: Food Order Gross Profit
+    description: "The gross profit for each food order."
type: derived type_params: - expr: gross_sales - cogs / active_users + expr: revenue - cost metrics: - - name: gross_sales # these are all metrics (can be a derived metric, meaning building a derived metric with derived metrics) - - name: cogs - - name: users - filter: | # Optional additional constraint - {{dimension('filter')}} is_active - alias: active_users # Optional alias to use in the expr + - name: order_total + alias: revenue + filter: | + {{ Dimension('order__is_food_order') }} = True + - name: order_cost + alias: cost + filter: | + {{ Dimension('order__is_food_order') }} = True + - name: order_total_growth_mom + description: "Percentage growth of orders total completed to 1 month ago" + type: derived + label: Order Total Growth % M/M + type_params: + expr: (order_total - order_total_prev_month)*100/order_total_prev_month + metrics: + - name: order_total + - name: order_total + offset_window: 1 month + alias: order_total_prev_month ``` ## Derived metric offset -You may want to use an offset value of a metric in the definition of a derived metric. For example, if you define retention rate as (active customers at the end of the month/active customers at the beginning of the month)-1 you can model this using a derived metric with an offset. +To perform calculations using a metric's value from a previous time period, you can add an offset parameter to a derived metric. For example, if you want to calculate period-over-period growth or track user retention, you can use this metric offset. + +**Note:** You must include the [`metric_time` dimension](/docs/build/dimensions#time) when querying a derived metric with an offset window. + +The following example displays how you can calculate monthly revenue growth using a 1-month offset window: ```yaml -metrics: -- name: user_retention - type: derived +- name: customer_retention + description: Percentage of customers that are active now and those active 1 month ago + label: customer_retention type_params: - expr: active_customers/active_customers_t1m + expr: (active_customers/ active_customers_prev_month) metrics: - - name: active_customers # these are all metrics (can be a derived metric, meaning building a derived metric with derived metrics) + - name: active_customers + alias: current_active_customers - name: active_customers offset_window: 1 month - alias: active_customers_t1m + alias: active_customers_prev_month +``` + +### Offset windows and granularity + +You can query any granularity and offset window combination. The following example queries a metric with a 7-day offset and a monthly grain: + +```yaml +- name: d7_booking_change + description: Difference between bookings now and 7 days ago + type: derived + label: d7 Bookings Change + type_params: + expr: bookings - bookings_7_days_ago + metrics: + - name: bookings + alias: current_bookings + - name: bookings + offset_window: 7 days + alias: bookings_7_days_ago +``` + +When you run the query `dbt sl query --metrics d7_booking_change --group-by metric_time__month` for the metric, here's how it's calculated. For dbt Core, you can use the `mf query` prefix. + +1. We retrieve the raw, unaggregated dataset with the specified measures and dimensions at the smallest level of detail, which is currently 'day'. +2. Then, we perform an offset join on the daily dataset, followed by performing a date trunc and aggregation to the requested granularity. 
+ For example, to calculate `d7_booking_change` for July 2017: + - First, we sum up all the booking values for each day in July to calculate the bookings metric. + - The following table displays the range of days that make up this monthly aggregation. + +| | Orders | Metric_time | +| - | ---- | -------- | +| | 330 | 2017-07-31 | +| | 7030 | 2017-07-30 to 2017-07-02 | +| | 78 | 2017-07-01 | +| Total | 7438 | 2017-07-01 | + +3. Next, we calculate July's bookings with a 7-day offset. The following table displays the range of days that make up this monthly aggregation. Note that the month begins 7 days later (offset by 7 days) on 2017-07-24. + +| | Orders | Metric_time | +| - | ---- | -------- | +| | 329 | 2017-07-24 | +| | 6840 | 2017-07-23 to 2017-06-30 | +| | 83 | 2017-06-24 | +| Total | 7252 | 2017-07-01 | + +4. Lastly, we calculate the derived metric and return the final result set: + +```bash +bookings - bookings_7_days_ago would be compile as 7438 - 7252 = 186. +``` + +| d7_booking_change | metric_time__month | +| ----------------- | ------------------ | +| 186 | 2017-07-01 | diff --git a/website/docs/docs/build/dimensions.md b/website/docs/docs/build/dimensions.md index d154141791f..b8679fe11b0 100644 --- a/website/docs/docs/build/dimensions.md +++ b/website/docs/docs/build/dimensions.md @@ -8,20 +8,21 @@ tags: [Metrics, Semantic Layer] Dimensions is a way to group or filter information based on categories or time. It's like a special label that helps organize and analyze data. -In a data platform, dimensions is part of a larger structure called a semantic model. It's created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures), and used to add more details to your data that can't be easily added up or combined. In SQL, dimensions is typically included in the `dimensions` clause of your SQL query. +In a data platform, dimensions is part of a larger structure called a semantic model. It's created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures), and used to add more details to your data that can't be easily added up or combined. In SQL, dimensions is typically included in the `group by` clause of your SQL query. + -All dimensions require a `name`, `type` and in most cases, an `expr` parameter. +All dimensions require a `name`, `type` and in some cases, an `expr` parameter. -| Name | Parameter | Field type | -| --- | --- | --- | +| Parameter | Description | Type | +| --------- | ----------- | ---- | | `name` | Refers to the name of the group that will be visible to the user in downstream tools. It can also serve as an alias if the column name or SQL query reference is different and provided in the `expr` parameter.

Dimension names should be unique within a semantic model, but they can be non-unique across different models as MetricFlow uses [joins](/docs/build/join-logic) to identify the right dimension. | Required | -| `type` | Specifies the type of group created in the semantic model. There are three types:

— Categorical: Group rows in a table by categories like geography, product type, color, and so on.
— Time: Point to a date field in the data platform, and must be of type TIMESTAMP or equivalent in the data platform engine.
— Slowly-changing dimensions: Analyze metrics over time and slice them by groups that change over time, like sales trends by a customer's country. | Required | +| `type` | Specifies the type of group created in the semantic model. There are three types:

- **Categorical**: Group rows in a table by categories like geography, color, and so on.
- **Time**: Point to a date field in the data platform. Must be of type TIMESTAMP or equivalent in the data platform engine.
- **Slowly-changing dimensions**: Analyze metrics over time and slice them by groups that change over time, like sales trends by a customer's country. | Required | | `type_params` | Specific type params such as if the time is primary or used as a partition | Required | -| `description` | Description of the dimension | Optional | +| `description` | A clear description of the dimension | Optional | | `expr` | Defines the underlying column or SQL query for a dimension. If no `expr` is specified, MetricFlow will use the column with the same name as the group. You can use column name itself to input a SQL expression. | Optional | Refer to the following for the complete specification for dimensions: @@ -60,13 +61,30 @@ semantic_models: expr: case when quantity > 10 then true else false end ``` +MetricFlow requires that all dimensions have a primary entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. + +```yaml +semantic_model: + name: bookings_monthly_source + description: bookings_monthly_source + defaults: + agg_time_dimension: ds + model: ref('bookings_monthly_source') + measures: + - name: bookings_monthly + agg: sum + create_metric: true + primary_entity: booking_id +``` + ## Dimensions types -Dimensions have three types. This section further explains the definitions and provides examples. +Dimensions have 2 types. This section further explains the definitions and provides examples. -1. [Categorical](#categorical) -1. [Time](#time) -1. [Slowly changing](#scd-type-ii) +- [Dimensions types](#dimensions-types) + - [Categorical](#categorical) + - [Time](#time) + - [SCD Type II](#scd-type-ii) ### Categorical @@ -81,62 +99,44 @@ dimensions: ### Time -Time has additional parameters specified under the `type_params` section. :::tip use datetime data type if using BigQuery To use BigQuery as your data platform, time dimensions columns need to be in the datetime data type. If they are stored in another type, you can cast them to datetime using the `expr` property. Time dimensions are used to group metrics by different levels of time, such as day, week, month, quarter, and year. MetricFlow supports these granularities, which can be specified using the `time_granularity` parameter. ::: - +Time has additional parameters specified under the `type_params` section. When you query one or more metrics in MetricFlow using the CLI, the default time dimension for a single metric is the aggregation time dimension, which you can refer to as `metric_time` or use the dimensions' name. - +You can use multiple time groups in separate metrics. For example, the `users_created` metric uses `created_at`, and the `users_deleted` metric uses `deleted_at`: -To specify the default time dimensions for a measure or metric in MetricFlow, set the `is_primary` parameter to True. If your semantic model has multiple time dimensions, the non-primary ones should have `is_primary` set to False. To assign non-primary time dimensions to a measure, use the `agg_time_dimension` parameter and refer to the time dimensions defined in the section. -In the provided example, the semantic model has two-time groups, `created_at` and `deleted_at`, with `created_at` being the primary time dimension through `is_primary: True`. 
The `users_created` measure defaults to the primary time dimensions, while the `users_deleted` measure uses `deleted_at` as its time group. +```bash +# dbt Cloud users +dbt sl query --metrics users_created,users_deleted --dimensions metric_time --order metric_time -```yaml -dimensions: - - name: created_at - type: time - expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table - is_partition: True - type_params: - is_primary: True - time_granularity: day - - name: deleted_at - type: time - expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table - is_partition: True - type_params: - is_primary: False - time_granularity: day - -measures: - - name: users_deleted - expr: 1 - agg: sum - agg_time_dimension: deleted_at - - name: users_created - expr: 1 - agg: sum +# dbt Core users +mf query --metrics users_created,users_deleted --dimensions metric_time --order metric_time ``` -When querying one or more metrics in MetricFlow using the CLI, the default time dimension for a single metric is the primary time dimension, which can be referred to as metric_time or the dimensions' name. Multiple time groups can be used in separate metrics, such as users_created which uses created_at, and users_deleted which uses deleted_at. - ``` - mf query --metrics users_created,users_deleted --dimensions metric_time --order metric_time - ``` +You can set `is_partition` for time or categorical dimensions to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation detail (like daily, weekly, and so on): - + - + -`time_granularity` specifies the smallest level of detail that a measure or metric should be reported at, such as daily, weekly, monthly, quarterly, or yearly. Different granularity options are available, and each metric must have a specified granularity. For example, a metric that is specified with weekly granularity couldn't be aggregated to a daily grain. +Use `is_partition: True` to show that a dimension exists over a specific time window. For example, a date-partitioned dimensional table. When you query metrics from different tables, the dbt Semantic Layer uses this parameter to ensure that the correct dimensional values are joined to measures. -The current options for time granularity are day, week, month, quarter, and year. +You can also use `is_partition` for [categorical](#categorical) dimensions as well. -Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the highest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. +MetricFlow enables metric aggregation during query time. For example, you can aggregate the `messages_per_month` measure. 
If you originally had a `time_granularity` for the time dimensions `metric_time`, you can specify a yearly granularity for aggregation in your query: + +```bash +# dbt Cloud users +dbt sl query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year + +# dbt Core users +mf query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year +``` ```yaml dimensions: @@ -145,14 +145,12 @@ dimensions: expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table is_partition: True type_params: - is_primary: True time_granularity: day - name: deleted_at type: time expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table is_partition: True type_params: - is_primary: False time_granularity: day measures: @@ -167,16 +165,13 @@ measures: - - -Use `is_partition: True` to indicate that a dimension exists over a specific time window. For example, a date-partitioned dimensional table. When you query metrics from different tables, the Semantic Layer will use this parameter to ensure that the correct dimensional values are joined to measures. + -In addition, MetricFlow allows for easy aggregation of metrics at query time. For example, you can aggregate the `messages_per_month` measure, where the original `time_granularity` of the time dimensions `metrics_time`, at a yearly granularity by specifying it in the query in the CLI. +`time_granularity` specifies the smallest level of detail that a measure or metric should be reported at, such as daily, weekly, monthly, quarterly, or yearly. Different granularity options are available, and each metric must have a specified granularity. For example, a metric that is specified with weekly granularity couldn't be aggregated to a daily grain. -``` -mf query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year -``` +The current options for time granularity are day, week, month, quarter, and year. +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the highest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. ```yaml dimensions: @@ -185,20 +180,18 @@ dimensions: expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table is_partition: True type_params: - is_primary: True time_granularity: day - name: deleted_at type: time expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table is_partition: True type_params: - is_primary: False time_granularity: day measures: - name: users_deleted expr: 1 - agg: sum + agg: sum agg_time_dimension: deleted_at - name: users_created expr: 1 @@ -209,7 +202,6 @@ measures: - ### SCD Type II :::caution @@ -231,7 +223,7 @@ The following basic structure of an SCD Type II data platform table is supported **Note**: The SCD dimensions table must have `valid_to` and `valid_from` columns. -This is an example of SQL code that shows how a sample metric called `num_events` is joined with versioned dimensions data (stored in a table called `scd_dimensions`) using a natural key made up of the `entity_key` and `timestamp` columns. 
+This is an example of SQL code that shows how a sample metric called `num_events` is joined with versioned dimensions data (stored in a table called `scd_dimensions`) using a primary key made up of the `entity_key` and `timestamp` columns. ```sql @@ -289,7 +281,7 @@ semantic_models: entities: - name: sales_person - type: natural + type: primary expr: sales_person_id ``` @@ -363,7 +355,11 @@ In the sales tier example, if sales_person_id 456 is Tier 2 from 2022-03-08 onwa The following command or code represents how to return the count of transactions generated by each sales tier per month: -``` +```bash +# dbt Cloud users +dbt sl query --metrics transactions --dimensions metric_time__month,sales_person__tier --order metric_time__month --order sales_person__tier + +# dbt Core users mf query --metrics transactions --dimensions metric_time__month,sales_person__tier --order metric_time__month --order sales_person__tier ``` diff --git a/website/docs/docs/build/enhance-your-code.md b/website/docs/docs/build/enhance-your-code.md new file mode 100644 index 00000000000..5f2d48f6f5a --- /dev/null +++ b/website/docs/docs/build/enhance-your-code.md @@ -0,0 +1,38 @@ +--- +title: "Enhance your code" +description: "Learn how you can enhance your code" +pagination_next: "docs/build/project-variables" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + + +
\ No newline at end of file diff --git a/website/docs/docs/build/enhance-your-models.md b/website/docs/docs/build/enhance-your-models.md new file mode 100644 index 00000000000..46e7fa74353 --- /dev/null +++ b/website/docs/docs/build/enhance-your-models.md @@ -0,0 +1,23 @@ +--- +title: "Enhance your models" +description: "Learn how you can enhance your models" +pagination_next: "docs/build/materializations" +pagination_prev: null +--- + +
+ + + + + +
+
\ No newline at end of file diff --git a/website/docs/docs/build/exposures.md b/website/docs/docs/build/exposures.md index f58903a9726..65c0792e0a0 100644 --- a/website/docs/docs/build/exposures.md +++ b/website/docs/docs/build/exposures.md @@ -4,13 +4,6 @@ sidebar_label: "Exposures" id: "exposures" --- - - -* **v0.18.1**: Exposures are new! -* **v0.20.0**: Exposures support `tags` and `meta` properties - - - Exposures make it possible to define and describe a downstream use of your dbt project, such as in a dashboard, application, or data science pipeline. By defining exposures, you can then: - run, test, and list resources that feed into your exposure - populate a dedicated page in the auto-generated [documentation](/docs/collaborate/documentation) site with context relevant to data consumers diff --git a/website/docs/docs/build/groups.md b/website/docs/docs/build/groups.md index aa33db07ccc..d4fda045277 100644 --- a/website/docs/docs/build/groups.md +++ b/website/docs/docs/build/groups.md @@ -1,6 +1,6 @@ --- title: "Add groups to your DAG" -sidebar_title: "Groups" +sidebar_label: "Groups" id: "groups" description: "When you define groups in dbt projects, you turn implicit relationships into an explicit grouping." keywords: @@ -19,7 +19,7 @@ This functionality is new in v1.5. ## About groups -A group is a collection of nodes within a dbt DAG. Groups are named, and every group has an `owner`. They enable intentional collaboration within and across teams by restricting [access to private](/reference/resource-properties/access) models. +A group is a collection of nodes within a dbt DAG. Groups are named, and every group has an `owner`. They enable intentional collaboration within and across teams by restricting [access to private](/reference/resource-configs/access) models. Group members may include models, tests, seeds, snapshots, analyses, and metrics. (Not included: sources and exposures.) Each node may belong to only one group. @@ -94,7 +94,7 @@ select ... ### Referencing a model in a group -By default, all models within a group have the `protected` [access modifier](/reference/resource-properties/access). This means they can be referenced by downstream resources in _any_ group in the same project, using the [`ref`](/reference/dbt-jinja-functions/ref) function. If a grouped model's `access` property is set to `private`, only resources within its group can reference it. +By default, all models within a group have the `protected` [access modifier](/reference/resource-configs/access). This means they can be referenced by downstream resources in _any_ group in the same project, using the [`ref`](/reference/dbt-jinja-functions/ref) function. If a grouped model's `access` property is set to `private`, only resources within its group can reference it. 
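+
+As a sketch (the group, owner, and model names are illustrative), restricting a model to its group looks like this:
+
+```yaml
+groups:
+  - name: finance
+    owner:
+      name: Finance Team
+      email: finance@example.com
+
+models:
+  - name: fct_revenue_internal
+    access: private   # only resources in the finance group can ref() this model
+    group: finance
+```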
diff --git a/website/docs/docs/build/hooks-operations.md b/website/docs/docs/build/hooks-operations.md index 1abc5657bad..85378498a36 100644 --- a/website/docs/docs/build/hooks-operations.md +++ b/website/docs/docs/build/hooks-operations.md @@ -4,6 +4,8 @@ description: "Read this tutorial to learn how to use hooks and operations when b id: "hooks-operations" --- +import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md'; + ## Related documentation * [pre-hook & post-hook](/reference/resource-configs/pre-hook-post-hook) * [on-run-start & on-run-end](/reference/project-configs/on-run-start-on-run-end) @@ -33,8 +35,8 @@ dbt provides hooks and operations so you can version control and execute these s Hooks are snippets of SQL that are executed at different times: * `pre-hook`: executed _before_ a model, seed or snapshot is built. * `post-hook`: executed _after_ a model, seed or snapshot is built. - * `on-run-start`: executed at the _start_ of `dbt run`, `dbt test`, `dbt seed` or `dbt snapshot` - * `on-run-end`: executed at the _end_ of `dbt run`, `dbt test`, `dbt seed` or `dbt snapshot` + * `on-run-start`: executed at the _start_ of + * `on-run-end`: executed at the _end_ of Hooks are a more-advanced capability that enable you to run custom SQL, and leverage database-specific actions, beyond what dbt makes available out-of-the-box with standard materializations and configurations. @@ -68,127 +70,6 @@ You can use hooks to provide database-specific functionality not available out-o -
- - - -### Examples using hooks - -Here's a minimal example of using hooks to grant privileges. For more information, see [`on-run-start` & `on-run-end` hooks](/reference/project-configs/on-run-start-on-run-end) and [`pre-hook` & `post-hook`](/reference/resource-configs/pre-hook-post-hook) reference sections. - - - -```yml -on-run-end: - - "grant usage on {{ target.schema }} to role reporter" - -models: - +post-hook: - - "grant select on {{ this }} to role reporter" - -``` - - - -You can also apply the `post-hook` to individual models using a `config` block: - - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to role reporter" - ] -) }} - -select ... - -``` - - - -You should use database-specific syntax when appropriate: - - - -
- - - -```sql -{{ config( - post_hook=[ - 'grant `roles/bigquery.dataViewer` on {{ this.type }} {{ this }} to "user:someone@yourcompany.com"' - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to `someone@yourcompany.com`" - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to reporter" - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to role reporter" - ] -) }} - -select ... - -``` - - - -
- -
-
### Calling a macro in a hook diff --git a/website/docs/docs/build/incremental-models.md b/website/docs/docs/build/incremental-models.md index dd20ca36a53..3a597499f04 100644 --- a/website/docs/docs/build/incremental-models.md +++ b/website/docs/docs/build/incremental-models.md @@ -79,14 +79,6 @@ A `unique_key` enables updating existing rows instead of just appending new rows Not specifying a `unique_key` will result in append-only behavior, which means dbt inserts all rows returned by the model's SQL into the preexisting target table without regard for whether the rows represent duplicates. - - -The optional `unique_key` parameter specifies a field that can uniquely identify each row within your model. You can define `unique_key` in a configuration block at the top of your model. If your model doesn't contain a single field that is unique, but rather a combination of columns, we recommend that you create a single column that can serve as a unique identifier (by concatenating and hashing those columns), and pass it into your model's configuration. - - - - - The optional `unique_key` parameter specifies a field (or combination of fields) that define the grain of your model. That is, the field(s) identify a single unique row. You can define `unique_key` in a configuration block at the top of your model, and it can be a single column name or a list of column names. The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](/terms/surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). @@ -101,8 +93,6 @@ When you pass a list in this way, please ensure that each column does not contai Alternatively, you can define a single-column [surrogate key](/terms/surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). ::: - - When you define a `unique_key`, you'll see this behavior for each row of "new" data returned by your dbt model: * If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. The exact mechanics of how that update/replace takes place will vary depending on your database, [incremental strategy](#about-incremental_strategy), and [strategy specific configs](#strategy-specific-configs). @@ -395,12 +385,12 @@ models: cluster_by: ['session_start'] incremental_strategy: merge # this limits the scan of the existing table to the last 7 days of data - incremental_predicates: ["DBT_INTERNAL_DEST.session_start > datediff(day, -7, current_date)"] + incremental_predicates: ["DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date)"] # `incremental_predicates` accepts a list of SQL statements. # `DBT_INTERNAL_DEST` and `DBT_INTERNAL_SOURCE` are the standard aliases for the target table and temporary table, respectively, during an incremental run using the merge strategy. 
``` -Alternatively, here are the same same configurations configured within a model file: +Alternatively, here are the same configurations configured within a model file: ```sql -- in models/my_incremental_model.sql @@ -412,7 +402,7 @@ Alternatively, here are the same same configurations configured within a model f cluster_by = ['session_start'], incremental_strategy = 'merge', incremental_predicates = [ - "DBT_INTERNAL_DEST.session_start > datediff(day, -7, current_date)" + "DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date)" ] ) }} @@ -430,7 +420,7 @@ merge into DBT_INTERNAL_DEST DBT_INTERNAL_DEST.id = DBT_INTERNAL_SOURCE.id and -- custom predicate: limits data scan in the "old" data / existing table - DBT_INTERNAL_DEST.session_start > datediff(day, -7, current_date) + DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date) when matched then update ... when not matched then insert ... ``` diff --git a/website/docs/docs/build/jinja-macros.md b/website/docs/docs/build/jinja-macros.md index 538a3a5e4c6..135db740f75 100644 --- a/website/docs/docs/build/jinja-macros.md +++ b/website/docs/docs/build/jinja-macros.md @@ -27,7 +27,7 @@ Jinja can be used in any SQL in a dbt project, including [models](/docs/build/sq :::info Ready to get started with Jinja and macros? -Check out the [tutorial on using Jinja](/guides/advanced/using-jinja) for a step-by-step example of using Jinja in a model, and turning it into a macro! +Check out the [tutorial on using Jinja](/guides/using-jinja) for a step-by-step example of using Jinja in a model, and turning it into a macro! ::: @@ -76,7 +76,7 @@ You can recognize Jinja based on the delimiters the language uses, which we refe When used in a dbt model, your Jinja needs to compile to a valid query. To check what SQL your Jinja compiles to: * **Using dbt Cloud:** Click the compile button to see the compiled SQL in the Compiled SQL pane -* **Using the dbt CLI:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once. +* **Using dbt Core:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once. ### Macros [Macros](/docs/build/jinja-macros) in Jinja are pieces of code that can be reused multiple times – they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models. Macros are defined in `.sql` files, typically in your `macros` directory ([docs](/reference/project-configs/macro-paths)). @@ -126,7 +126,7 @@ from app_data.payments ### Using a macro from a package -A number of useful macros have also been grouped together into [packages](docs/build/packages) — our most popular package is [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). +A number of useful macros have also been grouped together into [packages](/docs/build/packages) — our most popular package is [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). 
After installing a package into your project, you can use any of the macros in your own project — make sure you qualify the macro by prefixing it with the [package name](/reference/dbt-jinja-functions/project_name): diff --git a/website/docs/docs/build/join-logic.md b/website/docs/docs/build/join-logic.md index 96426424c87..9039822c9fd 100644 --- a/website/docs/docs/build/join-logic.md +++ b/website/docs/docs/build/join-logic.md @@ -114,7 +114,6 @@ semantic_models: - name: metric_time type: time type_params: - is_primary: true - name: user_signup entities: - name: user_id diff --git a/website/docs/docs/build/materializations.md b/website/docs/docs/build/materializations.md index 70c7878bd69..8846f4bb0c5 100644 --- a/website/docs/docs/build/materializations.md +++ b/website/docs/docs/build/materializations.md @@ -2,15 +2,17 @@ title: "Materializations" description: "Read this tutorial to learn how to use materializations when building in dbt." id: "materializations" +pagination_next: "docs/build/incremental-models" --- ## Overview -Materializations are strategies for persisting dbt models in a warehouse. There are four types of materializations built into dbt. They are: +Materializations are strategies for persisting dbt models in a warehouse. There are five types of materializations built into dbt. They are: - - - incremental - ephemeral +- materialized view ## Configuring materializations @@ -67,8 +69,8 @@ When using the `view` materialization, your model is rebuilt as a view on each r * **Pros:** No additional data is stored, views on top of source data will always have the latest records in them. * **Cons:** Views that perform a significant transformation, or are stacked on top of other views, are slow to query. * **Advice:** - * Generally start with views for your models, and only change to another materialization when you're noticing performance problems. - * Views are best suited for models that do not do significant transformation, e.g. renaming, recasting columns. + * Generally start with views for your models, and only change to another materialization when you notice performance problems. + * Views are best suited for models that do not do significant transformation, e.g. renaming, or recasting columns. ### Table When using the `table` materialization, your model is rebuilt as a on each run, via a `create table as` statement. @@ -82,7 +84,7 @@ When using the `table` materialization, your model is rebuilt as a -Options: - --search TEXT Filter available metrics by this search term - --show-all-dimensions Show all dimensions associated with a metric. - --help Show this message and exit. -``` - -## List dimensions - -This command lists all unique dimensions for a metric or multiple metrics. It displays only common dimensions when querying multiple metrics: - -```bash -mf list dimensions --metrics -Options: - --metrics SEQUENCE List dimensions by given metrics (intersection). Ex. - --metrics bookings,messages - --help Show this message and exit. 
-``` - -## List dimension-values - -This command lists all dimension values with the corresponding metric: - -```bash -mf list dimension-values --metrics --dimension -Options: - --dimension TEXT Dimension to query values from [required] - --metrics SEQUENCE Metrics that are associated with the dimension - [required] - --end-time TEXT Optional iso8601 timestamp to constraint the end time of - the data (inclusive) - --start-time TEXT Optional iso8601 timestamp to constraint the start time - of the data (inclusive) - --help Show this message and exit. -``` -## List entities - -This command lists all unique entities: - -```bash -mf list entities --metrics -Options: - --metrics SEQUENCE List entities by given metrics (intersection). Ex. - --metrics bookings,messages - --help Show this message and exit. -``` - -## Validate-configs - -This command performs validations against the defined semantic model configurations: - -```bash -mf validate-configs -Options: - --dw-timeout INTEGER Optional timeout for data warehouse - validation steps. Default None. - --skip-dw If specified, skips the data warehouse - validations - --show-all If specified, prints warnings and future- - errors - --verbose-issues If specified, prints any extra details - issues might have - --semantic-validation-workers INTEGER - Optional. Uses the number of workers - specified to run the semantic validations. - Should only be used for exceptionally large - configs - --help Show this message and exit. -``` - -## Health checks - -This command performs a health check against the data platform you provided in the configs: - -```bash -mf health-checks -``` - -## Tutorial - -Follow the dedicated MetricFlow tutorial to help you get started: - -```bash -mf tutorial -``` - -## Query - -Create a new query with MetricFlow, execute that query against the user's data platform, and return the result: - -```bash -mf query --metrics --group-by -Options: - --metrics SEQUENCE Metrics to query for: syntax is --metrics bookings - or for multiple metrics --metrics bookings,messages - --group-by SEQUENCE Dimensions and/or entities to group by: syntax is - --group-by ds or for multiple group bys --group-by - ds,org - --end-time TEXT Optional iso8601 timestamp to constraint the end - time of the data (inclusive) - --start-time TEXT Optional iso8601 timestamp to constraint the start - time of the data (inclusive) - --where TEXT SQL-like where statement provided as a string. For - example: --where "revenue > 100" - --limit TEXT Limit the number of rows out using an int or leave - blank for no limit. For example: --limit 100 - --order SEQUENCE Metrics or group bys to order by ("-" prefix for - DESC). For example: --order -ds or --order - ds,-revenue - --csv FILENAME Provide filepath for data frame output to csv - --explain In the query output, show the query that was - executed against the data warehouse - --show-dataflow-plan Display dataflow plan in explain output - --display-plans Display plans (e.g. metric dataflow) in the browser - --decimals INTEGER Choose the number of decimal places to round for - the numerical values - --show-sql-descriptions Shows inline descriptions of nodes in displayed SQL - --help Show this message and exit. - ``` - - -## Query examples - -The following tabs presents various different types of query examples that you can use to query metrics and dimensions. 
Select the tab that best suits your needs: - - - - - -**Example 1** — Use the example to query metrics by dimension and return the `order_amount` metric by `metric_time.` - -**Query** -```bash -mf query --metrics order_amount --group-by metric_time -``` - -**Result** -```bash -✔ Success 🦄 - query completed after 1.24 seconds -| METRIC_TIME | ORDER_AMOUNT | -|:--------------|---------------:| -| 2017-06-16 | 792.17 | -| 2017-06-17 | 458.35 | -| 2017-06-18 | 490.69 | -| 2017-06-19 | 749.09 | -| 2017-06-20 | 712.51 | -| 2017-06-21 | 541.65 | -``` - - - - -**Example 2** — You can include multiple dimensions in a query. For example, you can group by the `is_food_order` dimension to confirm if orders were for food or not. - -**Query** -```bash -mf query --metrics order_amount --group-by metric_time, is_food_order -``` - -**Result** -```bash - Success 🦄 - query completed after 1.70 seconds -| METRIC_TIME | IS_FOOD_ORDER | ORDER_AMOUNT | -|:--------------|:----------------|---------------:| -| 2017-06-16 | True | 499.27 | -| 2017-06-16 | False | 292.90 | -| 2017-06-17 | True | 431.24 | -| 2017-06-17 | False | 27.11 | -| 2017-06-18 | True | 466.45 | -| 2017-06-18 | False | 24.24 | -| 2017-06-19 | False | 300.98 | -| 2017-06-19 | True | 448.11 | -``` - - - - - - -**Example 3** — You can add order and limit functions to filter and present the data in a readable format. The following query limits the data set to 10 records and orders them by `metric_time`, descending. - -**Query** -```bash -mf query --metrics order_amount --group-by metric_time,is_food_order --limit 10 --order -metric_time -``` - -**Result** -```bash -✔ Success 🦄 - query completed after 1.41 seconds -| METRIC_TIME | IS_FOOD_ORDER | ORDER_AMOUNT | -|:--------------|:----------------|---------------:| -| 2017-08-31 | True | 459.90 | -| 2017-08-31 | False | 327.08 | -| 2017-08-30 | False | 348.90 | -| 2017-08-30 | True | 448.18 | -| 2017-08-29 | True | 479.94 | -| 2017-08-29 | False | 333.65 | -| 2017-08-28 | False | 334.73 | -``` - - - - -**Example 4** — You can further filter the data set by adding a `where` clause to your query. - -**Query** -```bash - mf query --metrics order_amount --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" -``` - -**Result** -```bash - ✔ Success 🦄 - query completed after 1.06 seconds -| METRIC_TIME | IS_FOOD_ORDER | ORDER_AMOUNT | -|:--------------|:----------------|---------------:| -| 2017-08-31 | True | 459.90 | -| 2017-08-30 | True | 448.18 | -| 2017-08-29 | True | 479.94 | -| 2017-08-28 | True | 513.48 | -| 2017-08-27 | True | 568.92 | -| 2017-08-26 | True | 471.95 | -| 2017-08-25 | True | 452.93 | -| 2017-08-24 | True | 384.40 | -| 2017-08-23 | True | 423.61 | -| 2017-08-22 | True | 401.91 | -``` - - - - - -**Example 5** — To filter by time, there are dedicated start and end time options. Using these options to filter by time allows MetricFlow to further optimize query performance by pushing down the where filter when appropriate. 
- -**Query** -```bash - mf query --metrics order_amount --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' -``` - - **Result** -```bash -✔ Success 🦄 - query completed after 1.53 seconds -| METRIC_TIME | IS_FOOD_ORDER | ORDER_AMOUNT | -|:--------------|:----------------|---------------:| -| 2017-08-27 | True | 568.92 | -| 2017-08-26 | True | 471.95 | -| 2017-08-25 | True | 452.93 | -| 2017-08-24 | True | 384.40 | -| 2017-08-23 | True | 423.61 | -| 2017-08-22 | True | 401.91 | -``` - - - - - - - -### Additional query examples - -The following tabs presents additional query examples, like exporting to a CSV. Select the tab that best suits your needs: - - - - - - - -**Example 6** — Add `--explain` to your query to view the SQL generated by MetricFlow. - -**Query** - -```bash - mf query --metrics order_amount --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --explain -``` - - **Result** - ```bash - ✔ Success 🦄 - query completed after 0.28 seconds -🔎 SQL (remove --explain to see data or add --show-dataflow-plan to see the generated dataflow plan): -SELECT - metric_time - , is_food_order - , SUM(order_cost) AS order_amount -FROM ( - SELECT - cast(ordered_at as date) AS metric_time - , is_food_order - , order_cost - FROM ANALYTICS.js_dbt_sl_demo.orders orders_src_1 - WHERE cast(ordered_at as date) BETWEEN CAST('2017-08-22' AS TIMESTAMP) AND CAST('2017-08-27' AS TIMESTAMP) -) subq_3 -WHERE is_food_order = True -GROUP BY - metric_time - , is_food_order -ORDER BY metric_time DESC -LIMIT 10 -``` - - - - - -**Example 7** — Add the `--csv file_name.csv` flag to export the results of your query to a csv. - -**Query** - -```bash -mf query --metrics order_amount --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --csv query_example.csv -``` - -**Result** -```bash -✔ Success 🦄 - query completed after 0.83 seconds -🖨 Successfully written query output to query_example.csv -``` - - - diff --git a/website/docs/docs/build/metricflow-commands.md b/website/docs/docs/build/metricflow-commands.md new file mode 100644 index 00000000000..4d2477ad2ed --- /dev/null +++ b/website/docs/docs/build/metricflow-commands.md @@ -0,0 +1,558 @@ +--- +title: MetricFlow commands +id: metricflow-commands +description: "Query metrics and metadata in your dbt project with the MetricFlow commands." +sidebar_label: "MetricFlow commands" +tags: [Metrics, Semantic Layer] +--- + +Once you define metrics in your dbt project, you can query metrics, dimensions, and dimension values, and validate your configs using the MetricFlow commands. + +MetricFlow allows you to define and query metrics in your dbt project in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), or [dbt Core](/docs/core/installation). To experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and dynamically query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. + +MetricFlow is compatible with Python versions 3.8, 3.9, 3.10, and 3.11. + + +## MetricFlow + +MetricFlow is a dbt package that allows you to define and query metrics in your dbt project. 
You can use MetricFlow to query metrics in your dbt project in the dbt Cloud CLI, dbt Cloud IDE, or dbt Core. + +**Note** — MetricFlow commands aren't supported in dbt Cloud jobs yet. However, you can add MetricFlow validations with your git provider (such as GitHub Actions) by installing MetricFlow (`pip install metricflow`). This allows you to run MetricFlow commands as part of your continuous integration checks on PRs. + + + + + +MetricFlow commands are embedded in the dbt Cloud CLI, which means you can immediately run them once you install the dbt Cloud CLI. + +A benefit to using the dbt Cloud is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. + + + + + +:::info +You can create metrics using MetricFlow in the dbt Cloud IDE. However, support for running MetricFlow commands in the IDE will be available soon. +::: + +A benefit to using the dbt Cloud is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. + + + + + + +:::info Use dbt Cloud CLI for semantic layer development + +Use the dbt Cloud CLI for the experience in defining and querying metrics in your dbt project on dbt Cloud or dbt Core with MetricFlow. + +A benefit to using the dbt Cloud is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning. +::: + + +You can install [MetricFlow](https://github.com/dbt-labs/metricflow#getting-started) from [PyPI](https://pypi.org/project/dbt-metricflow/). You need to use `pip` to install MetricFlow on Windows or Linux operating systems: + +1. Create or activate your virtual environment `python -m venv venv` +2. Run `pip install dbt-metricflow` + * You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. To install the adapter, run `pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `pip install "dbt-metricflow[snowflake]"` + +**Note**, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow. + + + + + +Something to note, MetricFlow `mf` commands return an error if you have a Metafont latex package installed. To run `mf` commands, uninstall the package. + +## MetricFlow commands + +MetricFlow provides the following commands to retrieve metadata and query metrics. + + + + +Use the `dbt sl` prefix before the command name to execute them in dbt Cloud. For example, to list all metrics, run `dbt sl list metrics`. + +- [`list`](#list) — Retrieves metadata values. +- [`list metrics`](#list-metrics) — Lists metrics with dimensions. +- [`list dimensions`](#list) — Lists unique dimensions for metrics. +- [`list dimension-values`](#list-dimension-values) — List dimensions with metrics. +- [`list entities`](#list-entities) — Lists all unique entities. +- [`query`](#query) — Query metrics and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. + + + + + + + +Use the `mf` prefix before the command name to execute them in dbt Core. For example, to list all metrics, run `mf list metrics`. + +- [`list`](#list) — Retrieves metadata values. +- [`list metrics`](#list-metrics) — Lists metrics with dimensions. +- [`list dimensions`](#list) — Lists unique dimensions for metrics. +- [`list dimension-values`](#list-dimension-values) — List dimensions with metrics. +- [`list entities`](#list-entities) — Lists all unique entities. 
+- [`validate-configs`](#validate-configs) — Validates semantic model configurations.
+- [`health-checks`](#health-checks) — Performs a data platform health check.
+- [`tutorial`](#tutorial) — Dedicated MetricFlow tutorial to help get you started.
+- [`query`](#query) — Query metrics and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started.
+
+
+### List
+
+This command retrieves metadata values related to [Metrics](/docs/build/metrics-overview), [Dimensions](/docs/build/dimensions), and [Entities](/docs/build/entities):
+
+```bash
+dbt sl list # In dbt Cloud
+
+mf list # In dbt Core
+```
+
+### List metrics
+
+This command lists the metrics with their available dimensions:
+
+```bash
+dbt sl list metrics # In dbt Cloud
+
+mf list metrics # In dbt Core
+
+Options:
+  --search TEXT            Filter available metrics by this search term
+  --show-all-dimensions    Show all dimensions associated with a metric.
+  --help                   Show this message and exit.
+```
+
+### List dimensions
+
+This command lists all unique dimensions for a metric or multiple metrics. It displays only common dimensions when querying multiple metrics:
+
+```bash
+dbt sl list dimensions --metrics <metric_name> # In dbt Cloud
+
+mf list dimensions --metrics <metric_name> # In dbt Core
+
+Options:
+  --metrics SEQUENCE    List dimensions by given metrics (intersection). Ex. --metrics bookings,messages
+  --help                Show this message and exit.
+```
+
+### List dimension-values
+
+This command lists all dimension values with the corresponding metric:
+
+```bash
+dbt sl list dimension-values --metrics <metric_name> --dimension <dimension_name> # In dbt Cloud
+
+mf list dimension-values --metrics <metric_name> --dimension <dimension_name> # In dbt Core
+
+Options:
+  --dimension TEXT      Dimension to query values from [required]
+  --metrics SEQUENCE    Metrics that are associated with the dimension [required]
+  --end-time TEXT       Optional iso8601 timestamp to constraint the end time of the data (inclusive)
+  --start-time TEXT     Optional iso8601 timestamp to constraint the start time of the data (inclusive)
+  --help                Show this message and exit.
+```
+
+### List entities
+
+This command lists all unique entities:
+
+```bash
+dbt sl list entities --metrics <metric_name> # In dbt Cloud
+
+mf list entities --metrics <metric_name> # In dbt Core
+
+Options:
+  --metrics SEQUENCE    List entities by given metrics (intersection). Ex. --metrics bookings,messages
+  --help                Show this message and exit.
+```
+
+### Validate-configs
+
+The following command performs validations against the defined semantic model configurations.
+
+Note, in dbt Cloud you don't need to validate the Semantic Layer config separately. Running a dbt command (such as `dbt parse`, `dbt build`, `dbt compile`, `dbt run`) automatically checks it.
+
+```bash
+mf validate-configs # In dbt Core
+
+Options:
+  --dw-timeout INTEGER                   Optional timeout for data warehouse validation steps. Default None.
+  --skip-dw                              If specified, skips the data warehouse validations
+  --show-all                             If specified, prints warnings and future-errors
+  --verbose-issues                       If specified, prints any extra details issues might have
+  --semantic-validation-workers INTEGER  Optional. Uses the number of workers specified to run the semantic validations. Should only be used for exceptionally large configs
+  --help                                 Show this message and exit.
+```
+
+### Health checks
+
+The following command performs a health check against the data platform you provided in the configs.
+ +Note, in dbt Cloud the `health-checks` command isn't required since it uses dbt Cloud's credentials to perform the health check. + +```bash +mf health-checks # In dbt Core +``` + +### Tutorial + +Follow the dedicated MetricFlow tutorial to help you get started: + + +```bash +mf tutorial # In dbt Core +``` + +### Query + +Create a new query with MetricFlow, execute that query against the user's data platform, and return the result: + +```bash +dbt sl query --metrics --group-by # In dbt Cloud + +mf query --metrics --group-by # In dbt Core + +Options: + + --metrics SEQUENCE Metrics to query for: syntax is --metrics bookings + or for multiple metrics --metrics bookings, messages. + + --group-by SEQUENCE Dimensions and/or entities to group by: syntax is + --group-by ds or for multiple group bys --group-by + ds, org. + + --end-time TEXT Optional iso8601 timestamp to constraint the end + time of the data (inclusive) + + --start-time TEXT Optional iso8601 timestamp to constraint the start + time of the data (inclusive) + + --where TEXT SQL-like where statement provided as a string. For + example: --where "revenue > 100". To add a dimension filter to + a where filter, you have to indicate that the filter item is part of your model. + Refer to the [FAQ](#faqs) for more info on how to do this using a template wrapper. + + --limit TEXT Limit the number of rows out using an int or leave + blank for no limit. For example: --limit 100 + + --order SEQUENCE Metrics or group bys to order by ("-" prefix for + DESC). For example: --order -ds or --order + ds,-revenue + + --csv FILENAME Provide filepath for data frame output to csv + + --compile (dbt Cloud) In the query output, show the query that was + --explain (dbt Core) executed against the data warehouse + + + --show-dataflow-plan Display dataflow plan in explain output + + --display-plans Display plans (such as metric dataflow) in the browser + + --decimals INTEGER Choose the number of decimal places to round for + the numerical values + + --show-sql-descriptions Shows inline descriptions of nodes in displayed SQL + + --help Show this message and exit. + ``` + + +### Query examples + +The following tabs present various different types of query examples that you can use to query metrics and dimensions. Select the tab that best suits your needs: + + + + + +Use the example to query metrics by dimension and return the `order_total` metric by `metric_time.` + +**Query** +```bash +dbt sl query --metrics order_total --group-by metric_time # In dbt Cloud + +mf query --metrics order_total --group-by metric_time # In dbt Core +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 1.24 seconds +| METRIC_TIME | ORDER_TOTAL | +|:--------------|---------------:| +| 2017-06-16 | 792.17 | +| 2017-06-17 | 458.35 | +| 2017-06-18 | 490.69 | +| 2017-06-19 | 749.09 | +| 2017-06-20 | 712.51 | +| 2017-06-21 | 541.65 | +``` + + + + +You can include multiple dimensions in a query. For example, you can group by the `is_food_order` dimension to confirm if orders were for food or not. 
+ +**Query** +```bash +dbt sl query --metrics order_total --group-by metric_time, is_food_order # In dbt Cloud + +mf query --metrics order_total --group-by metric_time, is_food_order # In dbt Core +``` + +**Result** +```bash + Success 🦄 - query completed after 1.70 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-06-16 | True | 499.27 | +| 2017-06-16 | False | 292.90 | +| 2017-06-17 | True | 431.24 | +| 2017-06-17 | False | 27.11 | +| 2017-06-18 | True | 466.45 | +| 2017-06-18 | False | 24.24 | +| 2017-06-19 | False | 300.98 | +| 2017-06-19 | True | 448.11 | +``` + + + + + + +You can add order and limit functions to filter and present the data in a readable format. The following query limits the data set to 10 records and orders them by `metric_time`, descending. + +**Query** +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 1.41 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-31 | True | 459.90 | +| 2017-08-31 | False | 327.08 | +| 2017-08-30 | False | 348.90 | +| 2017-08-30 | True | 448.18 | +| 2017-08-29 | True | 479.94 | +| 2017-08-29 | False | 333.65 | +| 2017-08-28 | False | 334.73 | +``` + + + + +You can further filter the data set by adding a `where` clause to your query. + +**Query** + +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time --where "{{ Dimension('order_id__is_food_order') }} = True" + +# In dbt Core +mf query --metrics order_total --group-by metric_time --where "{{ Dimension('order_id__is_food_order') }} = True" +``` + +**Result** +```bash + ✔ Success 🦄 - query completed after 1.06 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-31 | True | 459.90 | +| 2017-08-30 | True | 448.18 | +| 2017-08-29 | True | 479.94 | +| 2017-08-28 | True | 513.48 | +| 2017-08-27 | True | 568.92 | +| 2017-08-26 | True | 471.95 | +| 2017-08-25 | True | 452.93 | +| 2017-08-24 | True | 384.40 | +| 2017-08-23 | True | 423.61 | +| 2017-08-22 | True | 401.91 | +``` + + + + + +To filter by time, there are dedicated start and end time options. Using these options to filter by time allows MetricFlow to further optimize query performance by pushing down the where filter when appropriate. + +**Query** +```bash + +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' +``` + + **Result** +```bash +✔ Success 🦄 - query completed after 1.53 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-27 | True | 568.92 | +| 2017-08-26 | True | 471.95 | +| 2017-08-25 | True | 452.93 | +| 2017-08-24 | True | 384.40 | +| 2017-08-23 | True | 423.61 | +| 2017-08-22 | True | 401.91 | +``` + + + + + + + +### Additional query examples + +The following tabs present additional query examples, like exporting to a CSV. 
Select the tab that best suits your needs: + + + + + + + +Add `--compile` (or `--explain` for dbt Core users) to your query to view the SQL generated by MetricFlow. + +**Query** + +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --compile + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --explain +``` + + **Result** + ```bash + ✔ Success 🦄 - query completed after 0.28 seconds +🔎 SQL (remove --compile to see data or add --show-dataflow-plan to see the generated dataflow plan): +SELECT + metric_time + , is_food_order + , SUM(order_cost) AS order_total +FROM ( + SELECT + cast(ordered_at as date) AS metric_time + , is_food_order + , order_cost + FROM ANALYTICS.js_dbt_sl_demo.orders orders_src_1 + WHERE cast(ordered_at as date) BETWEEN CAST('2017-08-22' AS TIMESTAMP) AND CAST('2017-08-27' AS TIMESTAMP) +) subq_3 +WHERE is_food_order = True +GROUP BY + metric_time + , is_food_order +ORDER BY metric_time DESC +LIMIT 10 +``` + + + + + +Add the `--csv file_name.csv` flag to export the results of your query to a csv. + +**Query** + +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --csv query_example.csv + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --csv query_example.csv +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 0.83 seconds +🖨 Successfully written query output to query_example.csv +``` + + + + +### Time granularity + +Optionally, you can specify the time granularity you want your data to be aggregated at by appending two underscores and the unit of granularity you want to `metric_time`, the global time dimension. You can group the granularity by: `day`, `week`, `month`, `quarter`, and `year`. + +Below is an example for querying metric data at a monthly grain: + +```bash +dbt sl query --metrics revenue --group-by metric_time__month # In dbt Cloud + +mf query --metrics revenue --group-by metric_time__month # In dbt Core +``` + +## FAQs + +
+How can I add a dimension filter to a where filter?
+
+To add a dimension filter to a where filter, you have to indicate that the filter item is part of your model and use a template wrapper: `{{Dimension('primary_entity__dimension_name')}}`.
+
+Here's an example query: `dbt sl query --metrics order_total --group-by metric_time --where "{{Dimension('order_id__is_food_order')}} = True"`.
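+If you're working in dbt Core, a minimal sketch of the equivalent command simply swaps in the `mf` prefix (the metric and dimension names here are the illustrative ones from the example above):
+
+```bash
+# dbt Core equivalent of the example above
+mf query --metrics order_total --group-by metric_time --where "{{Dimension('order_id__is_food_order')}} = True"
+```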

Before using the template wrapper, however, set up your terminal to escape curly braces for the filter template to work. + +
+How do I set up my terminal to escape curly braces?
+
+To configure your `.zshrc` profile to escape curly braces, you can use the `setopt` command to enable the `IGNORE_BRACES` option. This option makes the shell treat curly braces as literals and prevents brace expansion. Refer to the following steps to set it up:
+
+1. Open your terminal.
+2. Open your `.zshrc` file using a text editor like nano, vim, or any other text editor you prefer. You can use the following command to open it with nano:
+
+```bash
+nano ~/.zshrc
+```
+3. Add the following line to the file:
+
+```bash
+setopt IGNORE_BRACES
+```
+4. Save and exit the text editor (in `nano`, press Ctrl + O to save, and Ctrl + X to exit).
+
+5. Source your `.zshrc` file to apply the changes:
+
+```bash
+source ~/.zshrc
+```
+
+6. After making these changes, your Zsh shell will treat curly braces as literal characters and will not perform brace expansion. This means that you can use curly braces without worrying about unintended expansions.
+
+Keep in mind that modifying your shell configuration files can have an impact on how your shell behaves. If you're not familiar with shell configuration, it's a good idea to make a backup of your `.zshrc` file before making any changes. If you encounter any issues or unexpected behavior, you can revert to the backup.
+
+ +
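+Once your terminal handles the curly braces correctly, a quick end-to-end check is to combine a time-granularity suffix with a dimension filter — a hedged sketch that assumes the illustrative `order_total` metric and `order_id__is_food_order` dimension from the earlier examples exist in your project:
+
+```bash
+# Monthly grain plus a dimension filter, limited to 12 rows (dbt Cloud)
+dbt sl query --metrics order_total --group-by metric_time__month --where "{{Dimension('order_id__is_food_order')}} = True" --limit 12
+
+# The dbt Core equivalent uses the mf prefix
+mf query --metrics order_total --group-by metric_time__month --where "{{Dimension('order_id__is_food_order')}} = True" --limit 12
+```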
+ diff --git a/website/docs/docs/build/metricflow-time-spine.md b/website/docs/docs/build/metricflow-time-spine.md index bfab9c76bca..997d85e38a8 100644 --- a/website/docs/docs/build/metricflow-time-spine.md +++ b/website/docs/docs/build/metricflow-time-spine.md @@ -9,10 +9,12 @@ tags: [Metrics, Semantic Layer] MetricFlow uses a timespine table to construct cumulative metrics. By default, MetricFlow expects the timespine table to be named `metricflow_time_spine` and doesn't support using a different name. To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: + -```sql + +```sql {{ config( materialized = 'table', @@ -38,15 +40,53 @@ final as ( select * from final ``` + + + + + +```sql +{{ + config( + materialized = 'table', + ) +}} + +with days as ( + + {{ + dbt.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2027','mm/dd/yyyy')" + ) + }} + +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * from final +``` + + + + + ```sql ---- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery +-- filename: metricflow_time_spine.sql +-- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery {{config(materialized='table')}} with days as ( - {{dbt_utils.date_spine('day' - , "DATE(2000,01,01)" - , "DATE(2030,01,01)" + {{dbt_utils.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2030,01,01)" ) }} ), @@ -59,4 +99,33 @@ final as ( select * from final ``` + + + + + +```sql +-- filename: metricflow_time_spine.sql +-- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery +{{config(materialized='table')}} +with days as ( + {{dbt.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2030,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final +``` + + + You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but it doesn't currently support finer grains. diff --git a/website/docs/docs/build/metrics-overview.md b/website/docs/docs/build/metrics-overview.md index 9f04cab1b82..81af149a7d9 100644 --- a/website/docs/docs/build/metrics-overview.md +++ b/website/docs/docs/build/metrics-overview.md @@ -4,19 +4,23 @@ id: metrics-overview description: "Metrics can be defined in the same or separate YAML files from semantic models within the same dbt project repo." sidebar_label: "Creating metrics" tags: [Metrics, Semantic Layer] +pagination_next: "docs/build/cumulative" --- Once you've created your semantic models, it's time to start adding metrics! Metrics can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo) The keys for metrics definitions are: -| Component | Description | Type | +| Parameter | Description | Type | | --------- | ----------- | ---- | | `name` | Provide the reference name for the metric. This name must be unique amongst all metrics. | Required | -| `type` | Define the type of metric, which can be a measure (`simple`) or ratio (`ratio`)). | Optional | +| `description` | Provide the description for your metric. | Optional | +| `type` | Define the type of metric, which can be `simple`, `ratio`, `cumulative`, or `derived`. | Required | | `type_params` | Additional parameters used to configure metrics. 
`type_params` are different for each metric type. | Required | -| `filter` | For any type of metric, you may optionally include a filter string, which applies a dimensional filter when computing the metric. You can think of this as your WHERE clause. | Optional | -| `meta` | Additional metadata you want to add to your metric. | +| `config` | Provide the specific configurations for your metric. | Optional | +| `label` | The display name for your metric. This value will be shown in downstream tools. | Required | +| `filter` | You can optionally add a filter string to any metric type, applying filters to dimensions, entities, or time dimensions during metric computation. Consider it as your WHERE clause. | Optional | +| `meta` | Additional metadata you want to add to your metric. | Optional | Here's a complete example of the metrics spec configuration: @@ -28,17 +32,17 @@ metrics: type: the type of the metric ## Required type_params: ## Required - specific properties for the metric type - configs: here for `enabled` ## Optional + config: here for `enabled` ## Optional label: The display name for your metric. This value will be shown in downstream tools. ## Required filter: | ## Optional - {{ dimension('name') }} > 0 and {{ dimension(' another name') }} is not + {{ Dimension('entity__name') }} > 0 and {{ Dimension(' entity__another_name') }} is not null ``` This page explains the different supported metric types you can add to your dbt project. + ## Related docs diff --git a/website/docs/docs/build/metrics.md b/website/docs/docs/build/metrics.md index 4ce7372e7d0..7afcb41c2e4 100644 --- a/website/docs/docs/build/metrics.md +++ b/website/docs/docs/build/metrics.md @@ -4,43 +4,38 @@ id: "metrics" description: "When you define metrics in dbt projects, you encode crucial business logic in tested, version-controlled code. The dbt metrics layer helps you standardize metrics within your organization." keywords: - dbt metrics layer +tags: [Metrics] --- - - -:::info dbt Metrics isn't supported - -dbt Metrics is no longer supported in v1.6 and higher. To build your semantic layer, define and query metrics, and provide data governance - refer to [Build your Semantic Layer](/docs/build/build-metrics-intro) for updated guidance. - -::: - +import DeprecationNotice from '/snippets/_sl-deprecation-notice.md'; - + -:::info dbt Metrics not recommended + + -dbt Metrics won't be supported in v1.6 and higher, and is being replaced with MetricFlow. [Defining metrics](/docs/build/build-semantic-layer-intro) with MetricFlow will help shape the future of the dbt Semantic Layer — let us know [your thoughts and join the convo](https://github.com/dbt-labs/dbt-core/discussions/7456) to help build it! +The dbt Semantic Layer has undergone a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), improving governance, introducing new APIs, and making it more efficient to define/query metrics. This revamp means the dbt_metrics package and the legacy Semantic Layer, available in dbt v1.5 or lower, are no longer supported and won't receive any code fixes. -::: +**What’s changed?**

+The dbt_metrics package has been [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. We will also remove the dbt_metrics spec and docs when it's fully deprecated. +**Who does this affect?**

+Anyone who uses the dbt_metrics package or is integrated with the legacy Semantic Layer. The new Semantic Layer is available to [Team or Enterprise](https://www.getdbt.com/pricing/) multi-tenant dbt Cloud plans [hosted in North America](/docs/cloud/about-cloud/regions-ip-addresses). You must be on dbt v1.6 or higher to access it. All users can define metrics using MetricFlow. Users on dbt Cloud Developer plans or dbt Core can only use it to define and test metrics locally, but can't dynamically query them with integrated tools. - +**What should you do?**

+If you've defined metrics using dbt_metrics or integrated with the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use MetricFlow or the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/sl-migration) for more info. -* **v1.3.0**: Metrics have been moved out of the experimental phase -* **v1.0.0**: Metrics are new and experimental -
+
+ + A metric is an aggregation over a that supports zero or more dimensions. Some examples of metrics include: - active users - monthly recurring revenue (mrr) -In v1.0, dbt supports metric definitions as a new node type. Like [exposures](exposures), metrics appear as nodes in the directed acyclic graph (DAG) and can be expressed in YAML files. Defining metrics in dbt projects encodes crucial business logic in tested, version-controlled code. Further, you can expose these metrics definitions to downstream tooling, which drives consistency and precision in metric reporting. - -Review the video below to learn more about metrics, why they're important, and how to get started: - - +In v1.0, dbt supports metric definitions as a new node type. Like [exposures](exposures), metrics appear as nodes in the directed acyclic graph (DAG) and can be expressed in YAML files. Defining metrics in dbt projects encodes crucial business logic in tested, version-controlled code. Further, you can expose these metrics definitions to downstream tooling, which drives consistency and precision in metric reporting. ### Benefits of defining metrics @@ -59,7 +54,7 @@ You can define metrics in `.yml` files nested under a `metrics:` key. Metric nam - begin with a letter - contain no more than 250 characters -For a short human-friendly name with title casing, spaces, and special characters, use the `label` property. More examples and guidance for how to [define and structure metrics can be found here.](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics). +For a short human-friendly name with title casing, spaces, and special characters, use the `label` property. ### Example definition @@ -218,14 +213,17 @@ Metrics can have many declared **properties**, which define aspects of your metr ### Available calculation methods + The method of calculation (aggregation or derived) that is applied to the expression. + + The type of calculation (aggregation or expression) that is applied to the sql property. -| Metric Calculation Method Metric Type | Description | +| Metric Calculation Method | Description | |----------------|----------------------------------------------------------------------------| | count | This metric type will apply the `count` aggregation to the specified field | | count_distinct | This metric type will apply the `count` aggregation to the specified field, with an additional distinct statement inside the aggregation | @@ -428,6 +426,11 @@ The following is the list of currently accepted metric configs: ## Querying Your Metric + +:::caution dbt_metrics is no longer supported +The dbt_metrics package has been deprecated and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. +::: + You can dynamically query metrics directly in dbt and verify them before running a job in the deployment environment. To query your defined metric, you must have the [dbt_metrics package](https://github.com/dbt-labs/dbt_metrics) installed. Information on how to [install packages can be found here](https://docs.getdbt.com/docs/build/packages#how-do-i-add-a-package-to-my-project). Use the following [metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/) installation code in your packages.yml file and run `dbt deps` to install the metrics package: @@ -452,16 +455,6 @@ packages:
- - -```yml -packages: - - package: dbt-labs/metrics - version: [">=0.2.0", "<0.3.0"] -``` - - - Once the package has been installed with `dbt deps`, make sure to run the `dbt_metrics_default_calendar` model as this is required for macros used to query metrics. More information on this, and additional calendar functionality, can be found in the [project README](https://github.com/dbt-labs/dbt_metrics#calendar). ### Querying metrics with `metrics.calculate` @@ -480,19 +473,6 @@ from {{ metrics.calculate( - - -```sql -select * -from {{ metrics.calculate( - metric_name='new_customers', - grain='week', - dimensions=['plan', 'country'] -) }} -``` - - - ### Supported inputs The example above doesn't display all the potential inputs you can provide to the macro. @@ -501,7 +481,7 @@ You may find some pieces of functionality, like secondary calculations, complica | Input | Example | Description | Required | | ----------- | ----------- | ----------- | -----------| -| metric_listmetric_name | `metric('some_metric)'`,
[`metric('some_metric)'`,
`metric('some_other_metric)'`]
`'metric_name'`
| The metric(s) to be queried by the macro. If multiple metrics required, provide in list format.The name of the metric | Required |
+| metric_list | `metric('some_metric')`,<br/>[`metric('some_metric')`,<br/>`metric('some_other_metric')`]<br/> | The metric(s) to be queried by the macro. If multiple metrics are required, provide them in list format. | Required |
| grain | `'day'`, `'week'`,<br/>
`'month'`, `'quarter'`,
`'year'`
| The time grain that the metric will be aggregated to in the returned dataset | Optional | | dimensions | [`'plan'`,
`'country'`] | The dimensions you want the metric to be aggregated by in the returned dataset | Optional | | secondary_calculations | [`metrics.period_over_period( comparison_strategy="ratio", interval=1, alias="pop_1wk")`] | Performs the specified secondary calculation on the metric results. Examples include period over period calculations, rolling calculations, and period to date calculations. | Optional | @@ -541,6 +521,7 @@ The period to date secondary calculation performs an aggregation on a defined pe #### Rolling: + The rolling secondary calculation performs an aggregation on a number of rows in metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4` then the value returned will be a rolling 4 week calculation of whatever aggregation type was selected. If the `interval` input is not provided then the rolling caclulation will be unbounded on all preceding rows. | Input | Example | Description | Required | @@ -552,6 +533,7 @@ The rolling secondary calculation performs an aggregation on a number of rows in + The rolling secondary calculation performs an aggregation on a number of rows in the metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4`, then the value returned will be a rolling 4-week calculation of whatever aggregation type was selected. | Input | Example | Description | Required | @@ -651,12 +633,6 @@ from {{ metrics.develop( - - -Functionality for `develop` is only supported in v1.2 and higher. Please navigate to those versions for information about this method of metric development. - - - #### Multiple/Derived Metrics with `metrics.develop` If you have a more complicated use case that you are interested in testing, the develop macro also supports this behavior. The only caveat is that you must include the raw tags for any provided metric yml that contains a derived metric. Example below: @@ -715,4 +691,6 @@ The above example will return a dataset that contains the metric provided in the + + diff --git a/website/docs/docs/build/models.md b/website/docs/docs/build/models.md index e0683158e6d..1cf2fbafeda 100644 --- a/website/docs/docs/build/models.md +++ b/website/docs/docs/build/models.md @@ -2,6 +2,8 @@ title: "About dbt models" description: "Read this tutorial to learn how to use models when building in dbt." id: "models" +pagination_next: "docs/build/sql-models" +pagination_prev: null --- ## Overview @@ -18,4 +20,4 @@ The top level of a dbt workflow is the project. A project is a directory of a `. Your organization may need only a few models, but more likely you’ll need a complex structure of nested models to transform the required data. A model is a single file containing a final `select` statement, and a project can have multiple models, and models can even reference each other. Add to that, numerous projects and the level of effort required for transforming complex data sets can improve drastically compared to older methods. -Learn more about models in [SQL models](/docs/build/sql-models) and [Python models](/docs/build/python-models) pages. If you'd like to begin with a bit of practice, visit our [Getting Started Guide](/quickstarts) for instructions on setting up the Jaffle_Shop sample data so you can get hands-on with the power of dbt. +Learn more about models in [SQL models](/docs/build/sql-models) and [Python models](/docs/build/python-models) pages. 
If you'd like to begin with a bit of practice, visit our [Getting Started Guide](/guides) for instructions on setting up the Jaffle_Shop sample data so you can get hands-on with the power of dbt. diff --git a/website/docs/docs/build/organize-your-outputs.md b/website/docs/docs/build/organize-your-outputs.md new file mode 100644 index 00000000000..ad5efeda1c7 --- /dev/null +++ b/website/docs/docs/build/organize-your-outputs.md @@ -0,0 +1,38 @@ +--- +title: "Organize your outputs" +description: "Learn how you can organize your outputs" +pagination_next: "docs/build/custom-schemas" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + + +
\ No newline at end of file diff --git a/website/docs/docs/build/packages.md b/website/docs/docs/build/packages.md index d4cebc7a6f0..8d18a55e949 100644 --- a/website/docs/docs/build/packages.md +++ b/website/docs/docs/build/packages.md @@ -3,7 +3,7 @@ title: "Packages" id: "packages" --- -## What is a package? + Software engineers frequently modularize code into libraries. These libraries help programmers operate with leverage: they can spend more time focusing on their unique business logic, and less time implementing code that someone else has already spent the time perfecting. In dbt, libraries like these are called _packages_. dbt's packages are so powerful because so many of the analytic problems we encountered are shared across organizations, for example: @@ -22,13 +22,19 @@ dbt _packages_ are in fact standalone dbt projects, with models and macros that * Models in the package will be materialized when you `dbt run`. * You can use `ref` in your own models to refer to models from the package. * You can use macros in the package in your own project. +* It's important to note that defining and installing dbt packages is different from [defining and installing Python packages](/docs/build/python-models#using-pypi-packages) -:::note Using Python packages -Defining and installing dbt packages is different from [defining and installing Python packages](/docs/build/python-models#using-pypi-packages). +:::info `dependencies.yml` has replaced `packages.yml` +Starting from dbt v1.6, `dependencies.yml` has replaced `packages.yml`. This file can now contain both types of dependencies: "package" and "project" dependencies. +- "Package" dependencies lets you add source code from someone else's dbt project into your own, like a library. +- "Project" dependencies provide a different way to build on top of someone else's work in dbt. Refer to [Project dependencies](/docs/collaborate/govern/project-dependencies) for more info. +- +You can rename `packages.yml` to `dependencies.yml`, _unless_ you need to use Jinja within your packages specification. This could be necessary, for example, if you want to add an environment variable with a git token in a private git package specification. ::: + ## How do I add a package to my project? 1. Add a file named `dependencies.yml` or `packages.yml` to your dbt project. This should be at the same level as your `dbt_project.yml` file. 2. Specify the package(s) you wish to add using one of the supported syntaxes, for example: @@ -48,11 +54,7 @@ packages:
- - -- **v1.0.0:** The default [`packages-install-path`](/reference/project-configs/packages-install-path) has been updated to be `dbt_packages` instead of `dbt_modules`. - - +The default [`packages-install-path`](/reference/project-configs/packages-install-path) is `dbt_packages`. 3. Run `dbt deps` to install the package(s). Packages get installed in the `dbt_packages` directory – by default this directory is ignored by git, to avoid duplicating the source code for the package. @@ -89,13 +91,6 @@ In comparison, other package installation methods are unable to handle the dupli #### Prerelease versions - - -* `v0.20.1`: Fixed handling for prerelease versions. Introduced `install-prerelease` parameter. -* `v1.0.0`: When you provide an explicit prerelease version, dbt will install that version. - - - Some package maintainers may wish to push prerelease versions of packages to the dbt Hub, in order to test out new functionality or compatibility with a new version of dbt. A prerelease version is demarcated by a suffix, such as `a1` (first alpha), `b2` (second beta), or `rc3` (third release candidate). By default, `dbt deps` will not include prerelease versions when resolving package dependencies. You can enable the installation of prereleases in one of two ways: @@ -130,12 +125,6 @@ packages: - - -* `v0.20.0`: Introduced the ability to specify commit hashes as package revisions - - - Add the Git URL for the package, and optionally specify a revision. The revision can be: - a branch name - a tagged release @@ -265,12 +254,6 @@ Read more about creating a Personal Access Token [here](https://confluence.atlas #### Configure subdirectory for packaged projects - - -* `v0.20.0`: Introduced the ability to specify `subdirectory` - - - In general, dbt expects `dbt_project.yml` to be located as a top-level file in a package. If the packaged project is instead nested in a subdirectory—perhaps within a much larger mono repo—you can optionally specify the folder path as `subdirectory`. dbt will attempt a [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) of just the files located within that subdirectory. Note that you must be using a recent version of `git` (`>=2.26.0`). @@ -284,18 +267,35 @@ packages: ### Local packages -Packages that you have stored locally can be installed by specifying the path to the project, like so: +A "local" package is a dbt project accessible from your local file system. You can install it by specifying the project's path. It works best when you nest the project within a subdirectory relative to your current project's directory. + + + +```yaml +packages: + - local: relative/path/to/subdirectory +``` + + + +Other patterns may work in some cases, but not always. For example, if you install this project as a package elsewhere, or try running it on a different system, the relative and absolute paths will yield the same results. ```yaml packages: - - local: /opt/dbt/redshift # use a local path + # not recommended - support for these patterns vary + - local: /../../redshift # relative path to a parent directory + - local: /opt/dbt/redshift # absolute path on the system ``` -Local packages should only be used for specific situations, for example, when testing local changes to a package. +There are a few specific use cases where we recommend using a "local" package: +1. **Monorepo** — When you have multiple projects, each nested in a subdirectory, within a monorepo. "Local" packages allow you to combine projects for coordinated development and deployment. +2. 
**Testing changes** — To test changes in one project or package within the context of a downstream project or package that uses it. By temporarily switching the installation to a "local" package, you can make changes to the former and immediately test them in the latter for quicker iteration. This is similar to [editable installs](https://pip.pypa.io/en/stable/topics/local-project-installs/) in Python. +3. **Nested project** — When you have a nested project that defines fixtures and tests for a project of utility macros, like [the integration tests within the `dbt-utils` package](https://github.com/dbt-labs/dbt-utils/tree/main/integration_tests). + ## What packages are available? Check out [dbt Hub](https://hub.getdbt.com) to see the library of published dbt packages! @@ -372,3 +372,4 @@ packages: ``` + diff --git a/website/docs/docs/build/project-variables.md b/website/docs/docs/build/project-variables.md index a69132d6a3b..59d6be49b17 100644 --- a/website/docs/docs/build/project-variables.md +++ b/website/docs/docs/build/project-variables.md @@ -1,6 +1,7 @@ --- title: "Project variables" id: "project-variables" +pagination_next: "docs/build/environment-variables" --- dbt provides a mechanism, [variables](/reference/dbt-jinja-functions/var), to provide data to models for @@ -27,7 +28,7 @@ Jinja is not supported within the `vars` config, and all values will be interpre :::info New in v0.17.0 The syntax for specifying vars in the `dbt_project.yml` file has changed in -dbt v0.17.0. See the [migration guide](/guides/migration/versions) +dbt v0.17.0. See the [migration guide](/docs/dbt-versions/core-upgrade) for more information on these changes. ::: diff --git a/website/docs/docs/build/projects.md b/website/docs/docs/build/projects.md index a7ca3638590..a54f6042cce 100644 --- a/website/docs/docs/build/projects.md +++ b/website/docs/docs/build/projects.md @@ -1,6 +1,8 @@ --- title: "About dbt projects" id: "projects" +pagination_next: null +pagination_prev: null --- A dbt project informs dbt about the context of your project and how to transform your data (build your data sets). By design, dbt enforces the top-level structure of a dbt project such as the `dbt_project.yml` file, the `models` directory, the `snapshots` directory, and so on. Within the directories of the top-level, you can organize your project in any way that meets the needs of your organization and data pipeline. @@ -18,6 +20,7 @@ At a minimum, all a project needs is the `dbt_project.yml` project configuration | [sources](/docs/build/sources) | A way to name and describe the data loaded into your warehouse by your Extract and Load tools. | | [exposures](/docs/build/exposures) | A way to define and describe a downstream use of your project. | | [metrics](/docs/build/metrics) | A way for you to define metrics for your project. | +| [groups](/docs/build/groups) | Groups enable collaborative node organization in restricted collections. | | [analysis](/docs/build/analyses) | A way to organize analytical SQL queries in your project such as the general ledger from your QuickBooks. | When building out the structure of your project, you should consider these impacts on your organization's workflow: @@ -76,7 +79,7 @@ After configuring the Project subdirectory option, dbt Cloud will use it as the You can create new projects and [share them](/docs/collaborate/git-version-control) with other people by making them available on a hosted git repository like GitHub, GitLab, and BitBucket. 
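As a rough sketch of that workflow — the remote URL and branch are placeholders rather than anything dbt-specific — publishing a local project to a hosted repository usually looks like:

```bash
# Run from the root of your dbt project; the remote URL is a placeholder.
git init
git add .
git commit -m "Initial commit of my dbt project"
git remote add origin git@github.com:your-org/your-dbt-project.git
git push -u origin main
```

Anyone you share the repository with can then clone it and develop against the same project.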
-After you set up a connection with your data platform, you can [initialize your new project in dbt Cloud](/quickstarts) and start developing. Or, run [dbt init from the command line](/reference/commands/init) to set up your new project. +After you set up a connection with your data platform, you can [initialize your new project in dbt Cloud](/guides) and start developing. Or, run [dbt init from the command line](/reference/commands/init) to set up your new project. During project initialization, dbt creates sample model files in your project directory to help you start developing quickly. @@ -88,6 +91,6 @@ If you want to see what a mature, production project looks like, check out the [ ## Related docs -* [Best practices: How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) -* [Quickstarts for dbt Cloud](/quickstarts) -* [Quickstart for dbt Core](/quickstarts/manual-install) +* [Best practices: How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) +* [Quickstarts for dbt Cloud](/guides) +* [Quickstart for dbt Core](/guides/manual-install) diff --git a/website/docs/docs/build/python-models.md b/website/docs/docs/build/python-models.md index 5b9222ad1c5..3fe194a4cb7 100644 --- a/website/docs/docs/build/python-models.md +++ b/website/docs/docs/build/python-models.md @@ -16,11 +16,15 @@ We encourage you to: dbt Python (`dbt-py`) models can help you solve use cases that can't be solved with SQL. You can perform analyses using tools available in the open-source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. Python transformations defined in dbt are models in your project with all the same capabilities around testing, documentation, and lineage. + Python models are supported in dbt Core 1.3 and higher. Learn more about [upgrading your version in dbt Cloud](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions) and [upgrading dbt Core versions](https://docs.getdbt.com/docs/core-versions#upgrading-to-new-patch-versions). To read more about Python models, change the [docs version to 1.3](/docs/build/python-models?version=1.3) (or higher) in the menu bar. + + + @@ -63,7 +67,7 @@ models: - not_null tests: # Write your own validation logic (in SQL) for Python results - - [custom_generic_test](/guides/best-practices/writing-custom-generic-tests) + - [custom_generic_test](/best-practices/writing-custom-generic-tests) ``` @@ -146,7 +150,7 @@ with upstream_python_model as ( :::caution -Referencing [ephemeral](docs/build/materializations#ephemeral) models is currently not supported (see [feature request](https://github.com/dbt-labs/dbt-core/issues/7288)) +Referencing [ephemeral](/docs/build/materializations#ephemeral) models is currently not supported (see [feature request](https://github.com/dbt-labs/dbt-core/issues/7288)) ::: ## Configuring Python models @@ -711,3 +715,5 @@ You can also install packages at cluster creation time by [defining cluster prop
+ + diff --git a/website/docs/docs/build/ratio-metrics.md b/website/docs/docs/build/ratio-metrics.md index d70815f140d..97efe0f55bf 100644 --- a/website/docs/docs/build/ratio-metrics.md +++ b/website/docs/docs/build/ratio-metrics.md @@ -6,40 +6,60 @@ sidebar_label: Ratio tags: [Metrics, Semantic Layer] --- -Ratio allows you to create a ratio between two measures. You simply specify a numerator and a denominator measure. Additionally, you can apply a dimensional filter to both the numerator and denominator using a constraint string when computing the metric. +Ratio allows you to create a ratio between two metrics. You simply specify a numerator and a denominator metric. Additionally, you can apply a dimensional filter to both the numerator and denominator using a constraint string when computing the metric. + + The parameters, description, and type for ratio metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `numerator` | The name of the metric used for the numerator, or structure of properties. | Required | +| `denominator` | The name of the metric used for the denominator, or structure of properties. | Required | +| `filter` | Optional filter for the numerator or denominator. | Optional | +| `alias` | Optional alias for the numerator or denominator. | Optional | + +The following displays the complete specification for ratio metrics, along with an example. ```yaml -# Ratio Metric - metrics: - - name: cancellation_rate - owners: - - support@getdbt.com - type: ratio # Ratio metrics create a ratio out of two measures. Define the measures from the semantic model as numerator or denominator - type_params: - numerator: cancellations_usd - denominator: transaction_amount_usd - filter: | # add optional constraint string. This applies to both the numerator and denominator - {{ dimension('country', entity_path=['customer']) }} = 'MX' +metrics: + - name: The metric name # Required + description: the metric description # Optional + type: ratio # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + numerator: The name of the metric used for the numerator, or structure of properties # Required + name: Name of metric used for the numerator # Required + filter: Filter for the numerator # Optional + alias: Alias for the numerator # Optional + denominator: The name of the metric used for the denominator, or structure of properties # Required + name: Name of metric used for the denominator # Required + filter: Filter for the denominator # Optional + alias: Alias for the denominator # Optional +``` + +## Ratio metrics example - - name: enterprise_cancellation_rate - owners: - - support@getdbt.com - type: ratio # Ratio metrics create a ratio out of two measures. Define the measures from the semantic model as numerator or denominator - type_params: - numerator: - name: cancellations_usd - filter: tier = 'enterprise' #constraint only applies to the numerator - denominator: transaction_amount_usd - filter: | # add optional constraint string. 
This applies to both the numerator and denominator - {{ dimension('country', entity_path=['customer']) }} = 'MX' +```yaml +metrics: + - name: food_order_pct + description: "The food order count as a ratio of the total order count" + label: Food Order Ratio + type: ratio + type_params: + numerator: food_orders + denominator: orders ``` -### Different semantic models +## Ratio metrics using different semantic models -If the numerator and denominator in a ratio metric come from different semantic models, the system will compute their values in subqueries and then join the result set based on common dimensions to calculate the final ratio. Here's an example of the generated SQL for such a ratio metric. +The system will simplify and turn the numerator and denominator in a ratio metric from different semantic models by computing their values in sub-queries. It will then join the result set based on common dimensions to calculate the final ratio. Here's an example of the SQL generated for such a ratio metric. -```SQL +```sql select subq_15577.metric_time as metric_time , cast(subq_15577.mql_queries_created_test as double) / cast(nullif(subq_15582.distinct_query_users, 0) as double) as mql_queries_per_active_user @@ -83,9 +103,9 @@ on ) ``` -### Add filter +## Add filter -Users can define constraints on input measures for a metric by applying a filter directly to the measure, like so: +Users can define constraints on input metrics for a ratio metric by applying a filter directly to the input metric, like so: ```yaml metrics: @@ -97,10 +117,11 @@ metrics: type_params: numerator: name: distinct_purchasers - filter: {{dimension('is_frequent_purchaser')}} + filter: | + {{Dimension('customer__is_frequent_purchaser')}} alias: frequent_purchasers denominator: name: distinct_purchasers ``` -Note the `filter` and `alias` parameters for the measure referenced in the numerator. Use the `filter` parameter to apply a filter to the measure it's attached to. The `alias` parameter is used to avoid naming conflicts in the rendered SQL queries when the same measure is used with different filters. If there are no naming conflicts, the `alias` parameter can be left out. +Note the `filter` and `alias` parameters for the metric referenced in the numerator. Use the `filter` parameter to apply a filter to the metric it's attached to. The `alias` parameter is used to avoid naming conflicts in the rendered SQL queries when the same metric is used with different filters. If there are no naming conflicts, the `alias` parameter can be left out. diff --git a/website/docs/docs/build/saved-queries.md b/website/docs/docs/build/saved-queries.md new file mode 100644 index 00000000000..7b88a052726 --- /dev/null +++ b/website/docs/docs/build/saved-queries.md @@ -0,0 +1,38 @@ +--- +title: Saved queries +id: saved-queries +description: "Saved queries are a way to save commonly used queries in MetricFlow. They can be used to save time and avoid writing the same query over and over again." +sidebar_label: "Saved queries" +tags: [Metrics, Semantic Layer] +--- + +Saved queries are a way to save commonly used queries in MetricFlow. You can group metrics, dimensions, and filters that are logically related into a saved query. + +To define a saved query, refer to the following specification: + + Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. 
| Optional | +| `query_params` | The query parameters for the saved query: `metrics`, `group_by`, and `where`. | Required | + +The following is an example of a saved query: + +```yaml +saved_queries: + name: p0_booking + description: Booking-related metrics that are of the highest priority. + query_params: + metrics: + - bookings + - instant_bookings + group_by: + - TimeDimension('metric_time', 'day') + - Dimension('listing__capacity_latest') + where: + - "{{ Dimension('listing__capacity_latest') }} > 3" +``` + +### FAQs + +* All metrics in a saved query need to use the same dimensions in the `group_by` or `where` clauses. diff --git a/website/docs/docs/build/semantic-models.md b/website/docs/docs/build/semantic-models.md index 128380c2001..09f808d7a17 100644 --- a/website/docs/docs/build/semantic-models.md +++ b/website/docs/docs/build/semantic-models.md @@ -6,6 +6,7 @@ keywords: - dbt metrics layer sidebar_label: Semantic models tags: [Metrics, Semantic Layer] +pagination_next: "docs/build/dimensions" --- Semantic models are the foundation for data definition in MetricFlow, which powers the dbt Semantic Layer: @@ -23,12 +24,15 @@ Semantic models have 6 components and this page explains the definitions with so | Component | Description | Type | | --------- | ----------- | ---- | -| [Name](#name) | Unique name for the semantic model | Required | +| [Name](#name) | Choose a unique name for the semantic model. Avoid using double underscores (__) in the name as they're not supported. | Required | | [Description](#description) | Includes important details in the description | Optional | | [Model](#model) | Specifies the dbt model for the semantic model using the `ref` function | Required | +| [Defaults](#defaults) | The defaults for the model, currently only `agg_time_dimension` is supported. | Required | | [Entities](#entities) | Uses the columns from entities as join keys and indicate their type as primary, foreign, or unique keys with the `type` parameter | Required | -| [Dimensions](#dimensions) | Different ways to group or slice data for a metric, they can be `time-based` or `categorical` | Required | +| [Primary Entity](#primary-entity) | If a primary entity exists, this component is Optional. If the semantic model has no primary entity, then this property is required. | Optional | +| [Dimensions](#dimensions) | Different ways to group or slice data for a metric, they can be `time` or `categorical` | Required | | [Measures](#measures) | Aggregations applied to columns in your data model. They can be the final metric or used as building blocks for more complex metrics | Optional | +| Label | The display name for your semantic model `node`, `dimension`, `entity`, and/or `measures` | Optional | ## Semantic models components @@ -47,11 +51,13 @@ semantic_models: - see more information in measures section dimensions: ## Required - see more information in dimensions section + primary_entity: >- + if the semantic model has no primary entity, then this property is required. #Optional if a primary entity exists, otherwise Required ``` The following example displays a complete configuration and detailed descriptions of each field: -```yml +```yaml semantic_models: - name: transaction # A semantic model with the name Transactions model: ref('fact_transactions') # References the dbt model named `fact_transactions` @@ -67,7 +73,6 @@ semantic_models: type: foreign expr: customer_id - dimensions: # dimensions are qualitative values such as names, dates, or geographical data. 
They provide context to metrics and allow "metric by group" data slicing. - name: transaction_date type: time @@ -107,9 +112,36 @@ semantic_models: type: categorical ``` + + +Semantic models support configs in either the schema file or at the project level. + +Semantic model config in `models/semantic.yml`: +```yml +semantic_models: + - name: orders + config: + enabled: true | false + group: some_group + meta: + some_key: some_value +``` + +Semantic model config in `dbt_project.yml`: +```yml +semantic-models: + my_project_name: + +enabled: true | false + +group: some_group + +meta: + some_key: some_value +``` + + + ### Name -Define the name of the semantic model. You must define a unique name for the semantic model. The semantic graph will use this name to identify the model, and you can update it at any time. +Define the name of the semantic model. You must define a unique name for the semantic model. The semantic graph will use this name to identify the model, and you can update it at any time. Avoid using double underscores (__) in the name as they're not supported. ### Description @@ -119,10 +151,33 @@ Includes important details in the description of the semantic model. This descri Specify the dbt model for the semantic model using the [`ref` function](/reference/dbt-jinja-functions/ref). +### Defaults + +Defaults for the semantic model. Currently only `agg_time_dimension`. `agg_time_dimension` represents the default time dimensions for measures. This can be overridden by adding the `agg_time_dimension` key directly to a measure - see [Dimensions](/docs/build/dimensions) for examples. ### Entities To specify the [entities](/docs/build/entities) in your model, use their columns as join keys and indicate their `type` as primary, foreign, or unique keys with the type parameter. +### Primary entity + +MetricFlow requires that all dimensions be tied to an entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. + +You can define a primary entity using the following configs: + +```yaml +semantic_model: + name: bookings_monthly_source + description: bookings_monthly_source + defaults: + agg_time_dimension: ds + model: ref('bookings_monthly_source') + measures: + - name: bookings_monthly + agg: sum + create_metric: true + primary_entity: booking_id + ``` + @@ -142,7 +197,7 @@ This example shows a semantic model with three entities and their entity types: To reference a desired column, use the actual column name from the model in the `name` parameter. You can also use `name` as an alias to rename the column, and the `expr` parameter to refer to the original column name or a SQL expression of the column. -```yml +```yaml entity: - name: transaction type: primary @@ -165,7 +220,7 @@ You can refer to entities (join keys) in a semantic model using the `name` param MetricFlow simplifies this by allowing you to query all metric groups and construct the join during the query. To specify dimensions parameters, include the `name` (either a column or SQL expression) and `type` (`categorical` or `time`). Categorical groups represent qualitative values, while time groups represent dates of varying granularity. -Dimensions are identified using the name parameter, just like identifiers. 
The naming of groups must be unique within a semantic model, but not across semantic models since MetricFlow, uses entities to determine the appropriate groups. +Dimensions are identified using the name parameter, just like identifiers. The naming of groups must be unique within a semantic model, but not across semantic models since MetricFlow, uses entities to determine the appropriate groups. MetricFlow requires all dimensions be tied to a primary entity. :::info For time groups @@ -184,6 +239,7 @@ import MeasuresParameters from '/snippets/_sl-measures-parameters.md'; + import SetUpPages from '/snippets/_metrics-dependencies.md'; diff --git a/website/docs/docs/build/simple.md b/website/docs/docs/build/simple.md index 0092427699d..1803e952a69 100644 --- a/website/docs/docs/build/simple.md +++ b/website/docs/docs/build/simple.md @@ -4,9 +4,36 @@ id: simple description: "Use simple metrics to directly reference a single measure." sidebar_label: Simple tags: [Metrics, Semantic Layer] +pagination_next: null --- -Simple metrics are metrics that directly reference a single measure, without any additional measures involved. +Simple metrics are metrics that directly reference a single measure, without any additional measures involved. They are aggregations over a column in your data platform and can be filtered by one or multiple dimensions. + + The parameters, description, and type for simple metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `measure` | The measure you're referencing. | Required | + +The following displays the complete specification for simple metrics, along with an example. + + +```yaml +metrics: + - name: The metric name # Required + description: the metric description # Optional + type: simple # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + measure: The measure you're referencing # Required + +``` + -``` yaml -metrics: - - name: cancellations - type: simple # Pointers to a measure you created in a data source - type_params: - measure: cancellations_usd # The measure you're creating a proxy of. - # For any metric optionally include a filter string which applies a dimensional filter when computing the metric - filter: | - {{dimension('value')}} > 100 and {{dimension('acquisition', entity_path=['user'])}} +## Simple metrics example + +```yaml + metrics: + - name: customers + description: Count of customers + type: simple # Pointers to a measure you created in a semantic model + label: Count of customers + type_params: + measure: customers # The measure youre creating a proxy of. + - name: large_orders + description: "Order with order values over 20." 
+ type: SIMPLE + label: Large Orders + type_params: + measure: orders + filter: | # For any metric you can optionally include a filter on dimension values + {{Dimension('customer__order_total_dim')}} >= 20 ``` diff --git a/website/docs/docs/build/sl-getting-started.md b/website/docs/docs/build/sl-getting-started.md index 29134b3cf59..d5a59c33ec2 100644 --- a/website/docs/docs/build/sl-getting-started.md +++ b/website/docs/docs/build/sl-getting-started.md @@ -4,133 +4,97 @@ title: Get started with MetricFlow description: "Learn how to create your first semantic model and metric." sidebar_label: Get started with MetricFlow tags: [Metrics, Semantic Layer] +meta: + api_name: dbt Semantic Layer APIs --- -This getting started page recommends a workflow to help you get started creating your first metrics. Here are the following steps you'll take: +import CreateModel from '/snippets/_sl-create-semanticmodel.md'; +import DefineMetrics from '/snippets/_sl-define-metrics.md'; +import ConfigMetric from '/snippets/_sl-configure-metricflow.md'; +import TestQuery from '/snippets/_sl-test-and-query-metrics.md'; +import ConnectQueryAPI from '/snippets/_sl-connect-and-query-api.md'; +import RunProdJob from '/snippets/_sl-run-prod-job.md'; -- [Create a semantic model](#create-a-semantic-model) -- [Create your metrics](#create-your-metrics) -- [Test and query your metrics](#test-and-query-your-metrics) +This getting started page presents a sample workflow to help you create your first metrics in dbt Cloud or the command line interface (CLI). It uses the [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) as the project data source and is available for you to use. + +If you prefer, you can create semantic models and metrics for your own dbt project. This page will guide you on how to: + +- [Create a semantic model](#create-a-semantic-model) using MetricFlow +- [Define metrics](#define-metrics) using MetricFlow +- [Test and query metrics](#test-and-query-metrics) using MetricFlow +- [Run a production job](#run-a-production-job) in dbt Cloud +- [Set up dbt Semantic Layer](#set-up-dbt-semantic-layer) in dbt Cloud +- [Connect to and query the API](#connect-and-query-api) with dbt Cloud + +MetricFlow allows you to define metrics in your dbt project and query them whether in dbt Cloud or dbt Core with [MetricFlow commands](/docs/build/metricflow-commands). + +However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. ## Prerequisites -- Use the [command line (CLI)](/docs/core/about-the-cli) and have a dbt project and repository set up. - * Note: Support for dbt Cloud and integrations coming soon. -- Your dbt production environment must be on [dbt Core v1.6](/docs/dbt-versions/core) or higher. Support for the development environment coming soon. -- Have a dbt project connected to Snowflake or Postgres. - * Note: Support for BigQuery, Databricks, and Redshift coming soon. -- Have an understanding of key concepts in [MetricFlow](/docs/build/about-metricflow), which powers the revamped dbt Semantic Layer. -- Recommended — Install the [MetricFlow CLI](/docs/build/metricflow-cli) to query and test your metrics. +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + :::tip New to dbt or metrics? Try our [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) to help you get started! 
::: -## Install MetricFlow - -Before you begin, install the [MetricFlow CLI](/docs/build/metricflow-cli) as an extension of a dbt adapter from PyPI. The MetricFlow CLI is compatible with Python versions 3.8, 3.9, 3.10 and 3.11 +## Create a semantic model -Use pip install `metricflow` and your [dbt adapter](/docs/supported-data-platforms): + -- Create or activate your virtual environment. `python -m venv venv` -- `pip install "dbt-metricflow[your_adapter_name]"` - * You must specify `[your_adapter_name]`. For example, run `pip install "dbt-metricflow[snowflake]"` if you use a Snowflake adapter. - -Currently, the supported adapters are Snowflake and Postgres (BigQuery, Databricks, and Redshift coming soon). +## Define metrics -## Create a semantic model - -MetricFlow, which powers the dbt Semantic Layer, has two main objects: [semantic models](/docs/build/semantic-models) and [metrics](/docs/build/metrics-overview). You can think of semantic models as nodes in your semantic graph, connected via entities as edges. MetricFlow takes semantic models defined in YAML configuration files as inputs and creates a semantic graph that you can use to query metrics. - -This step will guide you through setting up your semantic models, which consists of [entities](/docs/build/entities), [dimensions](/docs/build/dimensions), and [measures](/docs/build/measures). - -1. Name your semantic model, fill in appropriate metadata, and map it to a model in your dbt project. -```yaml -semantic_models: - - name: transactions - description: | - This table captures every transaction starting July 02, 2014. Each row represents one transaction - model: ref('fact_transactions') - ``` - -2. Define your entities. These are the keys in your table that MetricFlow will use to join other semantic models. These are usually columns like `customer_id`, `transaction_id`, and so on. - -```yaml -entities: - - name: transaction - type: primary - expr: id_transaction - - name: customer - type: foreign - expr: id_customer - ``` - -3. Define your dimensions and measures. dimensions are properties of the records in your table that are non-aggregatable. They provide categorical or time-based context to enrich metrics. Measures are the building block for creating metrics. They are numerical columns that MetricFlow aggregates to create metrics. - -```yaml -measures: - - name: transaction_amount_usd - description: The total USD value of the transaction. - agg: sum -dimensions: - - name: is_large - type: categorical - expr: case when transaction_amount_usd >= 30 then true else false end -``` - -:::tip -If you're familiar with writing SQL, you can think of dimensions as the columns you would group by and measures as the columns you would aggregate. -```sql -select - metric_time_day, -- time - country, -- categorical dimension - sum(revenue_usd) -- measure -from - snowflake.fact_transactions -- sql table -group by metric_time_day, country -- dimensions - ``` -::: + -## Create your metrics +## Configure the MetricFlow time spine model -Now that you've created your first semantic model, it's time to define your first metric. MetricFlow supports different metric types like [simple](/docs/build/simple), [ratio](/docs/build/ratio), [cumulative](/docs/build/cumulative), and [derived](/docs/build/derived). You can define metrics in the same YAML files as your semantic models, or create a new file. 
+ -The example metric we'll create is a simple metric that refers directly to a measure, based on the `transaction_amount_usd` measure, which will be implemented as a `sum()` function in SQL. +## Test and query metrics -```yaml -metrics: - - name: transaction_amount_usd - type: simple - type_params: null - measure: transaction_amount_usd -``` + -Interact and test your metric using the CLI before committing it to your MetricFlow repository. +## Run a production job -## Test and query your metrics + -Follow these steps to test and query your metrics using MetricFlow: +## Set up dbt Semantic Layer -1. If you haven't done so already, make sure you [install MetricFlow](#install-metricflow). Refer to [MetricFlow CLI](/docs/build/metricflow-cli) for more info on commands and how to install the CLI. +import SlSetUp from '/snippets/_new-sl-setup.md'; -2. Run `mf --help` to confirm you have MetricFlow installed, and to see the available commands. If you don't have the CLI installed, run `pip install --upgrade "dbt-metricflow[your_adapter_name]"`. For example, if you have a Snowflake adapter, run `pip install --upgrade "dbt-metricflow[snowflake]"`. + -3. Save your files and run `mf validate-configs` to validate the changes before committing them +## Connect and query API -4. Run `mf query --metrics --group-by ` to query the metrics and dimensions you want to see in the CLI. + -5. Verify that the metric values are what you expect. You can view the generated SQL if you enter `--explain` in the CLI. +## FAQs -6. Then commit your changes to push them to your git repo. +If you're encountering some issues when defining your metrics or setting up the dbt Semantic Layer, check out a list of answers to some of the questions or problems you may be experiencing. + +
+<details>
+<summary>How do I migrate from the legacy Semantic Layer to the new one?</summary>
+If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
+</details>
+
+<details>
+<summary>How are you storing my data?</summary>
+User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours.
+</details>
+
- -## Related docs +## Next steps -- [The dbt Semantic Layer: what’s next](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/) blog post - [About MetricFlow](/docs/build/about-metricflow) -- [Semantic models](/docs/build/semantic-models) -- [Metrics](/docs/build/metrics-overview) -- [MetricFlow CLI](/docs/build/metricflow-cli) +- [Build your metrics](/docs/build/build-metrics-intro) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) +- Demo on [how to define and query metrics with MetricFlow](https://www.loom.com/share/60a76f6034b0441788d73638808e92ac?sid=861a94ac-25eb-4fd8-a310-58e159950f5a) +- [Billing](/docs/cloud/billing) diff --git a/website/docs/docs/build/sql-models.md b/website/docs/docs/build/sql-models.md index 65fdd58adf0..237ac84c0c2 100644 --- a/website/docs/docs/build/sql-models.md +++ b/website/docs/docs/build/sql-models.md @@ -14,7 +14,7 @@ id: "sql-models" :::info Building your first models -If you're new to dbt, we recommend that you read a [quickstart guide](/quickstarts) to build your first dbt project with models. +If you're new to dbt, we recommend that you read a [quickstart guide](/guides) to build your first dbt project with models. ::: diff --git a/website/docs/docs/build/tests.md b/website/docs/docs/build/tests.md index 1a40dd42b53..3d86dc6a81b 100644 --- a/website/docs/docs/build/tests.md +++ b/website/docs/docs/build/tests.md @@ -1,10 +1,12 @@ --- title: "Add tests to your DAG" -sidebar_title: "Tests" +sidebar_label: "Tests" description: "Read this tutorial to learn how to use tests when building in dbt." +search_weight: "heavy" id: "tests" +keywords: + - test, tests, testing, dag --- - ## Related reference docs * [Test command](/reference/commands/test) * [Test properties](/reference/resource-properties/tests) @@ -17,11 +19,7 @@ Tests are assertions you make about your models and other resources in your dbt You can use tests to improve the integrity of the SQL in each model by making assertions about the results generated. Out of the box, you can test whether a specified column in a model only contains non-null values, unique values, or values that have a corresponding value in another model (for example, a `customer_id` for an `order` corresponds to an `id` in the `customers` model), and values from a specified list. You can extend tests to suit business logic specific to your organization – any assertion that you can make about your model in the form of a select query can be turned into a test. - - -* `v0.20.0`: Both types of tests return a set of failing records. Previously, generic/schema tests returned a numeric value representing failures. Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. - - +Both types of tests return a set of failing records. Previously, generic/schema tests returned a numeric value representing failures. Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. Like almost everything in dbt, tests are SQL queries. In particular, they are `select` statements that seek to grab "failing" records, ones that disprove your assertion. If you assert that a column is unique in a model, the test query selects for duplicates; if you assert that a column is never null, the test seeks after nulls. If the test returns zero failing rows, it passes, and your assertion has been validated. 
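To make that concrete, a minimal test written this way might look like the following — the model and column names are invented for illustration:

```sql
-- tests/assert_no_negative_payment_amounts.sql
-- Select the "failing" records: any payment with a negative amount.
-- Zero rows returned means the assertion holds and the test passes.
select
    payment_id,
    amount
from {{ ref('stg_payments') }}
where amount < 0
```

Saved under your project's test paths, a file like this runs with `dbt test`, and any rows it returns are reported as failures.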
@@ -32,7 +30,7 @@ There are two ways of defining tests in dbt: Defining tests is a great way to confirm that your code is working correctly, and helps prevent regressions when your code changes. Because you can use them over and over again, making similar assertions with minor variations, generic tests tend to be much more common—they should make up the bulk of your dbt testing suite. That said, both ways of defining tests have their time and place. :::tip Creating your first tests -If you're new to dbt, we recommend that you check out our [quickstart guide](/quickstarts) to build your first dbt project with models and tests. +If you're new to dbt, we recommend that you check out our [quickstart guide](/guides) to build your first dbt project with models and tests. ::: ## Singular tests @@ -114,7 +112,7 @@ You can find more information about these tests, and additional configurations ( ### More generic tests -Those four tests are enough to get you started. You'll quickly find you want to use a wider variety of tests—a good thing! You can also install generic tests from a package, or write your own, to use (and reuse) across your dbt project. Check out the [guide on custom generic tests](/guides/best-practices/writing-custom-generic-tests) for more information. +Those four tests are enough to get you started. You'll quickly find you want to use a wider variety of tests—a good thing! You can also install generic tests from a package, or write your own, to use (and reuse) across your dbt project. Check out the [guide on custom generic tests](/best-practices/writing-custom-generic-tests) for more information. :::info There are generic tests defined in some open source packages, such as [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and [dbt-expectations](https://hub.getdbt.com/calogica/dbt_expectations/latest/) — skip ahead to the docs on [packages](/docs/build/packages) to learn more! @@ -165,7 +163,7 @@ Done. PASS=2 WARN=0 ERROR=0 SKIP=0 TOTAL=2 ``` 3. Check out the SQL dbt is running by either: * **dbt Cloud:** checking the Details tab. - * **dbt CLI:** checking the `target/compiled` directory + * **dbt Core:** checking the `target/compiled` directory **Unique test** @@ -243,13 +241,7 @@ where {{ column_name }} is null ## Storing test failures - - -* `v0.20.0`: Introduced storing test failures in the database - - - -Normally, a test query will calculate failures as part of its execution. If you set the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures), dbt will first save the results of a test query to a table in the database, and then query that table to calculate the number of failures. +Normally, a test query will calculate failures as part of its execution. If you set the optional `--store-failures` flag, the [`store_failures`](/reference/resource-configs/store_failures), or the [`store_failures_as`](/reference/resource-configs/store_failures_as) configs, dbt will first save the results of a test query to a table in the database, and then query that table to calculate the number of failures. 
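For example — the model, column, and accepted values below are illustrative only — you could persist failures for a single generic test like this, or pass `--store-failures` on the command line to apply the behavior to an entire invocation:

```yaml
models:
  - name: orders
    columns:
      - name: status
        tests:
          - accepted_values:
              values: ['placed', 'shipped', 'completed', 'returned']
              config:
                store_failures: true  # write failing rows to a table in the database
```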
This workflow allows you to query and examine failing records much more quickly in development: diff --git a/website/docs/docs/build/validation.md b/website/docs/docs/build/validation.md index 808d054f021..02ce48729a4 100644 --- a/website/docs/docs/build/validation.md +++ b/website/docs/docs/build/validation.md @@ -12,16 +12,14 @@ These validations ensure that configuration files follow the expected schema, th The code that handles validation [can be found here](https://github.com/dbt-labs/dbt-semantic-interfaces/tree/main/dbt_semantic_interfaces/validations) for those who want to dive deeper into this topic. -## Prerequisites - -- You have installed the [MetricFlow CLI package](https://github.com/dbt-labs/metricflow) ## Validations command -You can run validations from the CLI with the following commands: +You can run validations from dbt Cloud or the command line with the following [MetricFlow commands](/docs/build/metricflow-commands): ```bash -mf validate-configs +dbt sl validate-configs # dbt Cloud users +mf validate-configs # dbt Core users ``` ## Parsing diff --git a/website/docs/docs/building-a-dbt-project/building-models/python-models.md b/website/docs/docs/building-a-dbt-project/building-models/python-models.md deleted file mode 100644 index 1aab8ac7a92..00000000000 --- a/website/docs/docs/building-a-dbt-project/building-models/python-models.md +++ /dev/null @@ -1,719 +0,0 @@ ---- -title: "Python models" ---- - -:::info Brand new! - -dbt Core v1.3 included first-ever support for Python models. Note that only [specific data platforms](#specific-data-platforms) support dbt-py models. - -We encourage you to: -- Read [the original discussion](https://github.com/dbt-labs/dbt-core/discussions/5261) that proposed this feature. -- Contribute to [best practices for developing Python models in dbt](https://discourse.getdbt.com/t/dbt-python-model-dbt-py-best-practices/5204 ). -- Weigh in on [next steps for Python models, beyond v1.3](https://github.com/dbt-labs/dbt-core/discussions/5742). -- Join the **#dbt-core-python-models** channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). - -Below, you'll see sections entitled "❓ **Our questions**." We are excited to have released a first narrow set of functionality in v1.3, which will solve real use cases. We also know this is a first step into a much wider field of possibility. We don't pretend to have all the answers. We're excited to keep developing our opinionated recommendations and next steps for product development—and we want your help. Comment in the GitHub discussions; leave thoughts in Slack; bring up dbt + Python in casual conversation with colleagues and friends. -::: - -## About Python models in dbt - -dbt Python ("dbt-py") models will help you solve use cases that can't be solved with SQL. You can perform analyses using tools available in the open source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. By defining your Python transformations in dbt, they're just models in your project, with all the same capabilities around testing, documentation, and lineage. - - - -Python models are supported in dbt Core 1.3 and above. 
Learn more about [upgrading your version in dbt Cloud](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions) and [upgrading dbt Core versions](https://docs.getdbt.com/docs/core-versions#upgrading-to-new-patch-versions). - -To read more about Python models, change the docs version to 1.3 or higher in the menu above. - - - - - - - - -```python -import ... - -def model(dbt, session): - - my_sql_model_df = dbt.ref("my_sql_model") - - final_df = ... # stuff you can't write in SQL! - - return final_df -``` - - - - - -```yml -version: 2 - -models: - - name: my_python_model - - # Document within the same codebase - description: My transformation written in Python - - # Configure in ways that feel intuitive and familiar - config: - materialized: table - tags: ['python'] - - # Test the results of my Python transformation - columns: - - name: id - # Standard validation for 'grain' of Python results - tests: - - unique - - not_null - tests: - # Write your own validation logic (in SQL) for Python results - - [custom_generic_test](writing-custom-generic-tests) -``` - - - - - - -The prerequisites for dbt Python models include using an adapter for a data platform that supports a fully featured Python runtime. In a dbt Python model, all Python code is executed remotely on the platform. None of it is run by dbt locally. We believe in clearly separating _model definition_ from _model execution_. In this and many other ways, you'll find that dbt's approach to Python models mirrors its longstanding approach to modeling data in SQL. - -We've written this guide assuming that you have some familiarity with dbt. If you've never before written a dbt model, we encourage you to start by first reading [dbt Models](/docs/build/models). Throughout, we'll be drawing connections between Python models and SQL models, as well as making clear their differences. - -### What is a Python model? - -A dbt Python model is a function that reads in dbt sources or other models, applies a series of transformations, and returns a transformed dataset. DataFrame operations define the starting points, the end state, and each step along the way. - -This is similar to the role of CTEs in dbt SQL models. We use CTEs to pull in upstream datasets, define (and name) a series of meaningful transformations, and end with a final `select` statement. You can run the compiled version of a dbt SQL model to see the data included in the resulting view or table. When you `dbt run`, dbt wraps that query in `create view`, `create table`, or more complex DDL to save its results in the database. - -Instead of a final `select` statement, each Python model returns a final DataFrame. Each DataFrame operation is "lazily evaluated." In development, you can preview its data, using methods like `.show()` or `.head()`. When you run a Python model, the full result of the final DataFrame will be saved as a table in your data warehouse. - -dbt Python models have access to almost all of the same configuration options as SQL models. You can test them, document them, add `tags` and `meta` properties to them, grant access to their results to other users, and so on. You can select them by their name, their file path, their configurations, whether they are upstream or downstream of another model, or whether they have been modified compared to a previous project state. - -### Defining a Python model - -Each Python model lives in a `.py` file in your `models/` folder. 
It defines a function named **`model()`**, which takes two parameters: -- **`dbt`**: A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG. -- **`session`**: A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames, and to write DataFrames back to tables. In PySpark, by convention, the `SparkSession` is named `spark`, and available globally. For consistency across platforms, we always pass it into the `model` function as an explicit argument called `session`. - -The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame. Via PySpark (Databricks + BigQuery), this can be a Spark, pandas, or pandas-on-Spark DataFrame. For more about choosing between pandas and native DataFrames, see [DataFrame API + syntax](#dataframe-api--syntax). - -When you `dbt run --select python_model`, dbt will prepare and pass in both arguments (`dbt` and `session`). All you have to do is define the function. This is how every single Python model should look: - - - -```python -def model(dbt, session): - - ... - - return final_df -``` - - - - -### Referencing other models - -Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. Use the `dbt.ref()` method within a Python model to read in data from other models (SQL or Python). If you want to read directly from a raw source table, use `dbt.source()`. These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. - - - -```python -def model(dbt, session): - - # DataFrame representing an upstream model - upstream_model = dbt.ref("upstream_model_name") - - # DataFrame representing an upstream source - upstream_source = dbt.source("upstream_source_name", "table_name") - - ... -``` - - - -Of course, you can `ref()` your Python model in downstream SQL models, too: - - - -```sql -with upstream_python_model as ( - - select * from {{ ref('my_python_model') }} - -), - -... -``` - - - -### Configuring Python models - -Just like SQL models, there are three ways to configure Python models: -1. In `dbt_project.yml`, where you can configure many models at once -2. In a dedicated `.yml` file, within the `models/` directory -3. Within the model's `.py` file, using the `dbt.config()` method - -Calling the `dbt.config()` method will set configurations for your model right within your `.py` file, similar to the `{{ config() }}` macro in `.sql` model files: - - - -```python -def model(dbt, session): - - # setting configuration - dbt.config(materialized="table") -``` - - - -There's a limit to how fancy you can get with the `dbt.config()` method. It accepts _only_ literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the [`config` property](resource-properties/config) in a YAML file. - -#### Accessing project context - -dbt Python models don't use Jinja to render compiled code. Python models have limited access to global project contexts compared to SQL models. That context is made available from the `dbt` class, passed in as an argument to the `model()` function. 
- -Out of the box, the `dbt` class supports: -- Returning DataFrames referencing the locations of other resources: `dbt.ref()` + `dbt.source()` -- Accessing the database location of the current model: `dbt.this()` (also: `dbt.this.database`, `.schema`, `.identifier`) -- Determining if the current model's run is incremental: `dbt.is_incremental` - -It is possible to extend this context by "getting" them via `dbt.config.get()` after they are configured in the [model's config](/reference/model-configs). This includes inputs such as `var`, `env_var`, and `target`. If you want to use those values to power conditional logic in your model, we require setting them through a dedicated `.yml` file config: - - - -```yml -version: 2 - -models: - - name: my_python_model - config: - materialized: table - target_name: "{{ target.name }}" - specific_var: "{{ var('SPECIFIC_VAR') }}" - specific_env_var: "{{ env_var('SPECIFIC_ENV_VAR') }}" -``` - - - -Then, within the model's Python code, use the `dbt.config.get()` function to _access_ values of configurations that have been set: - - - -```python -def model(dbt, session): - target_name = dbt.config.get("target_name") - specific_var = dbt.config.get("specific_var") - specific_env_var = dbt.config.get("specific_env_var") - - orders_df = dbt.ref("fct_orders") - - # limit data in dev - if target_name == "dev": - orders_df = orders_df.limit(500) -``` - - - -### Materializations - -Python models support two materializations: -- `table` -- `incremental` - -Incremental Python models support all the same [incremental strategies](/docs/build/incremental-models#about-incremental_strategy) as their SQL counterparts. The specific strategies supported depend on your adapter. - -Python models can't be materialized as `view` or `ephemeral`. Python isn't supported for non-model resource types (like tests and snapshots). - -For incremental models, like SQL models, you will need to filter incoming tables to only new rows of data: - - - -
- - - -```python -import snowflake.snowpark.functions as F - -def model(dbt, session): - dbt.config( - materialized = "incremental", - unique_key = "id", - ) - df = dbt.ref("upstream_table") - - if dbt.is_incremental: - - # only new rows compared to max in current table - max_from_this = f"select max(updated_at) from {dbt.this}" - df = df.filter(df.updated_at > session.sql(max_from_this).collect()[0][0]) - - # or only rows from the past 3 days - df = df.filter(df.updated_at >= F.dateadd("day", F.lit(-3), F.current_timestamp())) - - ... - - return df -``` - - - -
- -
- - - -```python -import pyspark.sql.functions as F - -def model(dbt, session): - dbt.config( - materialized = "incremental", - unique_key = "id", - ) - df = dbt.ref("upstream_table") - - if dbt.is_incremental: - - # only new rows compared to max in current table - max_from_this = f"select max(updated_at) from {dbt.this}" - df = df.filter(df.updated_at > session.sql(max_from_this).collect()[0][0]) - - # or only rows from the past 3 days - df = df.filter(df.updated_at >= F.date_add(F.current_timestamp(), F.lit(-3))) - - ... - - return df -``` - - - -
- -
- -**Note:** Incremental models are supported on BigQuery/Dataproc for the `merge` incremental strategy. The `insert_overwrite` strategy is not yet supported. - -## Python-specific functionality - -### Defining functions - -In addition to defining a `model` function, the Python model can import other functions or define its own. Here's an example, on Snowpark, defining a custom `add_one` function: - - - -```python -def add_one(x): - return x + 1 - -def model(dbt, session): - dbt.config(materialized="table") - temps_df = dbt.ref("temperatures") - - # warm things up just a little - df = temps_df.withColumn("degree_plus_one", add_one(temps_df["degree"])) - return df -``` - - - -At present, Python functions defined in one dbt model can't be imported and reused in other models. See the ["Code reuse"](#code-reuse) section for the potential patterns we're considering. - -### Using PyPI packages - -You can also define functions that depend on third-party packages, so long as those packages are installed and available to the Python runtime on your data platform. See notes on "Installing Packages" for [specific data warehouses](#specific-data-warehouses). - -In this example, we use the `holidays` package to determine if a given date is a holiday in France. For simplicity and consistency across platforms, the code below uses the pandas API. The exact syntax, and the need to refactor for multi-node processing, still varies. - - - -
- - - -```python -import holidays - -def is_holiday(date_col): - # Chez Jaffle - french_holidays = holidays.France() - is_holiday = (date_col in french_holidays) - return is_holiday - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["holidays"] - ) - - orders_df = dbt.ref("stg_orders") - - df = orders_df.to_pandas() - - # apply our function - # (columns need to be in uppercase on Snowpark) - df["IS_HOLIDAY"] = df["ORDER_DATE"].apply(is_holiday) - - # return final dataset (Pandas DataFrame) - return df -``` - - - -
- -
- - - -```python -import holidays - -def is_holiday(date_col): - # Chez Jaffle - french_holidays = holidays.France() - is_holiday = (date_col in french_holidays) - return is_holiday - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["holidays"] - ) - - orders_df = dbt.ref("stg_orders") - - df = orders_df.to_pandas_on_spark() # Spark 3.2+ - # df = orders_df.toPandas() in earlier versions - - # apply our function - df["is_holiday"] = df["order_date"].apply(is_holiday) - - # convert back to PySpark - df = df.to_spark() # Spark 3.2+ - # df = session.createDataFrame(df) in earlier versions - - # return final dataset (PySpark DataFrame) - return df -``` - - - -
- -
- -#### Configuring packages - -We encourage you to explicitly configure required packages and versions so dbt can track them in project metadata. This configuration is required for the implementation on some platforms. If you need specific versions of packages, specify them. - - - -```python -def model(dbt, session): - dbt.config( - packages = ["numpy==1.23.1", "scikit-learn"] - ) -``` - - - - - -```yml -version: 2 - -models: - - name: my_python_model - config: - packages: - - "numpy==1.23.1" - - scikit-learn -``` - - - -#### UDFs - -You can use the `@udf` decorator or `udf` function to define an "anonymous" function and call it within your `model` function's DataFrame transformation. This is a typical pattern for applying more complex functions as DataFrame operations, especially if those functions require inputs from third-party packages. -- [Snowpark Python: Creating UDFs](https://docs.snowflake.com/en/developer-guide/snowpark/python/creating-udfs.html) -- [PySpark functions: udf](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html) - - - -
- - - -```python -import snowflake.snowpark.types as T -import snowflake.snowpark.functions as F -import numpy - -def register_udf_add_random(): - add_random = F.udf( - # use 'lambda' syntax, for simple functional behavior - lambda x: x + numpy.random.normal(), - return_type=T.FloatType(), - input_types=[T.FloatType()] - ) - return add_random - -def model(dbt, session): - - dbt.config( - materialized = "table", - packages = ["numpy"] - ) - - temps_df = dbt.ref("temperatures") - - add_random = register_udf_add_random() - - # warm things up, who knows by how much - df = temps_df.withColumn("degree_plus_random", add_random("degree")) - return df -``` - - - -**Note:** Due to a Snowpark limitation, it is not currently possible to register complex named UDFs within stored procedures, and therefore dbt Python models. We are looking to add native support for Python UDFs as a project/DAG resource type in a future release. For the time being, if you want to create a "vectorized" Python UDF via the Batch API, we recommend either: -- Writing [`create function`](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html) inside a SQL macro, to run as a hook or run-operation -- [Registering from a staged file](https://docs.snowflake.com/ko/developer-guide/snowpark/reference/python/_autosummary/snowflake.snowpark.udf.html#snowflake.snowpark.udf.UDFRegistration.register_from_file) within your Python model code - -
- -
-
-
-
-```python
-import pyspark.sql.types as T
-import pyspark.sql.functions as F
-import numpy
-
-# use a 'decorator' for more readable code
-@F.udf(returnType=T.DoubleType())
-def add_random(x):
-    random_number = numpy.random.normal()
-    return x + random_number
-
-def model(dbt, session):
-    dbt.config(
-        materialized = "table",
-        packages = ["numpy"]
-    )
-
-    temps_df = dbt.ref("temperatures")
-
-    # warm things up, who knows by how much
-    df = temps_df.withColumn("degree_plus_random", add_random("degree"))
-    return df
-```
-
-
-
-
- -
- -#### Code reuse - -Currently, you cannot import or reuse Python functions defined in one dbt model, in other models. This is something we'd like dbt to support. There are two patterns we're considering: -1. Creating and registering **"named" UDFs**. This process is different across data platforms and has some performance limitations. (Snowpark does support ["vectorized" UDFs](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html): pandas-like functions that you can execute in parallel.) -2. Using **private Python packages**. In addition to importing reusable functions from public PyPI packages, many data platforms support uploading custom Python assets and registering them as packages. The upload process looks different across platforms, but your code’s actual `import` looks the same. - -:::note ❓ Our questions - -- Should dbt have a role in abstracting over UDFs? Should dbt support a new type of DAG node, `function`? Would the primary use case be code reuse across Python models or defining Python-language functions that can be called from SQL models? -- How can dbt help users when uploading or initializing private Python assets? Is this a new form of `dbt deps`? -- How can dbt support users who want to test custom functions? If defined as UDFs: "unit testing" in the database? If "pure" functions in packages: encourage adoption of `pytest`? - -💬 Discussion: ["Python models: package, artifact/object storage, and UDF management in dbt"](https://github.com/dbt-labs/dbt-core/discussions/5741) -::: - -### DataFrame API and syntax - -Over the past decade, most people writing data transformations in Python have adopted DataFrame as their common abstraction. dbt follows this convention by returning `ref()` and `source()` as DataFrames, and it expects all Python models to return a DataFrame. - -A DataFrame is a two-dimensional data structure (rows and columns). It supports convenient methods for transforming that data, creating new columns from calculations performed on existing columns. It also offers convenient ways for previewing data while developing locally or in a notebook. - -That's about where the agreement ends. There are numerous frameworks with their own syntaxes and APIs for DataFrames. The [pandas](https://pandas.pydata.org/docs/) library offered one of the original DataFrame APIs, and its syntax is the most common to learn for new data professionals. Most newer DataFrame APIs are compatible with pandas-style syntax, though few can offer perfect interoperability. This is true for Snowpark and PySpark, which have their own DataFrame APIs. - -When developing a Python model, you will find yourself asking these questions: - -**Why pandas?** It's the most common API for DataFrames. It makes it easy to explore sampled data and develop transformations locally. You can “promote” your code as-is into dbt models and run it in production for small datasets. - -**Why _not_ pandas?** Performance. pandas runs "single-node" transformations, which cannot benefit from the parallelism and distributed computing offered by modern data warehouses. This quickly becomes a problem as you operate on larger datasets. Some data platforms support optimizations for code written using pandas' DataFrame API, preventing the need for major refactors. For example, ["pandas on PySpark"](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html) offers support for 95% of pandas functionality, using the same API while still leveraging parallel processing. 
- -:::note ❓ Our questions -- When developing a new dbt Python model, should we recommend pandas-style syntax for rapid iteration and then refactor? -- Which open source libraries provide compelling abstractions across different data engines and vendor-specific APIs? -- Should dbt attempt to play a longer-term role in standardizing across them? - -💬 Discussion: ["Python models: the pandas problem (and a possible solution)"](https://github.com/dbt-labs/dbt-core/discussions/5738) -::: - -### Limitations - -Python models have capabilities that SQL models do not. They also have some drawbacks compared to SQL models: - -- **Time and cost.** Python models are slower to run than SQL models, and the cloud resources that run them can be more expensive. Running Python requires more general-purpose compute. That compute might sometimes live on a separate service or architecture from your SQL models. **However:** We believe that deploying Python models via dbt—with unified lineage, testing, and documentation—is, from a human standpoint, **dramatically** faster and cheaper. By comparison, spinning up separate infrastructure to orchestrate Python transformations in production and different tooling to integrate with dbt is much more time-consuming and expensive. -- **Syntax differences** are even more pronounced. Over the years, dbt has done a lot, via dispatch patterns and packages such as `dbt_utils`, to abstract over differences in SQL dialects across popular data warehouses. Python offers a **much** wider field of play. If there are five ways to do something in SQL, there are 500 ways to write it in Python, all with varying performance and adherence to standards. Those options can be overwhelming. As the maintainers of dbt, we will be learning from state-of-the-art projects tackling this problem and sharing guidance as we develop it. -- **These capabilities are very new.** As data warehouses develop new features, we expect them to offer cheaper, faster, and more intuitive mechanisms for deploying Python transformations. **We reserve the right to change the underlying implementation for executing Python models in future releases.** Our commitment to you is around the code in your model `.py` files, following the documented capabilities and guidance we're providing here. - -As a general rule, if there's a transformation you could write equally well in SQL or Python, we believe that well-written SQL is preferable: it's more accessible to a greater number of colleagues, and it's easier to write code that's performant at scale. If there's a transformation you _can't_ write in SQL, or where ten lines of elegant and well-annotated Python could save you 1000 lines of hard-to-read Jinja-SQL, Python is the way to go. - -## Specific data platforms - -In their initial launch, Python models are supported on three of the most popular data platforms: Snowflake, Databricks, and BigQuery/GCP (via Dataproc). Both Databricks and GCP's Dataproc use PySpark as the processing framework. Snowflake uses its own framework, Snowpark, which has many similarities to PySpark. - - - -
- -**Additional setup:** You will need to [acknowledge and accept Snowflake Third Party Terms](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#getting-started) to use Anaconda packages. - -**Installing packages:** Snowpark supports several popular packages via Anaconda. The complete list is at https://repo.anaconda.com/pkgs/snowflake/. Packages are installed at the time your model is being run. Different models can have different package dependencies. If you are using third-party packages, Snowflake recommends using a dedicated virtual warehouse for best performance rather than one with many concurrent users. - -**About "sprocs":** dbt submits Python models to run as "stored procedures," which some people call "sprocs" for short. By default, dbt will create a named sproc containing your model's compiled Python code, and then "call" it to execute. Snowpark has a Private Preview feature for "temporary" or "anonymous" stored procedures ([docs](https://docs.snowflake.com/en/LIMITEDACCESS/call-with.html)), which are faster and leave a cleaner query history. If this feature is enabled for your account, you can switch it on for your models by configuring `use_anonymous_sproc: True`. We plan to switch this on for all dbt + Snowpark Python models in a future release. - - - -```yml -# I asked Snowflake Support to enable this Private Preview feature, -# and now my dbt-py models run even faster! -models: - use_anonymous_sproc: True -``` - - - -**Docs:** ["Developer Guide: Snowpark Python"](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html) - -
- -
- -**Submission methods:** Databricks supports a few different mechanisms to submit PySpark code, each with relative advantages. Some are better for supporting iterative development, while others are better for supporting lower-cost production deployments. The options are: -- `all_purpose_cluster` (default): dbt will run your Python model using the cluster ID configured as `cluster` in your connection profile or for this specific model. These clusters are more expensive but also much more responsive. We recommend using an interactive all-purpose cluster for quicker iteration in development. - - `create_notebook: True`: dbt will upload your model's compiled PySpark code to a notebook in the namespace `/Shared/dbt_python_model/{schema}`, where `{schema}` is the configured schema for the model, and execute that notebook to run using the all-purpose cluster. The appeal of this approach is that you can easily open the notebook in the Databricks UI for debugging or fine-tuning right after running your model. Remember to copy any changes into your dbt `.py` model code before re-running. - - `create_notebook: False` (default): dbt will use the [Command API](https://docs.databricks.com/dev-tools/api/1.2/index.html#run-a-command), which is slightly faster. -- `job_cluster`: dbt will upload your model's compiled PySpark code to a notebook in the namespace `/Shared/dbt_python_model/{schema}`, where `{schema}` is the configured schema for the model, and execute that notebook to run using a short-lived jobs cluster. For each Python model, Databricks will need to spin up the cluster, execute the model's PySpark transformation, and then spin down the cluster. As such, job clusters take longer before and after model execution, but they're also less expensive, so we recommend these for longer-running Python models in production. To use the `job_cluster` submission method, your model must be configured with `job_cluster_config`, which defines key-value properties for `new_cluster`, as defined in the [JobRunsSubmit API](https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit). - -You can configure each model's `submission_method` in all the standard ways you supply configuration: - -```python -def model(dbt, session): - dbt.config( - submission_method="all_purpose_cluster", - create_notebook=True, - cluster_id="abcd-1234-wxyz" - ) - ... -``` -```yml -version: 2 -models: - - name: my_python_model - config: - submission_method: job_cluster - job_cluster_config: - spark_version: ... - node_type_id: ... -``` -```yml -# dbt_project.yml -models: - project_name: - subfolder: - # set defaults for all .py models defined in this subfolder - +submission_method: all_purpose_cluster - +create_notebook: False - +cluster_id: abcd-1234-wxyz -``` - -If not configured, `dbt-spark` will use the built-in defaults: the all-purpose cluster (based on `cluster` in your connection profile) without creating a notebook. The `dbt-databricks` adapter will default to the cluster configured in `http_path`. We encourage explicitly configuring the clusters for Python models in Databricks projects. - -**Installing packages:** When using all-purpose clusters, we recommend installing packages which you will be using to run your Python models. 
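
For reference, here is one way the `job_cluster_config` stub shown earlier might be filled in. The values below are placeholders rather than recommendations; check your Databricks workspace for valid `spark_version` and `node_type_id` strings.

```yml
version: 2
models:
  - name: my_python_model
    config:
      submission_method: job_cluster
      job_cluster_config:
        # Placeholder values; any key accepted by new_cluster in the JobRunsSubmit API can go here
        spark_version: "12.2.x-scala2.12"
        node_type_id: "i3.xlarge"
        num_workers: 2
```
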
- -**Docs:** -- [PySpark DataFrame syntax](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) -- [Databricks: Introduction to DataFrames - Python](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) - -
- -
- -The `dbt-bigquery` adapter uses a service called Dataproc to submit your Python models as PySpark jobs. That Python/PySpark code will read from your tables and views in BigQuery, perform all computation in Dataproc, and write the final result back to BigQuery. - -**Submission methods.** Dataproc supports two submission methods: `serverless` and `cluster`. Dataproc Serverless does not require a ready cluster, which saves on hassle and cost—but it is slower to start up, and much more limited in terms of available configuration. For example, Dataproc Serverless supports only a small set of Python packages, though it does include `pandas`, `numpy`, and `scikit-learn`. (See the full list [here](https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build), under "The following packages are installed in the default image"). Whereas, by creating a Dataproc Cluster in advance, you can fine-tune the cluster's configuration, install any PyPI packages you want, and benefit from faster, more responsive runtimes. - -Use the `cluster` submission method with dedicated Dataproc clusters you or your organization manage. Use the `serverless` submission method to avoid managing a Spark cluster. The latter may be quicker for getting started, but both are valid for production. - -**Additional setup:** -- Create or use an existing [Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) -- Enable Dataproc APIs for your project + region -- If using the `cluster` submission method: Create or use an existing [Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) with the [Spark BigQuery connector initialization action](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/connectors#bigquery-connectors). (Google recommends copying the action into your own Cloud Storage bucket, rather than using the example version shown in the screenshot below.) - - - -The following configurations are needed to run Python models on Dataproc. You can add these to your [BigQuery profile](/reference/warehouse-setups/bigquery-setup#running-python-models-on-dataproc), or configure them on specific Python models: -- `gcs_bucket`: Storage bucket to which dbt will upload your model's compiled PySpark code. -- `dataproc_region`: GCP region in which you have enabled Dataproc (for example `us-central1`) -- `dataproc_cluster_name`: Name of Dataproc cluster to use for running Python model (executing PySpark job). Only required if `submission_method: cluster`. - -```python -def model(dbt, session): - dbt.config( - submission_method="cluster", - dataproc_cluster_name="my-favorite-cluster" - ) - ... -``` -```yml -version: 2 -models: - - name: my_python_model - config: - submission_method: serverless -``` - -Any user or service account that runs dbt Python models will need the following permissions, in addition to permissions needed for BigQuery ([docs](https://cloud.google.com/dataproc/docs/concepts/iam/iam)): -``` -dataproc.clusters.use -dataproc.jobs.create -dataproc.jobs.get -dataproc.operations.get -storage.buckets.get -storage.objects.create -storage.objects.delete -``` - -**Installing packages:** If you are using a Dataproc Cluster (as opposed to Dataproc Serverless), you can add third-party packages while creating the cluster. 
- -Google recommends installing Python packages on Dataproc clusters via initialization actions: -- [How initialization actions are used](https://github.com/GoogleCloudDataproc/initialization-actions/blob/master/README.md#how-initialization-actions-are-used) -- [Actions for installing via `pip` or `conda`](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/python) - -You can also install packages at cluster creation time by [defining cluster properties](https://cloud.google.com/dataproc/docs/tutorials/python-configuration#image_version_20): `dataproc:pip.packages` or `dataproc:conda.packages`. - - - -**Docs:** -- [Dataproc overview](https://cloud.google.com/dataproc/docs/concepts/overview) -- [PySpark DataFrame syntax](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) - -
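
As a sketch of the cluster-properties route described above, a cluster created like the following would come with the `holidays` package (used in the earlier example model) preinstalled. The cluster name, region, and package version are placeholders; installing several packages at once requires gcloud's alternate delimiter syntax covered in the linked tutorial.

```bash
# Sketch only: bake a PyPI package into a Dataproc cluster at creation time.
gcloud dataproc clusters create my-dbt-python-cluster \
  --region=us-central1 \
  --properties='dataproc:pip.packages=holidays==0.25'
```
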
- -
- -
diff --git a/website/docs/docs/cloud/about-cloud-develop-defer.md b/website/docs/docs/cloud/about-cloud-develop-defer.md
new file mode 100644
index 00000000000..37bfaacfd0c
--- /dev/null
+++ b/website/docs/docs/cloud/about-cloud-develop-defer.md
@@ -0,0 +1,63 @@
+---
+title: Using defer in dbt Cloud
+id: about-cloud-develop-defer
+description: "Learn how to leverage defer to prod when developing with dbt Cloud."
+sidebar_label: "Using defer in dbt Cloud"
+pagination_next: "docs/cloud/cloud-cli-installation"
+---
+
+
+[Defer](/reference/node-selection/defer) is a powerful feature that allows developers to build, run, and test only the models they've edited, without having to first run and build all the models that come before them (upstream parents). dbt powers this by comparing against a production manifest and resolving the `{{ ref() }}` function with upstream production artifacts.
+
+Both the dbt Cloud IDE and the dbt Cloud CLI enable users to natively defer to production metadata directly in their development workflows.
+
+By default, dbt follows these rules:
+
+- dbt uses the production locations of parent models to resolve `{{ ref() }}` functions, based on metadata from the production environment.
+- If a development version of a deferred model exists, dbt preferentially uses the development database location when resolving the reference.
+- Passing the [`--favor-state`](/reference/node-selection/defer#favor-state) flag overrides the default behavior and _always_ resolves refs using production metadata, regardless of the presence of a development relation.
+
+For a clean slate, it's a good practice to drop the development schema at the start and end of your development cycle.
+
+## Required setup
+
+- You must select the **[Production environment](/docs/deploy/deploy-environments#set-as-production-environment)** checkbox in the **Environment Settings** page.
+  - This can be set for one deployment environment per dbt Cloud project.
+- You must have a successful job run first.
+
+When you use defer, dbt Cloud compares against artifacts from the most recent successful production job, excluding CI jobs.
+
+### Defer in the dbt Cloud IDE
+
+To enable defer in the dbt Cloud IDE, toggle the **Defer to production** button on the command bar. Once enabled, dbt Cloud will:
+
+1. Pull down the most recent manifest from the Production environment for comparison
+2. Pass the `--defer` flag to the command (for any command that accepts the flag)
+
+For example, if you were to start developing on a new branch with [nothing in your development schema](/reference/node-selection/defer#usage), edit a single model, and run `dbt build -s state:modified` — only the edited model would run. Any `{{ ref() }}` functions will point to the production location of the referenced models.
+
+
+
+### Defer in dbt Cloud CLI
+
+One key difference between using `--defer` in the dbt Cloud CLI and the dbt Cloud IDE is that `--defer` is *automatically* enabled in the dbt Cloud CLI for all invocations and compares against production artifacts. You can disable it with the `--no-defer` flag.
+
+The dbt Cloud CLI offers additional flexibility by letting you choose the source environment for deferral artifacts. You can set a `defer-env-id` key in either your `dbt_project.yml` or `dbt_cloud.yml` file. If you do not provide a `defer-env-id` setting, the dbt Cloud CLI will use artifacts from your dbt Cloud environment marked "Production".
+ + + + ```yml +defer-env-id: '123456' +``` + + + + + + +```yml +dbt_cloud: + defer-env-id: '123456' +``` + + diff --git a/website/docs/docs/cloud/about-cloud-develop.md b/website/docs/docs/cloud/about-cloud-develop.md new file mode 100644 index 00000000000..90abbb98bf4 --- /dev/null +++ b/website/docs/docs/cloud/about-cloud-develop.md @@ -0,0 +1,33 @@ +--- +title: About developing in dbt Cloud +id: about-cloud-develop +description: "Learn how to develop your dbt projects using dbt Cloud." +sidebar_label: "About developing in dbt Cloud" +pagination_next: "docs/cloud/cloud-cli-installation" +hide_table_of_contents: true +--- + +dbt Cloud offers a fast and reliable way to work on your dbt project. It runs dbt Core in a hosted (single or multi-tenant) environment. You can develop in your browser using an integrated development environment (IDE) or in a dbt Cloud-powered command line interface (CLI): + +
+ + + + + +

+ +The following sections provide detailed instructions on setting up the dbt Cloud CLI and dbt Cloud IDE. To get started with dbt development, you'll need a [developer](/docs/cloud/manage-access/seats-and-users) account. For a more comprehensive guide about developing in dbt, refer to our [quickstart guides](/guides). + + +--------- +**Note**: The dbt Cloud CLI and the open-sourced dbt Core are both command line tools that let you run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). + diff --git a/website/docs/docs/cloud/about-cloud-setup.md b/website/docs/docs/cloud/about-cloud-setup.md index baa2465472e..5c8e5525bf1 100644 --- a/website/docs/docs/cloud/about-cloud-setup.md +++ b/website/docs/docs/cloud/about-cloud-setup.md @@ -3,6 +3,8 @@ title: About dbt Cloud setup id: about-cloud-setup description: "Configuration settings for dbt Cloud." sidebar_label: "About dbt Cloud setup" +pagination_next: "docs/dbt-cloud-environments" +pagination_prev: null --- dbt Cloud is the fastest and most reliable way to deploy your dbt jobs. It contains a myriad of settings that can be configured by admins, from the necessities (data platform integration) to security enhancements (SSO) and quality-of-life features (RBAC). This portion of our documentation will take you through the various settings found by clicking on the gear icon in the dbt Cloud UI, including: @@ -11,8 +13,10 @@ dbt Cloud is the fastest and most reliable way to deploy your dbt jobs. It conta - Configuring access to [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or your own [git repo URL](/docs/cloud/git/import-a-project-by-git-url). - [Managing users and licenses](/docs/cloud/manage-access/seats-and-users) - [Configuring secure access](/docs/cloud/manage-access/about-user-access) +- Configuring the [dbt Cloud IDE](/docs/cloud/about-cloud-develop) +- Installing and configuring the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) -These settings are intended for dbt Cloud administrators. If you need a more detailed first-time setup guide for specific data platforms, read our [quickstart guides](/quickstarts). +These settings are intended for dbt Cloud administrators. If you need a more detailed first-time setup guide for specific data platforms, read our [quickstart guides](/guides). If you want a more in-depth learning experience, we recommend taking the dbt Fundamentals on our [dbt Learn online courses site](https://courses.getdbt.com/). diff --git a/website/docs/docs/cloud/about-cloud/about-cloud-ide.md b/website/docs/docs/cloud/about-cloud/about-cloud-ide.md index 923212c1260..7643928feec 100644 --- a/website/docs/docs/cloud/about-cloud/about-cloud-ide.md +++ b/website/docs/docs/cloud/about-cloud/about-cloud-ide.md @@ -5,7 +5,7 @@ description: "about dbt Cloud Integrated Development Environment" sidebar_label: About dbt Cloud IDE --- -The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. The IDE leverages the open-source [dbt-rpc](/reference/commands/rpc) plugin to recompile only the changes made in your project. 
+The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. With the Cloud IDE, you can: @@ -25,7 +25,7 @@ With the Cloud IDE, you can: For more information, read the complete [Cloud IDE guide](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). -## Relatd docs +## Related docs - [IDE user interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) - [Tips and tricks](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips) diff --git a/website/docs/docs/cloud/about-cloud/dbt-cloud-features.md b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md similarity index 73% rename from website/docs/docs/cloud/about-cloud/dbt-cloud-features.md rename to website/docs/docs/cloud/about-cloud/about-dbt-cloud.md index f301dfce34b..518efe56a8b 100644 --- a/website/docs/docs/cloud/about-cloud/dbt-cloud-features.md +++ b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md @@ -4,86 +4,101 @@ id: "dbt-cloud-features" sidebar_label: "dbt Cloud features" description: "Explore dbt Cloud's features and learn why dbt Cloud is the fastest way to deploy dbt" hide_table_of_contents: true +pagination_next: "docs/cloud/about-cloud/architecture" +pagination_prev: null --- -dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, document, and investigate data models all in one browser-based UI. In addition to providing a hosted architecture for running dbt across your organization, dbt Cloud comes equipped with turnkey support for scheduling jobs, CI/CD, hosting documentation, monitoring & alerting, and an integrated development environment (IDE). +dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, document, and investigate data models all in one browser-based UI. + +In addition to providing a hosted architecture for running dbt across your organization, dbt Cloud comes equipped with turnkey support for scheduling jobs, CI/CD, hosting documentation, monitoring and alerting, an integrated development environment (IDE), and allows you to develop and run dbt commands from your local command line interface (CLI) or code editor. dbt Cloud's [flexible plans](https://www.getdbt.com/pricing/) and features make it well-suited for data teams of any size — sign up for your [free 14-day trial](https://www.getdbt.com/signup/)!
+ + + link="/docs/cloud/dbt-cloud-ide/develop-in-the-cloud" + icon="dbt-bit"/> + icon="dbt-bit"/> + icon="dbt-bit"/> + icon="dbt-bit"/> + + + icon="dbt-bit"/> + icon="dbt-bit"/> + icon="dbt-bit"/> + icon="dbt-bit"/> + link="/docs/use-dbt-semantic-layer/dbt-sl" + icon="dbt-bit"/> - + icon="dbt-bit"/> - +

*These features are available on [selected plans](https://www.getdbt.com/pricing/). ## Related docs - [dbt Cloud plans and pricing](https://www.getdbt.com/pricing/) -- [Quickstart guides](/quickstarts) +- [Quickstart guides](/guides) - [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) diff --git a/website/docs/docs/cloud/about-cloud/architecture.md b/website/docs/docs/cloud/about-cloud/architecture.md index 4ad016f4007..52614f0cbcd 100644 --- a/website/docs/docs/cloud/about-cloud/architecture.md +++ b/website/docs/docs/cloud/about-cloud/architecture.md @@ -42,7 +42,7 @@ Some data warehouse providers offer advanced security features that can be lever ### Git sync -dbt Cloud can sync with a variety of git providers, including [Github](/docs/cloud/git/connect-github), [Gitlab](/docs/cloud/git/connect-gitlab), and [Azure DevOps](/docs/cloud/git/connect-azure-devops) within its integrated development environment ([IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). Communication takes place over HTTPS rather than SSH and is protected using the TLS 1.2 protocol for data in transit. +dbt Cloud can sync with a variety of git providers, including [Github](/docs/cloud/git/connect-github), [Gitlab](/docs/cloud/git/connect-gitlab), and [Azure DevOps](/docs/cloud/git/connect-azure-devops) within its integrated development environment ([IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud)). Communication takes place over HTTPS rather than SSH and is protected using the TLS 1.2 protocol for data in transit. The git repo information is stored on dbt Cloud servers to make it accessible during the IDE sessions. When the git sync is disabled, you must [contact support](mailto:support@getdbt.com) to request the deletion of the synced data. diff --git a/website/docs/docs/cloud/about-cloud/browsers.md b/website/docs/docs/cloud/about-cloud/browsers.md index 2fc5a8b4b4d..12665bc7b72 100644 --- a/website/docs/docs/cloud/about-cloud/browsers.md +++ b/website/docs/docs/cloud/about-cloud/browsers.md @@ -2,6 +2,7 @@ title: "Supported browsers" id: "browsers" description: "dbt Cloud supports the latest browsers like Chrome and Firefox." +pagination_next: null --- To have the best experience with dbt Cloud, we recommend using the latest versions of the following browsers: diff --git a/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md b/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md index bc8c180f2fd..cc1c2531f56 100644 --- a/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md +++ b/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md @@ -11,10 +11,27 @@ dbt Cloud is [hosted](/docs/cloud/about-cloud/architecture) in multiple regions | Region | Location | Access URL | IP addresses | Developer plan | Team plan | Enterprise plan | |--------|----------|------------|--------------|----------------|-----------|-----------------| -| North America [^1] | AWS us-east-1 (N. Virginia) | cloud.getdbt.com | 52.45.144.63
54.81.134.249
52.22.161.231 | ✅ | ✅ | ✅ | +| North America multi-tenant [^1] | AWS us-east-1 (N. Virginia) | cloud.getdbt.com | 52.45.144.63
54.81.134.249
52.22.161.231 | ✅ | ✅ | ✅ | +| North America Cell 1 [^1] | AWS us-east-1 (N.Virginia) | {account prefix}.us1.dbt.com | [Located in Account Settings](#locating-your-dbt-cloud-ip-addresses) | ❌ | ❌ | ✅ | | EMEA [^1] | AWS eu-central-1 (Frankfurt) | emea.dbt.com | 3.123.45.39
3.126.140.248
3.72.153.148 | ❌ | ❌ | ✅ | | APAC [^1] | AWS ap-southeast-2 (Sydney)| au.dbt.com | 52.65.89.235
3.106.40.33
13.239.155.206
| ❌ | ❌ | ✅ | | Virtual Private dbt or Single tenant | Customized | Customized | Ask [Support](/community/resources/getting-help#dbt-cloud-support) for your IPs | ❌ | ❌ | ✅ | [^1]: These regions support [multi-tenant](/docs/cloud/about-cloud/tenancy) deployment environments hosted by dbt Labs. + +### Locating your dbt Cloud IP addresses + +There are two ways to view your dbt Cloud IP addresses: +- If no projects exist in the account, create a new project, and the IP addresses will be displayed during the **Configure your environment** steps. +- If you have an existing project, navigate to **Account Settings** and ensure you are in the **Projects** pane. Click on a project name, and the **Project Settings** window will open. Locate the **Connection** field and click on the name. Scroll down to the **Settings**, and the first text block lists your IP addresses. + +### Static IP addresses + +dbt Cloud, like many cloud services, relies on underlying AWS cloud infrastructure for operations. While we can offer static URLs for access, we cannot provide a list of IP addresses to configure connections due to the nature of AWS cloud services. + +* Dynamic IP addresses — dbt Cloud infrastructure uses Amazon Web Services (AWS). dbt Cloud offers static URLs for streamlined access, but the dynamic nature of cloud services means the underlying IP addresses change occasionally. AWS manages the IP ranges and may change them according to their operational and security needs. + +* Using hostnames for consistent access — To ensure uninterrupted access, we recommend that you dbt Cloud services using hostnames. Hostnames provide a consistent reference point, regardless of any changes in underlying IP addresses. We are aligning with an industry-standard practice employed by organizations such as Snowflake. + +* Optimizing VPN connections — You should integrate a proxy alongside VPN for users who leverage VPN connections. This strategy enables steady IP addresses for your connections, facilitating smooth traffic flow through the VPN and onward to dbt Cloud. By employing a proxy and a VPN, you can direct traffic through the VPN and then to dbt Cloud. It's crucial to set up the proxy if you need to integrate with additional services. diff --git a/website/docs/docs/cloud/billing.md b/website/docs/docs/cloud/billing.md new file mode 100644 index 00000000000..31b7689ceb9 --- /dev/null +++ b/website/docs/docs/cloud/billing.md @@ -0,0 +1,260 @@ +--- +title: "Billing" +id: billing +description: "dbt Cloud billing information." +sidebar_label: Billing +pagination_next: null +pagination_prev: null +--- + +dbt Cloud offers a variety of [plans and pricing](https://www.getdbt.com/pricing/) to fit your organization’s needs. With flexible billing options that appeal to large enterprises and small businesses and [server availability](/docs/cloud/about-cloud/regions-ip-addresses) worldwide, dbt Cloud is the fastest and easiest way to begin transforming your data. + +## How does dbt Cloud pricing work? + +As a customer, you pay for the number of seats you have and the amount of usage consumed each month. Seats are billed primarily on the amount of Developer and Read licenses purchased. + +Usage is based on the number of [Successful Models Built](#what-counts-as-a-successful-model-built) and, if purchased and used, Semantic Layer [Queried Metrics](#what-counts-as-a-queried-metric) subject to reasonable usage. All billing computations are conducted in Coordinated Universal Time (UTC). + +### What counts as a seat license? 
+ +There are three types of possible seat licenses: + +* **Developer** — for roles and permissions that require interaction with the dbt Cloud environment day-to-day. +* **Read-Only** — for access to view certain documents and reports. +* **IT** — for access to specific features related to account management (for example, configuring git integration). + +### What counts as a Successful Model Built? + +dbt Cloud considers a Successful Model Built as any model that is successfully built via a run through dbt Cloud’s orchestration functionality in a dbt Cloud deployment environment. Models are counted when built and run. This includes any jobs run via dbt Cloud's scheduler, CI builds (jobs triggered by pull requests), runs kicked off via the dbt Cloud API, and any successor dbt Cloud tools with similar functionality. This also includes models that are successfully built even when a run may fail to complete. For example, you may have a job that contains 100 models and on one of its runs, 51 models are successfully built and then the job fails. In this situation, only 51 models would be counted. + +Any models built in a dbt Cloud development environment (for example, via the IDE) do not count towards your usage. Tests, seeds, ephemeral models, and snapshots also do not count. + +| What counts towards Successful Models Built | | +|---------------------------------------------|---------------------| +| View | ✅ | +| Table | ✅ | +| Incremental | ✅ | +| Ephemeral Models | ❌ | +| Tests | ❌ | +| Seeds | ❌ | +| Snapshots | ❌ | + +### What counts as a Queried Metric? + +The dbt Semantic Layer, powered by MetricFlow, measures usage in distinct Queried Metrics. + +- Every successful request you make to render or run SQL to the Semantic Layer API counts as at least one queried metric, even if no data is returned. +- If the query calculates or renders SQL for multiple metrics, each calculated metric will be counted as a queried metric. +- If a request to run a query is not executed successfully in the data platform or if a query results in an error without completion, it is not counted as a queried metric. +- Requests for metadata from the Semantic Layer are also not counted as queried metrics. + +Examples of queried metrics include: + +- Querying one metric, grouping by one dimension → 1 queried metric + + ```shell + dbt sl query --metrics revenue --group_by metric_time + ``` + +- Querying one metric, grouping by two dimensions → 1 queried metric + + ```shell + dbt sl query --metrics revenue --group_by metric_time,user__country + ``` + +- Querying two metrics, grouping by two dimensions → 2 queried metrics + + ```shell + dbt sl query --metrics revenue,gross_sales --group_by metric_time,user__country + ``` + +- Running an explain for one metric → 1 queried metric + + ```shell + dbt sl query --metrics revenue --group_by metric_time --explain + ``` + +- Running an explain for two metrics → 2 queried metrics + + ```shell + dbt sl query --metrics revenue,gross_sales --group_by metric_time --explain + ``` + +### Viewing usage in the product + +Viewing usage in the product is restricted to specific roles: + +* Team plan — Owner group +* Enterprise plan — Account and billing admin roles + +For an account-level view of usage, if you have access to the **Billing** and **Usage** pages, you can see an estimate of the usage for the month. In the Billing page of the **Account Settings**, you can see how your account tracks against its usage. You can also see which projects are building the most models. 
+ +As a Team and Developer plan user, you can see how the account is tracking against the included models built. As an Enterprise plan user, you can see how much you have drawn down from your annual commit and how much remains. + +On each Project Home page, any user with access to that project can see how many models are built each month. From there, additional details on top jobs by models built can be found on each Environment page. + +In addition, you can look at the Job Details page's Insights tab to show how many models are being built per month for that particular job and which models are taking the longest to build. + +Usage information is available to customers on consumption-based plans, and some usage visualizations might not be visible to customers on legacy plans. Any usage data shown in dbt Cloud is only an estimate of your usage, and there could be a delay in showing usage data in the product. Your final usage for the month will be visible on your monthly statements (statements applicable to Team and Enterprise plans). + + +## Plans and Billing + +dbt Cloud offers several [plans](https://www.getdbt.com/pricing) with different features that meet your needs. We may make changes to our plan details from time to time. We'll always let you know in advance, so you can be prepared. The following section explains how billing works in each plan. + +### Developer plan billing + +Developer plans are free and include one Developer license and 3,000 models each month. Models are refreshed at the beginning of each calendar month. If you exceed 3,000 models, any subsequent runs will be canceled until models are refreshed or until you upgrade to a paid plan. The rest of the dbt Cloud platform is still accessible, and no work will be lost. + +All included successful models built numbers above reflect our most current pricing and packaging. Based on your usage terms when you signed up for the Developer Plan, the included model entitlements may be different from what’s reflected above. + + +### Team plan billing + +Team customers pay monthly via credit card for seats and usage, and accounts include 15,000 models monthly. Seats are charged upfront at the beginning of the month. If you add seats during the month, seats will be prorated and charged on the same day. Seats removed during the month will be reflected on the next invoice and are not eligible for refunds. You can change the credit card information and the number of seats from the billings section anytime. Accounts will receive one monthly invoice that includes the upfront charge for the seats and the usage charged in arrears from the previous month. + +Usage is calculated and charged in arrears for the previous month. If you exceed 15,000 models in any month, you will be billed for additional usage on your next invoice. Additional usage is billed at the rates on our [pricing page](https://www.getdbt.com/pricing). + + +Included models that are not consumed do not roll over to future months. You can estimate your bill with a simple formula: + +`($100 x number of developer seats) + ((models built - 15,000) x $0.01)` + +All included successful models built numbers above reflect our most current pricing and packaging. Based on your usage terms when you signed up for the Team Plan, the included model entitlements may be different from what’s reflected above. 
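
As a worked example, here is the estimate written out in Python, using the rates from the formula above (the seat count and models built are made-up numbers; confirm current rates on the pricing page).

```python
# Sketch: estimate a Team plan monthly bill from seats and models built.
developer_seats = 3
models_built = 20_000

seat_cost = 100 * developer_seats                     # $100 per developer seat, charged upfront
overage_cost = max(models_built - 15_000, 0) * 0.01   # $0.01 per model beyond the 15,000 included

print(f"Estimated bill: ${seat_cost + overage_cost:,.2f}")  # Estimated bill: $350.00
```
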
+ +### Enterprise plan billing + +As an Enterprise customer, you pay annually via invoice, monthly in arrears for additional usage (if applicable), and may benefit from negotiated usage rates. Please refer to your order form or contract for your specific pricing details, or [contact the account team](https://www.getdbt.com/contact-demo) with any questions. + +### Legacy plans + +Customers who purchased the dbt Cloud Team plan before August 11, 2023, remain on a legacy pricing plan as long as your account is in good standing. The legacy pricing plan is based on seats and includes unlimited models, subject to reasonable use. + +:::note Legacy Semantic Layer + +For customers using the legacy Semantic Layer with dbt_metrics package, this product will be deprecated in December 2023. Legacy users may choose to upgrade at any time to the revamped version, Semantic Layer powered by MetricFlow. The revamped version is available to most customers (see [prerequisites](/docs/use-dbt-semantic-layer/quickstart-sl#prerequisites)) for a limited time on a free trial basis, subject to reasonable use. + +::: + +dbt Labs may institute use limits if reasonable use is exceeded. Additional features, upgrades, or updates may be subject to separate charges. Any changes to your current plan pricing will be communicated in advance according to our Terms of Use. + + +## Managing usage + +From anywhere in the dbt Cloud account, click the **gear icon** and click **Account settings**. The **Billing** option will be on the left side menu under the **Account Settings** heading. Here, you can view individual available plans and the features provided for each. + +### Usage notifications + +Every plan automatically sends email alerts when 75%, 90%, and 100% of usage estimates have been reached. In the Team plan, all users within the Owner group will receive alerts. In Enterprise plans, all users with the Account Admin and Billing Admin permission sets will receive alerts. Users cannot opt out of these emails. If you would like additional users to receive these alert emails, please provide them with the applicable permissions mentioned above. Note that your usage may already be higher than the percentage indicated in the alert due to your usage pattern and minor latency times. + +### How do I stop usage from accruing? + +There are 2 options to disable models from being built and charged: + +1. Open the **Job Settings** of every job and navigate to the **Triggers** section. Disable the **Run on Schedule** and set the **Continuous Integration** feature **Run on Pull Requests?** to **No**. Check your workflows to ensure that you are not triggering any runs via the dbt Cloud API. This option will enable you to keep your dbt Cloud jobs without building more models. +2. Alternatively, you can delete some or all of your dbt Cloud jobs. This will ensure that no runs are kicked off, but you will permanently lose your job(s). + + +## Optimize costs in dbt Cloud + +dbt Cloud offers ways to optimize your model’s built usage and warehouse costs. + +### Best practices for optimizing successful models built + +When thinking of ways to optimize your costs from successful models built, there are methods to reduce those costs while still adhering to best practices. To ensure that you are still utilizing tests and rebuilding views when logic is changed, it's recommended to implement a combination of the best practices that fit your needs. 
More specifically, if you decide to exclude views from your regularly scheduled dbt Cloud job runs, it's imperative that you set up a merge job (see [Build only changed views](#build-only-changed-views) below) to deploy updated view logic when changes are detected.
+
+#### Exclude views in a dbt Cloud job
+
+Many dbt Cloud users utilize views, which don’t always need to be rebuilt every time you run a job. For any jobs that contain views that _do not_ include macros that dynamically generate code (for example, case statements) based on upstream tables and also _do not_ have tests, you can implement these steps:
+
+1. Go to your current production deployment job in dbt Cloud.
+2. Modify your command to include: `--exclude config.materialized:view`.
+3. Save your job changes.
+
+If you have views that contain macros with case statements based on upstream tables, these will need to be run each time to account for new values. If you still need to test your views with each run, follow the [Exclude views while still running tests](#exclude-views-while-running-tests) best practice to create a custom selector.
+
+#### Exclude views while running tests
+
+Running tests for views in every job run can help keep data quality intact and save you from the need to rerun failed jobs. To exclude views from your job run while running tests, you can follow these steps to create a custom [selector](https://docs.getdbt.com/reference/node-selection/yaml-selectors) for your job command.
+
+1. Open your dbt project in the dbt Cloud IDE.
+2. Add a file called `selectors.yml` in your top-level project folder.
+3. In the file, add the following code:
+
+    ```yaml
+    selectors:
+      - name: skip_views_but_test_views
+        description: >
+          A default selector that will exclude materializing views
+          without skipping tests on views.
+        default: true
+        definition:
+          union:
+            - union:
+                - method: path
+                  value: "*"
+                - exclude:
+                    - method: config.materialized
+                      value: view
+            - method: resource_type
+              value: test
+    ```
+
+4. Save the file and commit it to your project.
+5. Modify your dbt Cloud jobs to include `--selector skip_views_but_test_views`.
+
+#### Build only changed views
+
+If you want to ensure that you're building views whenever the logic is changed, create a merge job that gets triggered when code is merged into main:
+
+1. Ensure you have a [CI job setup](/docs/deploy/ci-jobs) in your environment.
+2. Create a new [deploy job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) and call it "Merge Job".
+3. Set the **Environment** to your CI environment. Refer to [Types of environments](/docs/deploy/deploy-environments#types-of-environments) for more details.
+4. Set **Commands** to: `dbt run -s state:modified+`. Executing `dbt build` in this context is unnecessary because the CI job was used to both run and test the code that just got merged into main.
+5. Under the **Execution Settings**, select the default production job to compare changes against:
+   - **Defer to a previous run state** — Select the "Merge Job" you created so the job compares and identifies what has changed since the last merge.
+6. In your dbt project, follow the steps in "Run a dbt Cloud job on merge" in the [Customizing CI/CD with custom pipelines](/guides/custom-cicd-pipelines) guide to create a script to trigger the dbt Cloud API to run your job after a merge happens within your git repository, or watch this [video](https://www.loom.com/share/e7035c61dbed47d2b9b36b5effd5ee78?sid=bcf4dd2e-b249-4e5d-b173-8ca204d9becb).
+ +The purpose of the merge job is to: + +- Immediately deploy any changes from PRs to production. +- Ensure your production views remain up-to-date with how they’re defined in your codebase while remaining cost-efficient when running jobs in production. + +The merge action will optimize your cloud data platform spend and shorten job times, but you’ll need to decide if making the change is right for your dbt project. + +### Rework inefficient models + +#### Job Insights tab + +To reduce your warehouse spend, you can identify what models, on average, are taking the longest to build in the **Job** page under the **Insights** tab. This chart looks at the average run time for each model based on its last 20 runs. Any models that are taking longer than anticipated to build might be prime candidates for optimization, which will ultimately reduce cloud warehouse spending. + +#### Model Timing tab + +To understand better how long each model takes to run within the context of a specific run, you can look at the **Model Timing** tab. Select the run of interest on the **Run History** page to find the tab. On that **Run** page, click **Model Timing**. + +Once you've identified which models could be optimized, check out these other resources that walk through how to optimize your work: +* [Build scalable and trustworthy data pipelines with dbt and BigQuery](https://services.google.com/fh/files/misc/dbt_bigquery_whitepaper.pdf) +* [Best Practices for Optimizing Your dbt and Snowflake Deployment](https://www.snowflake.com/wp-content/uploads/2021/10/Best-Practices-for-Optimizing-Your-dbt-and-Snowflake-Deployment.pdf) +* [How to optimize and troubleshoot dbt models on Databricks](/guides/optimize-dbt-models-on-databricks) + +## FAQs + +* What happens if I need more than 8 seats on the Team plan? +_If you need more than 8 developer seats, select the Contact Sales option from the billing settings to talk to our sales team about an Enterprise plan._ + +* What if I go significantly over my included free models on the Team or Developer plan? +_Consider upgrading to a Team or Enterprise plan. Team plans include more models and allow you to exceed the monthly usage limit. Enterprise accounts are supported by a dedicated account management team and offer annual plans, custom configurations, and negotiated usage rates._ + +* I want to upgrade my plan. Will all of my work carry over? +_Yes. Your dbt Cloud account will be upgraded without impacting your existing projects and account settings._ + +* How do I determine the right plan for me? + _The best option is to consult with our sales team. They'll help you figure out what is right for your needs. We also offer a free two-week trial on the Team plan._ + +* What are the Semantic Layer trial terms? +_Team and Enterprise customers can sign up for a free trial of the dbt Semantic Layer, powered by MetricFlow, for use of up to 1,000 Queried Metrics per month. The trial will be available at least through January 2024. dbt Labs may extend the trial period in its sole discretion. During the trial period, we may reach out to discuss pricing options or ask for feedback. At the end of the trial, free access may be removed and a purchase may be required to continue use. dbt Labs reserves the right to change limits in a free trial or institute pricing when required or at any time in its sole discretion._ + +* What is the reasonable use limitation for the dbt Semantic Layer powered by MetricFlow during the trial? 
+_Each account will be limited to 1,000 Queried Metrics per month during the trial period and may be changed at the sole discretion of dbt Labs._ diff --git a/website/docs/docs/cloud/cloud-cli-installation.md b/website/docs/docs/cloud/cloud-cli-installation.md new file mode 100644 index 00000000000..b945bede160 --- /dev/null +++ b/website/docs/docs/cloud/cloud-cli-installation.md @@ -0,0 +1,305 @@ +--- +title: Install dbt Cloud CLI +sidebar_label: "Install dbt Cloud CLI" +id: cloud-cli-installation +description: "Instructions for installing and configuring dbt Cloud CLI" +pagination_next: "docs/cloud/configure-cloud-cli" +--- + +import CloudCLIFlag from '/snippets/_cloud-cli-flag.md'; + + + + +dbt Cloud natively supports developing using a command line (CLI), empowering team members to contribute with enhanced flexibility and collaboration. The dbt Cloud CLI allows you to run dbt commands against your dbt Cloud development environment from your local command line. + +dbt commands are run against dbt Cloud's infrastructure and benefit from: + +* Secure credential storage in the dbt Cloud platform. +* [Automatic deferral](/docs/cloud/about-cloud-develop-defer) of build artifacts to your Cloud project's production environment. +* Speedier, lower-cost builds. +* Support for dbt Mesh ([cross-project `ref`](/docs/collaborate/govern/project-dependencies)), +* Significant platform improvements, to be released over the coming months. + + +## Prerequisites +The dbt Cloud CLI is available in all [deployment regions](/docs/cloud/about-cloud/regions-ip-addresses) and for both multi-tenant and single-tenant accounts (Azure single-tenant not supported at this time). + +- Ensure you are using dbt version 1.5 or higher. Refer to [dbt Cloud versions](/docs/dbt-versions/upgrade-core-in-cloud) to upgrade. +- Note that SSH tunneling for [Postgres and Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) connections and [Single sign-on (SSO)](/docs/cloud/manage-access/sso-overview) doesn't support the dbt Cloud CLI yet. + +## Install dbt Cloud CLI + +You can install the dbt Cloud CLI on the command line by using one of these methods. + +
+View a video tutorial for a step-by-step guide to installation. + + + +
+ + + + + +Before you begin, make sure you have [Homebrew installed](http://brew.sh/) in your code editor or command line terminal. Refer to the [FAQs](#faqs) if your operating system runs into path conflicts. + +1. Verify that you don't already have dbt Core installed: + + ```bash + which dbt + ``` + - If you see a `dbt not found`, you're good to go. If the dbt help text appears, use `pip uninstall dbt` to remove dbt Core from your system.
+
+2. Install the dbt Cloud CLI with Homebrew:
+
+   - First, remove the dbt-labs tap, the separate repository for packages, from Homebrew. This prevents Homebrew from installing packages from that repository:
+     ```bash
+     brew untap dbt-labs/dbt
+     ```
+   - Then, add and install the dbt Cloud CLI as a package:
+     ```bash
+     brew tap dbt-labs/dbt-cli
+     brew install dbt
+     ```
+   If you have multiple taps, use `brew install dbt-labs/dbt-cli/dbt`.
+
+3. Verify your installation by running `dbt --help` in the command line. If you see the following output, your installation is correct:
+   ```bash
+   The dbt Cloud CLI - an ELT tool for running SQL transformations and data models in dbt Cloud...
+   ```
+
+   If you don't see this output, check that you've deactivated pyenv or venv and don't have a global dbt version installed.
+
+   * Note that you no longer need to run the `dbt deps` command when your environment starts. This step was previously required during initialization. However, you should still run `dbt deps` if you make any changes to your `packages.yml` file.
+
+4. Clone your repository to your local computer using `git clone`. For example, to clone a GitHub repo using HTTPS format, run `git clone https://github.com/YOUR-USERNAME/YOUR-REPOSITORY`.
+
+5. After cloning your repo, [configure](/docs/cloud/configure-cloud-cli) the dbt Cloud CLI for your dbt Cloud project. This lets you run dbt commands like `dbt compile` to compile your project and validate models and tests. You can also add, edit, and synchronize files with your repo.
+
+
+ + + +Refer to the [FAQs](#faqs) if your operating system runs into path conflicts. + +1. Download the latest Windows release for your platform from [GitHub](https://github.com/dbt-labs/dbt-cli/releases). + +2. Extract the `dbt.exe` executable into the same folder as your dbt project. + +:::info + +Advanced users can configure multiple projects to use the same dbt Cloud CLI by placing the executable in the Program Files folder and [adding it to their Windows PATH environment variable](https://medium.com/@kevinmarkvi/how-to-add-executables-to-your-path-in-windows-5ffa4ce61a53). + +Note that if you are using VS Code, you must restart it to pick up modified environment variables. +::: + +3. Verify your installation by running `./dbt --help` in the command line. If you see the following output, your installation is correct: + ```bash + The dbt Cloud CLI - an ELT tool for running SQL transformations and data models in dbt Cloud... + ``` + + If you don't see this output, check that you've deactivated pyenv or venv and don't have a global dbt version installed. + + * Note that you no longer need to run the `dbt deps` command when your environment starts. This step was previously required during initialization. However, you should still run `dbt deps` if you make any changes to your `packages.yml` file. + +4. Clone your repository to your local computer using `git clone`. For example, to clone a GitHub repo using HTTPS format, run `git clone https://github.com/YOUR-USERNAME/YOUR-REPOSITORY`. + +5. After cloning your repo, [configure](/docs/cloud/configure-cloud-cli) the dbt Cloud CLI for your dbt Cloud project. This lets you run dbt commands like `dbt compile` to compile your project and validate models and tests. You can also add, edit, and synchronize files with your repo. + + + + + +Refer to the [FAQs](#faqs) if your operating system runs into path conflicts. + +1. Download the latest Linux release for your platform from [GitHub](https://github.com/dbt-labs/dbt-cli/releases). (Pick the file based on your CPU architecture) + +2. Extract the `dbt-cloud-cli` binary to the same folder as your dbt project. + + ```bash + tar -xf dbt_0.29.9_linux_amd64.tar.gz + ./dbt --version + ``` + +:::info + +Advanced users can configure multiple projects to use the same Cloud CLI executable by adding it to their PATH environment variable in their shell profile. + +::: + +3. Verify your installation by running `./dbt --help` in the command line. If you see the following output, your installation is correct: + ```bash + The dbt Cloud CLI - an ELT tool for running SQL transformations and data models in dbt Cloud... + ``` + + If you don't see this output, check that you've deactivated pyenv or venv and don't have a global dbt version installed. + + * Note that you no longer need to run the `dbt deps` command when your environment starts. This step was previously required during initialization. However, you should still run `dbt deps` if you make any changes to your `packages.yml` file. + +4. Clone your repository to your local computer using `git clone`. For example, to clone a GitHub repo using HTTPS format, run `git clone https://github.com/YOUR-USERNAME/YOUR-REPOSITORY`. + +5. After cloning your repo, [configure](/docs/cloud/configure-cloud-cli) the dbt Cloud CLI for your dbt Cloud project. This lets you run dbt commands like `dbt compile` to compile your project and validate models and tests. You can also add, edit, and synchronize files with your repo. 
+ + + + + +If you already have dbt Core installed, the dbt Cloud CLI may conflict. Here are some considerations: + +- **Prevent conflicts**
To use both the dbt Cloud CLI and dbt Core with `pip`, create a new virtual environment for the dbt Cloud CLI so the two installations don't conflict.

+- **Use both dbt Cloud CLI and dbt Core with brew or native installs**
If you use Homebrew, consider aliasing the dbt Cloud CLI as "dbt-cloud" to avoid conflict. For more details, check the [FAQs](#faqs) if your operating system experiences path conflicts.

+- **Reverting to dbt Core from the dbt Cloud CLI**
+ If you've already installed the dbt Cloud CLI and need to switch back to dbt Core:
+    - Uninstall the dbt Cloud CLI using the command: `pip uninstall dbt`
+    - Reinstall dbt Core using the following command, replacing "adapter_name" with the appropriate adapter name:
+      ```shell
+      pip install dbt-adapter_name --force-reinstall
+      ```
+      For example, if you use the Snowflake adapter, run: `pip install dbt-snowflake --force-reinstall`
+
+--------
+
+Before installing the dbt Cloud CLI, make sure you have Python installed and a virtual environment tool, such as venv or pyenv, available. If you already have a Python environment configured, you can skip to the [pip installation step](#install-dbt-cloud-cli-in-pip).
+
+### Install a virtual environment
+
+We recommend using virtual environments (venv) to namespace the `cloud-cli` installation.
+
+1. Create a new virtual environment named "dbt-cloud" with this command:
+   ```shell
+   python3 -m venv dbt-cloud
+   ```
+
+2. Activate the virtual environment each time you create a shell window or session, depending on your operating system:
+
+   - For Mac and Linux, use: `source dbt-cloud/bin/activate`
+   - For Windows, use: `dbt-cloud\Scripts\activate`
+
+3. (Mac and Linux only) Create an alias to activate your dbt environment with every new shell window or session. You can add the following to your shell's configuration file (for example, `$HOME/.bashrc, $HOME/.zshrc`) while replacing `<PATH>` with the path to your virtual environment configuration:
+    ```shell
+    alias env_dbt='source <PATH>/bin/activate'
+    ```
+
+### Install dbt Cloud CLI in pip
+
+1. (Optional) If you already have dbt Core installed, this installation will override that package. Check your dbt Core version in case you need to reinstall it later by running the following command:
+
+   ```bash
+   dbt --version
+   ```
+
+2. Make sure you're in your virtual environment and run the following command to install the dbt Cloud CLI:
+
+   ```bash
+   pip install dbt --no-cache-dir
+   ```
+
+3. (Optional) To revert to dbt Core, first uninstall both the dbt Cloud CLI and dbt Core. Then reinstall dbt Core.
+
+   ```bash
+   pip uninstall dbt-core dbt
+   pip install dbt-adapter_name --force-reinstall
+   ```
+
+4. Clone your repository to your local computer using `git clone`. For example, to clone a GitHub repo using HTTPS format, run `git clone https://github.com/YOUR-USERNAME/YOUR-REPOSITORY`.
+
+5. After cloning your repo, [configure](/docs/cloud/configure-cloud-cli) the dbt Cloud CLI for your dbt Cloud project. This lets you run dbt commands like `dbt compile` to compile your project and validate models and tests. You can also add, edit, and synchronize files with your repo.
+
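Taken together, the pip flow above can be condensed into a quick sketch (assumes Python 3 on macOS or Linux and uses the example repository URL from step 4):

```bash
python3 -m venv dbt-cloud          # create the virtual environment
source dbt-cloud/bin/activate      # activate it (on Windows: dbt-cloud\Scripts\activate)
pip install dbt --no-cache-dir     # install the dbt Cloud CLI
dbt --help                         # verify the installation
git clone https://github.com/YOUR-USERNAME/YOUR-REPOSITORY
```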
+ + +
+ +## Update dbt Cloud CLI + +The following instructions explain how to update the dbt Cloud CLI to the latest version depending on your operating system. + +During the public preview period, we recommend updating before filing a bug report. This is because the API is subject to breaking changes. + + + + + + +To update the dbt Cloud CLI, run `brew upgrade dbt`. (You can also use `brew install dbt`). + + + + + +To update, follow the same process explained in [Windows](/docs/cloud/cloud-cli-installation?install=windows#install-dbt-cloud-cli) and replace the existing `dbt.exe` executable with the new one. + + + + + +To update, follow the same process explained in [Windows](/docs/cloud/cloud-cli-installation?install=linux#install-dbt-cloud-cli) and replace the existing `dbt` executable with the new one. + + + + + +To update: +- Make sure you're in your virtual environment +- Run `pip install --upgrade dbt`. + + + + + +## Using VS Code extensions + +Visual Studio (VS) Code extensions enhance command line tools by adding extra functionalities. The dbt Cloud CLI is fully compatible with dbt Core, however it doesn't support some dbt Core APIs required by certain tools, for example VS Code extensions. + +To use these extensions, such as dbt-power-user, with the dbt Cloud CLI, you can install it using Homebrew (along with dbt Core) and create an alias to run the dbt Cloud CLI as `dbt-cloud`. This allows dbt-power-user to continue to invoke dbt Core under the hood, alongside the dbt Cloud CLI. + + +## FAQs + +
+
+What's the difference between the dbt Cloud CLI and dbt Core?
The dbt Cloud CLI and dbt Core, an open-source project, are both command line tools that enable you to run dbt commands. The key distinction is that the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its features.
+ +
+
How do I run both the dbt Cloud CLI and dbt Core?
For compatibility, both the dbt Cloud CLI and dbt Core are invoked by running `dbt`. This can create path conflicts if your operating system selects one over the other based on your `$PATH` environment variable settings.
+
If you have dbt Core installed locally, either:

1. Install the dbt Cloud CLI using [pip](/docs/cloud/cloud-cli-installation?install=pip#install-dbt-cloud-cli) with the `pip3 install dbt` command.
2. Install natively, ensuring you either deactivate the virtual environment containing dbt Core or create an alias for the dbt Cloud CLI.
3. (Advanced users) Install natively, but modify the $PATH environment variable to correctly point to the dbt Cloud CLI binary to use both dbt Cloud CLI and dbt Core together.

You can always uninstall the dbt Cloud CLI to return to using dbt Core.
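For option 3, a hedged sketch of checking which `dbt` your shell resolves and putting the dbt Cloud CLI first might look like this (the install path is an example only and will differ on your machine):

```bash
which -a dbt                          # list every dbt executable on your PATH, in resolution order
# Example only: put the directory containing the dbt Cloud CLI binary first for this session
export PATH="/opt/homebrew/bin:$PATH"
dbt --help                            # should now print the dbt Cloud CLI help text
```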
+ +
+
How do I create an alias?
To create an alias for the dbt Cloud CLI:
+ +1. Open your shell's profile configuration file. Depending on your shell and system, this could be ~/.bashrc, ~/.bash_profile, ~/.zshrc, or another file.
+
2. Add an alias that points to the dbt Cloud CLI binary. For example: `alias dbt-cloud="path_to_dbt_cloud_cli_binary"`
+
+ Replace path_to_dbt_cloud_cli_binary with the actual path to the dbt Cloud CLI binary (for example, `/opt/homebrew/bin/dbt` for a Homebrew install). With this alias, you can use the command dbt-cloud to invoke the dbt Cloud CLI.
+ +3. Save the file and then either restart your shell or run source on the profile file to apply the changes. +As an example, in bash you would run: source ~/.bashrc
+
4. Test and use the alias to run commands:
+ - To run the dbt Cloud CLI, use the dbt-cloud command: dbt-cloud command_name. Replace 'command_name' with the specific dbt command you want to execute.
+ - To run dbt Core, use the dbt command: dbt command_name. Replace 'command_name' with the specific dbt command you want to execute.
+ + +This alias will allow you to use the dbt-cloud command to invoke the dbt Cloud CLI while having dbt Core installed natively. +
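As a concrete illustration of the steps above, a Homebrew-based setup might end up with something like this in the shell profile (the binary path and profile file are examples; use the values from your own machine):

```bash
# In ~/.zshrc or ~/.bashrc (whichever profile your shell reads):
alias dbt-cloud="/opt/homebrew/bin/dbt"   # example path to the dbt Cloud CLI binary

# After running `source ~/.zshrc` (or restarting the shell):
#   dbt-cloud compile   -> invokes the dbt Cloud CLI
#   dbt compile         -> invokes whichever dbt your PATH resolves to (for example, a native dbt Core install)
```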
+ +
+
Why am I receiving a Session occupied error?
If you've run a dbt command and receive a Session occupied error, you can reattach to your existing session with dbt reattach and then press Control-C and choose to cancel the invocation.
+
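In practice, the recovery described above is a single command (assuming you run it from the same project directory as the stuck invocation):

```bash
dbt reattach   # reattach to the occupied session, then press Control-C and choose to cancel the invocation
```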
diff --git a/website/docs/docs/cloud/configure-cloud-cli.md b/website/docs/docs/cloud/configure-cloud-cli.md new file mode 100644 index 00000000000..d6fca00cf25 --- /dev/null +++ b/website/docs/docs/cloud/configure-cloud-cli.md @@ -0,0 +1,101 @@ +--- +title: Configure dbt Cloud CLI +id: configure-cloud-cli +description: "Instructions on how to configure the dbt Cloud CLI" +sidebar_label: "Configure dbt Cloud CLI" +pagination_next: null +--- + +import CloudCLIFlag from '/snippets/_cloud-cli-flag.md'; + + + + +## Prerequisites + +- You must set up a project in dbt Cloud. + - **Note** — If you're using the dbt Cloud CLI, you can connect to your [data platform](/docs/cloud/connect-data-platform/about-connections) directly in the dbt Cloud interface and don't need a [`profiles.yml`](/docs/core/connect-data-platform/profiles.yml) file. +- You must have your [personal development credentials](/docs/dbt-cloud-environments#set-developer-credentials) set for that project. The dbt Cloud CLI will use these credentials, stored securely in dbt Cloud, to communicate with your data platform. +- You must be on dbt version 1.5 or higher. Refer to [dbt Cloud versions](/docs/dbt-versions/upgrade-core-in-cloud) to upgrade. + +## Configure the dbt Cloud CLI + +Once you install the dbt Cloud CLI, you need to configure it to connect to a dbt Cloud project. + +1. Ensure you meet the prerequisites above. + +2. Download your credentials from dbt Cloud by clicking on the **Try the dbt Cloud CLI** banner on the dbt Cloud homepage. Alternatively, if you're in dbt Cloud, you can download the credentials from the links provided based on your region: + + - North America: https://cloud.getdbt.com/cloud-cli + - EMEA: https://emea.dbt.com/cloud-cli + - APAC: https://au.dbt.com/cloud-cli + - North American Cell 1: `https:/ACCOUNT_PREFIX.us1.dbt.com/cloud-cli` + - Single-tenant: `https://YOUR_ACCESS_URL/cloud-cli` + +3. Follow the banner instructions and download the config file to: + - Mac or Linux: `~/.dbt/dbt_cloud.yml` + - Windows: `C:\Users\yourusername\.dbt\dbt_cloud.yml` + + The config file looks like this: + + ```yaml + version: "1" + context: + active-project: "" + active-host: "" + defer-env-id: "" + projects: + - project-id: "" + account-host: "" + api-key: "" + + - project-id: "" + account-host: "" + api-key: "" + + ``` + +4. After downloading the config file, navigate to a dbt project in your terminal: + + ```bash + cd ~/dbt-projects/jaffle_shop + ``` + +5. In your `dbt_project.yml` file, ensure you have or include a `dbt-cloud` section with a `project-id` field. The `project-id` field contains the dbt Cloud project ID you want to use. + + ```yaml + # dbt_project.yml + name: + + version: + ... + + dbt-cloud: + project-id: PROJECT_ID + ``` + + - To find your project ID, select **Develop** in the dbt Cloud navigation menu. You can use the URL to find the project ID. For example, in `https://cloud.getdbt.com/develop/26228/projects/123456`, the project ID is `123456`. + +6. You can now [use the dbt Cloud CLI](#use-the-dbt-cloud-cli) and run [dbt commands](/reference/dbt-commands) like `dbt compile`. With your repo recloned, you can add, edit, and sync files with your repo. + +### Set environment variables + +To set environment variables in the dbt Cloud CLI for your dbt project: + +1. Select the gear icon on the upper right of the page. +2. Then select **Profile Settings**, then **Credentials**. +3. Click on your project and scroll to the **Environment Variables** section. +4. 
Click **Edit** on the lower right and then set the user-level environment variables. + +## Use the dbt Cloud CLI + +- The dbt Cloud CLI uses the same set of [dbt commands](/reference/dbt-commands) and [MetricFlow commands](/docs/build/metricflow-commands) as dbt Core to execute the commands you provide. +- It allows you to automatically defer build artifacts to your Cloud project's production environment. +- It also supports [project dependencies](/docs/collaborate/govern/project-dependencies), which allows you to depend on another project using the metadata service in dbt Cloud. + - Project dependencies instantly connect to and reference (or `ref`) public models defined in other projects. You don't need to execute or analyze these upstream models yourself. Instead, you treat them as an API that returns a dataset. + +:::tip Use the --help flag +As a tip, most command-line tools have a `--help` flag to show available commands and arguments. Use the `--help` flag with dbt in two ways: +- `dbt --help`: Lists the commands available for dbt
+- `dbt run --help`: Lists the flags available for the `run` command +::: diff --git a/website/docs/docs/cloud/connect-data-platform/about-connections.md b/website/docs/docs/cloud/connect-data-platform/about-connections.md index 65bfac3a90d..1329d179900 100644 --- a/website/docs/docs/cloud/connect-data-platform/about-connections.md +++ b/website/docs/docs/cloud/connect-data-platform/about-connections.md @@ -3,6 +3,8 @@ title: "About data platform connections" id: about-connections description: "Information about data platform connections" sidebar_label: "About data platform connections" +pagination_next: "docs/cloud/connect-data-platform/connect-starburst-trino" +pagination_prev: null --- dbt Cloud can connect with a variety of data platform providers including: - [Amazon Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) @@ -13,11 +15,15 @@ dbt Cloud can connect with a variety of data platform providers including: - [Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) - [Starburst or Trino](/docs/cloud/connect-data-platform/connect-starburst-trino) +import MSCallout from '/snippets/_microsoft-adapters-soon.md'; + + + You can connect to your database in dbt Cloud by clicking the gear in the top right and selecting **Account Settings**. From the Account Settings page, click **+ New Project**. -These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/quickstarts) +These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/guides) ## IP Restrictions diff --git a/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md b/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md index 670b628547b..0186d821a54 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md @@ -3,6 +3,7 @@ title: "Connect Apache Spark" id: connect-apache-spark description: "Setup instructions for connecting Apache Spark to dbt Cloud" sidebar_label: "Connect Apache Spark" +pagination_next: null --- diff --git a/website/docs/docs/cloud/connect-data-platform/connect-databricks.md b/website/docs/docs/cloud/connect-data-platform/connect-databricks.md index b66f5890c61..032246ad16a 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-databricks.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-databricks.md @@ -26,6 +26,8 @@ Unity Catalog allows Databricks users to centrally manage all data assets, simpl To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Databricks-specific configuration](/reference/resource-configs/databricks-configs). +To grant users or roles database permissions (access rights and privileges), refer to the [example permissions](/reference/database-permissions/databricks-permissions) page. 
+ To set up the Databricks connection, supply the following fields: diff --git a/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md b/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md index 72fe9e0449c..06b9dd62f1a 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md @@ -13,9 +13,11 @@ The following fields are required when creating a Postgres, Redshift, or AlloyDB | Port | Usually 5432 (Postgres) or 5439 (Redshift) | `5439` | | Database | The logical database to connect to and run queries against. | `analytics` | -**Note**: When you set up a Redshift or Postgres connection in dbt Cloud, SSL-related parameters aren't available as inputs. +**Note**: When you set up a Redshift or Postgres connection in dbt Cloud, SSL-related parameters aren't available as inputs. - + + +For dbt Cloud users, please log in using the default Database username and password. Note this is because [`IAM` authentication](https://docs.aws.amazon.com/redshift/latest/mgmt/generating-user-credentials.html) is not compatible with dbt Cloud. ### Connecting via an SSH Tunnel @@ -23,7 +25,7 @@ To connect to a Postgres, Redshift, or AlloyDB instance via an SSH tunnel, selec Once the connection is saved, a public key will be generated and displayed for the Connection. You can copy this public key to the bastion server to authorize dbt Cloud to connect to your database via the bastion server. - + #### About the Bastion server in AWS @@ -47,24 +49,53 @@ To configure the SSH tunnel in dbt Cloud, you'll need to provide the hostname/IP - Verify the bastion server has its network security rules set up to accept connections from the [dbt Cloud IP addresses](/docs/cloud/about-cloud/regions-ip-addresses) on whatever port you configured. - Set up the user account by using the bastion servers instance's CLI, The following example uses the username `dbtcloud:` - `sudo groupadd dbtcloud`
- - `sudo useradd -m -g dbtcloud dbtcloud`
- - `sudo su - dbtcloud`
- - `mkdir ~/.ssh`
- - `chmod 700 ~/.ssh`
- - `touch ~/.ssh/authorized_keys`
- - `chmod 600 ~/.ssh/authorized_keys`
- +```shell +sudo groupadd dbtcloud +sudo useradd -m -g dbtcloud dbtcloud +sudo su - dbtcloud +mkdir ~/.ssh +chmod 700 ~/.ssh +touch ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys +``` + - Copy and paste the dbt Cloud generated public key, into the authorized_keys file. The Bastion server should now be ready for dbt Cloud to use as a tunnel into the Redshift environment. +#### Intermittent connection issues + +
+ Database Error - could not connect to server: Connection timed out + You will have the following components when you configure a connection to a database using an SSH tunnel:
+ - An Elastic Load Balancer (ELB) or Network Load Balancer (NLB) instance.
+ - A bastion host (or jump server) running the sshd process.
+ - A database (such as a Redshift cluster)

+ +dbt Cloud establishes an SSH tunnel connection through the ELB or NLB to the sshd process. This is responsible for routing traffic to the database. When dbt initiates a job run, an SSH tunnel is created at the start of the run. If this SSH tunnel fails at any point, the job will also fail.
+ + The most common causes of tunnel failures are:
+ - The SSH daemon terminates the session due to an idle timeout.
+ - The ELB or NLB terminates the connection when it's idle.
+
+dbt Cloud sets two values for its SSH tunnel, `ServerAliveInterval` and `ServerAliveCountMax`, so that the connection is polled every 30 seconds, and the underlying OS in our run "pods" will terminate the connection if the `sshd` process fails to respond after 300 seconds. In many cases, this prevents an idle timeout entirely, so long as the customer is not using an ELB with a firewall-level idle timeout of less than 30 seconds. However, if the customer is using an ELB with an idle connection timeout of less than 30 seconds, these settings will be insufficient to prevent tunnels from being terminated.
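For illustration only, the client-side OpenSSH equivalent of those keep-alive settings looks like the following sketch; dbt Cloud applies this on its side, so you don't need to run it yourself (the host names are placeholders, and 5439 is simply the default Redshift port from the table above):

```bash
# Send a keep-alive every 30 seconds and give up after 10 missed replies (roughly 300 seconds):
ssh -o ServerAliveInterval=30 -o ServerAliveCountMax=10 \
    -N -L 5439:your-redshift-hostname:5439 dbtcloud@your-bastion-hostname
```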
+
+Some versions of Linux used on bastion hosts use a version of `sshd` with an additional idle timeout setting, `ClientAliveCountMax`. This value sets the number of client alive messages that may be sent without `sshd` receiving any messages back from the client. If this threshold is reached while client alive messages are being sent, `sshd` will disconnect the client, terminating the session. The client-alive mechanism is helpful when the client or server needs to know when a connection has become inactive. The default value is 3.

+ +`ClientAliveInterval`:
+This value sets a timeout interval in seconds after which if no data has been received from the client, `sshd` will send a message through the encrypted channel to request a response from the client. The default is 0, indicating that these messages will not be sent to the client.
+ +Using default values, tunnels could be terminated prematurely by `sshd`. To solve this problem, the `/etc/ssh/sshd_config` file on the bastion host can be configured with the following values:

+- `ClientAliveCountMax` 10
+- `ClientAliveInterval` 30
+where `ClientAliveCountMax` should be set to a non-zero value and `ClientAliveInterval` should be a value less than the ELB or NLB idle timeout value.
+ +With these settings, unresponsive SSH clients will be disconnected after approximately 300 seconds, helping to prevent tunnel failures. +
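A minimal sketch of applying those values on the bastion host, assuming a systemd-based distribution and that the settings aren't already present in `/etc/ssh/sshd_config`:

```bash
printf 'ClientAliveInterval 30\nClientAliveCountMax 10\n' | sudo tee -a /etc/ssh/sshd_config
sudo systemctl restart sshd   # the service may be named "ssh" on Debian- or Ubuntu-based hosts
```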
+ + ## Configuration To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Redshift-specific configuration](/reference/resource-configs/redshift-configs). + +To grant users or roles database permissions (access rights and privileges), refer to the [Redshift permissions](/reference/database-permissions/redshift-permissions) page or [Postgres permissions](/reference/database-permissions/postgres-permissions) page. diff --git a/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md b/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md index 4f31c56e8aa..5f1c4cae725 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md @@ -15,7 +15,7 @@ The following fields are required when creating a Snowflake connection | Warehouse | The virtual warehouse to use for running queries. | `transforming` | -**Note:** A crucial part of working with dbt atop Snowflake is ensuring that users (in development environments) and/or service accounts (in deployment to production environments) have the correct permissions to take actions on Snowflake! Here is documentation of some [example permissions to configure Snowflake access](/reference/snowflake-permissions). +**Note:** A crucial part of working with dbt atop Snowflake is ensuring that users (in development environments) and/or service accounts (in deployment to production environments) have the correct permissions to take actions on Snowflake! Here is documentation of some [example permissions to configure Snowflake access](/reference/database-permissions/snowflake-permissions). ### Username / Password @@ -30,31 +30,34 @@ to authenticate dbt Cloud to run queries against Snowflake on behalf of a Snowfl ### Key Pair + **Available in:** Development environments, Deployment environments The `Keypair` auth method uses Snowflake's [Key Pair Authentication](https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication) to authenticate Development or Deployment credentials for a dbt Cloud project. -After [generating an encrypted key pair](https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication), be sure to set the `rsa_public_key` for the Snowflake user to authenticate in dbt Cloud: +1. After [generating an encrypted key pair](https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication), be sure to set the `rsa_public_key` for the Snowflake user to authenticate in dbt Cloud: ```sql alter user jsmith set rsa_public_key='MIIBIjANBgkqh...'; ``` -Finally, set the "Private Key" and "Private Key Passphrase" fields in the "Edit -Credentials" page to finish configuring dbt Cloud to authenticate with Snowflake -using a key pair. - -**Note:** At this time ONLY Encrypted Private Keys are supported by dbt Cloud, and the keys must be of size 4096 or smaller. +2. Finally, set the **Private Key** and **Private Key Passphrase** fields in the **Credentials** page to finish configuring dbt Cloud to authenticate with Snowflake using a key pair. + + **Note:** At this time ONLY Encrypted Private Keys are supported by dbt Cloud, and the keys must be of size 4096 or smaller. -In order to successfully fill in the Private Key field, you **must** include the commented lines below when you add the passphrase. 
Leaving the `PRIVATE KEY PASSPHRASE` field empty will return an error - have a look at the examples below: +3. To successfully fill in the Private Key field, you **must** include commented lines when you add the passphrase. Leaving the **Private Key Passphrase** field empty will return an error. If you're receiving a `Could not deserialize key data` or `JWT token` error, refer to [Troubleshooting](#troubleshooting) for more info. **Example:** + ```sql -----BEGIN ENCRYPTED PRIVATE KEY----- -< encrypted private key contents here > +< encrypted private key contents here - line 1 > +< encrypted private key contents here - line 2 > +< ... > -----END ENCRYPTED PRIVATE KEY----- ``` - + + ### Snowflake OAuth @@ -68,3 +71,36 @@ more information on configuring a Snowflake OAuth connection in dbt Cloud, pleas ## Configuration To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Snowflake-specific configuration](/reference/resource-configs/snowflake-configs). + +## Troubleshooting + + +If you're receiving a `Could not deserialize key data` or `JWT token` error, refer to the following causes and solutions: + +
+ +Error: Could not deserialize key data + + - Possible cause + + - This could be because of mistakes like not copying correctly, missing dashes, or leaving out commented lines. + - Solution + + - You can copy the key from its source and paste it into a text editor to verify it before using it in dbt Cloud. + +
+ +
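If you hit the `Could not deserialize key data` error above, one way to sanity-check the key file locally before pasting it into dbt Cloud is to run it through OpenSSL (this assumes the encrypted private key is saved as `rsa_key.p8`; you'll be prompted for the passphrase):

```bash
# Prints the corresponding public key if the file and passphrase are valid;
# a truncated or badly pasted key fails here before you ever reach dbt Cloud.
openssl rsa -in rsa_key.p8 -pubout
```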
+Error: JWT token + + - Possible causes + + - This could be a transient issue between Snowflake and dbt Cloud. When connecting to Snowflake, dbt gets a JWT token valid for only 60 seconds. If there's no response from Snowflake within this time, you might see a `JWT token is invalid` error in dbt Cloud. + - The public key was not entered correctly in Snowflake. + + - Solutions + + - dbt needs to retry connections to Snowflake. + - Confirm and enter Snowflake's public key correctly. Additionally, you can reach out to Snowflake for help or refer to this Snowflake doc for more info: [Key-Based Authentication Failed with JWT token is invalid Error](https://community.snowflake.com/s/article/Key-Based-Authentication-Failed-with-JWT-token-is-invalid-Error). + +
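For the second cause of the JWT error (a mis-entered public key), one hedged way to confirm that the key registered in Snowflake matches your local key pair is to compare SHA-256 fingerprints (assumes your public key is saved as `rsa_key.pub`):

```bash
# Outputs a base64 SHA-256 digest of the public key; Snowflake reports the same digest
# (prefixed with "SHA256:") in the user's RSA_PUBLIC_KEY_FP property.
openssl rsa -pubin -in rsa_key.pub -outform DER | openssl dgst -sha256 -binary | openssl enc -base64
```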
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-ide.md b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-ide.md new file mode 100644 index 00000000000..3c41432bc62 --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-ide.md @@ -0,0 +1,37 @@ +--- +title: "dbt Cloud IDE" +description: "Learn how to configure Git in dbt Cloud" +pagination_next: "docs/cloud/dbt-cloud-ide/develop-in-the-cloud" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + +
\ No newline at end of file diff --git a/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md index cfae00b960e..0ceb4929530 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md @@ -3,6 +3,7 @@ title: "Tips and tricks" id: dbt-cloud-tips description: "Check out any dbt Cloud and IDE-related tips." sidebar_label: "Tips and tricks" +pagination_next: null --- # dbt Cloud tips @@ -16,7 +17,7 @@ There are default keyboard shortcuts that can help make development more product - Press Fn-F1 to view a full list of the editor shortcuts - Command-O on macOS or Control-O on Windows to select a file to open - Command-P/Command-Shift-P on macOS or Control-P/Control-Shift-P on Windows to see the command palette -- Hold Option-click-on-area on macOS or Hold-Alt-click-on-area on Windows to select multiple lines and perform a multi-edit. You can also press Command-E to perform this operation on the command line. +- Hold Option-click-on-area or press Shift-Option-Command on macOS or Hold-Alt-click-on-area on Windows to select multiple lines and perform a multi-edit. You can also press Command-E to perform this operation on the command line. - Command-Enter on macOS or Control-Enter on Windows to Preview your code - Command-Shift-Enter on macOS or Control-Shift-Enter on Windows to Compile - Highlight a portion of code and use the above shortcuts to Preview or Compile code @@ -45,7 +46,7 @@ There are default keyboard shortcuts that can help make development more product - Use [severity](/reference/resource-configs/severity) thresholds to set an acceptable number of failures for a test. - Use [incremental_strategy](/docs/build/incremental-models#about-incremental_strategy) in your incremental model config to implement the most effective behavior depending on the volume of your data and reliability of your unique keys. - Set `vars` in your `dbt_project.yml` to define global defaults for certain conditions, which you can then override using the `--vars` flag in your commands. -- Use [for loops](/guides/advanced/using-jinja#use-a-for-loop-in-models-for-repeated-sql) in Jinja to [DRY](https://docs.getdbt.com/terms/dry) up repetitive logic, such as selecting a series of columns that all require the same transformations and naming patterns to be applied. +- Use [for loops](/guides/using-jinja?step=3) in Jinja to DRY up repetitive logic, such as selecting a series of columns that all require the same transformations and naming patterns to be applied. - Instead of relying on post-hooks, use the [grants config](/reference/resource-configs/grants) to apply permission grants in the warehouse resiliently. - Define [source-freshness](/docs/build/sources#snapshotting-source-data-freshness) thresholds on your sources to avoid running transformations on data that has already been processed. - Use the `+` operator on the left of a model `dbt build --select +model_name` to run a model and all of its upstream dependencies. Use the `+` operator on the right of the model `dbt build --select model_name+` to run a model and everything downstream that depends on it. 
@@ -58,6 +59,6 @@ There are default keyboard shortcuts that can help make development more product ## Related docs -- [Quickstart guide](/quickstarts) +- [Quickstart guide](/guides) - [About dbt Cloud](/docs/cloud/about-cloud/dbt-cloud-features) - [Develop in the Cloud](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) diff --git a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md index 000723a933c..9fc382f0217 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md @@ -1,23 +1,31 @@ --- -title: "Develop in the IDE" +title: "About the dbt Cloud IDE" id: develop-in-the-cloud description: "Develop, test, run, and build in the Cloud IDE. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly" -sidebar_label: Develop in the IDE +sidebar_label: About the IDE tags: [IDE] +pagination_next: "docs/cloud/dbt-cloud-ide/ide-user-interface" +pagination_prev: null --- -The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. The IDE leverages the open-source [dbt-rpc](/reference/commands/rpc) plugin to recompile only the changes made in your project. +The dbt Cloud integrated development environment (IDE) is a single web-based interface for building, testing, running, and version-controlling dbt projects. It compiles dbt code into SQL and executes it directly on your database. +The dbt Cloud IDE offers several [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) for faster and more efficient data platform development and governance: -## Prerequisites +- Syntax highlighting for SQL: Makes it easy to distinguish different parts of your code, reducing syntax errors and enhancing readability. +- Auto-completion: Suggests table names, arguments, and column names as you type, saving time and reducing typos. +- Code [formatting and linting](/docs/cloud/dbt-cloud-ide/lint-format): Help standardize and fix your SQL code effortlessly. +- Navigation tools: Easily move around your code, jump to specific lines, find and replace text, and navigate between project files. +- Version control: Manage code versions with a few clicks. -To develop in the Cloud IDE, make sure you have the following: +These [features](#dbt-cloud-ide-features) create a powerful editing environment for efficient SQL coding, suitable for both experienced and beginner developers. -- A [dbt Cloud account](https://cloud.getdbt.com/) and [Developer seat license](/docs/cloud/manage-access/seats-and-users) -- A git repository set up and git provider must have `write` access enabled. 
See [Connecting your GitHub Account](/docs/cloud/git/connect-github) or [Importing a project by git URL](/docs/cloud/git/import-a-project-by-git-url) for detailed setup instructions -- A dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections) -- A [development environment and development credentials](#access-the-cloud-ide) set up -- The environment must be on dbt version 1.0 or higher + + + + + + :::tip Disable ad blockers @@ -25,21 +33,16 @@ To improve your experience using dbt Cloud, we suggest that you turn off ad bloc ::: -## Develop in the Cloud IDE - -The Cloud IDE is a powerful tool that can help streamline and govern your data platform development process. It offers a range of [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) that can help make your data platform development process faster and more efficient. Some of the editing features include: - -- The IDE has syntax highlighting for SQL. This makes it easy to visually distinguish between different parts of your code. This helps prevent syntax errors and improve readability. -- Use the IDE built-in auto-completion, which suggests table names, arguments, and column names as you type. This saves time and reduces the likelihood of typos or errors in your code. -- The code [formatting and linting](/docs/cloud/dbt-cloud-ide/lint-format) tools allow you to standardize and fix your SQL code with ease. -- The IDE has a range of navigation tools, making it easy to move around your code with ease. You can quickly jump to specific lines of code, find and replace text, and navigate between different files in your project. -- Use the version control menu and features to version-control your code with just a few clicks. +## Prerequisites -All of these [features](#cloud-ide-features) work together to create a powerful editing environment that can help you write and maintain high-quality SQL code in less time. Whether you're a seasoned developer or just starting out, the Cloud IDE has everything you need to be productive, collaborative, and efficient. +- A [dbt Cloud account](https://cloud.getdbt.com/) and [Developer seat license](/docs/cloud/manage-access/seats-and-users) +- A git repository set up and git provider must have `write` access enabled. See [Connecting your GitHub Account](/docs/cloud/git/connect-github) or [Importing a project by git URL](/docs/cloud/git/import-a-project-by-git-url) for detailed setup instructions +- A dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections) +- A [development environment and development credentials](#access-the-cloud-ide) set up +- The environment must be on dbt version 1.0 or higher - -## Cloud IDE features +## dbt Cloud IDE features The dbt Cloud IDE comes with [tips](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips) and [features](/docs/cloud/dbt-cloud-ide/ide-user-interface) that make it easier for you to develop, build, compile, run, and test data models. @@ -55,7 +58,7 @@ To stay informed on IDE updates, read [dbt Cloud IDE release notes](/tags/ide), | **File state indicators** | Ability to see when changes or actions have been made to the file. The indicators **M, D, A,** and **•** appear to the right of your file or folder name and indicate the actions performed:

- Unsaved **(•)** — The IDE detects unsaved changes to your file/folder
- Modification **(M)** — The IDE detects a modification of existing files/folders
- Added **(A)** — The IDE detects added files
- Deleted **(D)** — The IDE detects deleted files. | **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly into the IDE.

- Create or change branches
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Execute git commands using the git button
- Link to the repo directly by clicking the branch name | | **Project documentation** | Generate and view your [project documentation](/docs/collaborate/build-and-view-your-docs) for your dbt project in real-time. You can inspect and verify what your project's documentation will look like before you deploy your changes to production. | -| **Preview and Compile button** | You can run your code against your data platform by clicking the **Preview**. Use the **Compile** button in the IDE to generate executable SQL, which occurs locally within dbt. | +| **Preview and Compile button** | You can [compile or preview](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section) code, a snippet of dbt code, or one of your dbt models after editing and saving. | | **Build, test, and run button** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. | **Command bar** | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking History on the left of the bar. | **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. @@ -75,7 +78,7 @@ To stay informed on IDE updates, read [dbt Cloud IDE release notes](/tags/ide), There are three start-up states when using or launching the Cloud IDE: - **Creation start —** This is the state where you are starting the IDE for the first time. You can also view this as a *cold start* (see below), and you can expect this state to take longer because the git repository is being cloned. -- **Cold start —** This is the process of starting a new develop session, which will be available for you for three hours. The environment automatically turns off three hours after the last activity with the rpc server. This includes compile, preview, or any dbt invocation, however, it *does not* include editing and saving a file. +- **Cold start —** This is the process of starting a new develop session, which will be available for you for three hours. The environment automatically turns off three hours after the last activity. This includes compile, preview, or any dbt invocation, however, it *does not* include editing and saving a file. - **Hot start —** This is the state of resuming an existing or active develop session within three hours of the last activity. ### Work retention @@ -85,14 +88,14 @@ The Cloud IDE needs explicit action to save your changes. There are three ways y - **Unsaved, local code —** The browser stores your code only in its local storage. In this state, you might need to commit any unsaved changes in order to switch branches or browsers. If you have saved and committed changes, you can access the "Change branch" option even if there are unsaved changes. But if you attempt to switch branches without saving changes, a warning message will appear, notifying you that you will lose any unsaved changes. -- **Saved but uncommitted code —** When you save a file, the data gets stored in durable, long-term storage. To access the Change branch option, you must "Commit and sync" or "Revert" changes - changing branches isn't available for saved-but-uncommitted code. 
+- **Saved but uncommitted code —** When you save a file, the data gets stored in durable, long-term storage, but isn't synced back to git. To switch branches using the **Change branch** option, you must "Commit and sync" or "Revert" changes. Changing branches isn't available for saved-but-uncommitted code. This is to ensure your uncommitted changes don't get lost. - **Committed code —** This is stored in the branch with your git provider and you can check out other (remote) branches. ## Access the Cloud IDE -:::info📌 +:::tip Disable ad blockers -New to dbt? Check out our [quickstart guides](/quickstarts) to build your first dbt project in the Cloud IDE! +To improve your experience using dbt Cloud, we suggest that you turn off ad blockers. This is because some project file names, such as `google_adwords.sql`, might resemble ad traffic and trigger ad blockers. ::: @@ -156,13 +159,15 @@ The dbt Cloud IDE makes it possible to [build and view](/docs/collaborate/build-
- What is the difference between developing on the Cloud IDE and on the CLI? + What is the difference between developing on the dbt Cloud IDE, the dbt Cloud CLI, and dbt Core?
-
There are two main ways to develop with dbt: using the web-based IDE in dbt Cloud or using the command-line interface (CLI) in dbt Core:

- - dbt Cloud IDE dbt Cloud is a web-based application that allows you to develop dbt projects with the IDE, includes a purpose-built scheduler, and provides an easier way to share your dbt documentation with your team. The IDE is a faster and more reliable way to deploy your dbt models and provides a real-time editing and execution environment for your dbt project.

- - dbt Core CLI The command line interface (CLI) uses dbt Core, an open-source software that’s freely available. You can build your dbt project in a code editor, like Jetbrains or VSCode, and run dbt commands from the command line. +
You can develop dbt using the web-based IDE in dbt Cloud or on the command line interface using the dbt Cloud CLI or open-source dbt Core, all of which enable you to execute dbt commands. The key distinction between the dbt Cloud CLI and dbt Core is that the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its features.

+ + dbt Cloud IDE: dbt Cloud is a web-based application that allows you to develop dbt projects with the IDE, includes a purpose-built scheduler, and provides an easier way to share your dbt documentation with your team. The IDE is a faster and more reliable way to deploy your dbt models and provides a real-time editing and execution environment for your dbt project.

+ + dbt Cloud CLI: The dbt Cloud CLI allows you to run dbt commands against your dbt Cloud development environment from your local command line or code editor. It supports cross-project ref, speedier, lower-cost builds, automatic deferral of build artifacts, and more.

+
+ dbt Core: dbt Core is open-source software that's freely available. You can build your dbt project in a code editor and run dbt commands from the command line.
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md index 63a4f9a0312..05910b23e7f 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md @@ -28,7 +28,7 @@ The IDE streamlines your workflow, and features a popular user interface layout 4. **File Explorer —** The File Explorer shows the filetree of your repository. You can: - Click on any file in the filetree to open the file in the File Editor. - Click and drag files between directories to move files. - - Right click a file to access the sub-menu options like duplicate file, copy file name, copy as `ref`, rename, delete. + - Right-click a file to access the sub-menu options like duplicate file, copy file name, copy as `ref`, rename, delete. - **Note**: To perform these actions, the user must not be in `read-only` mode, which generally happens when the user is viewing the default branch. - Use file indicators, located to the right of your files or folder name, to see when changes or actions were made: * Unsaved (•) — The IDE detects unsaved changes to your file/folder @@ -36,11 +36,13 @@ The IDE streamlines your workflow, and features a popular user interface layout * Added (A) — The IDE detects added files * Deleted (D) — The IDE detects deleted files. - + 5. **Command bar —** The Command bar, located in the lower left of the IDE, is used to invoke [dbt commands](/reference/dbt-commands). When a command is invoked, the associated logs are shown in the Invocation History Drawer. -6. **IDE Status button —** The IDE Status button, located on the lower right of the IDE, displays the current IDE status. If there is an error in the status or in the dbt code that stops the project from parsing, the button will turn red and display "Error". If there aren't any errors, the button will display a green "Ready" status. To access the [IDE Status modal](#modals-and-menus), simply click on this button. +6. **Defer to production —** The **Defer to production** toggle allows developers to only build and run and test models they've edited without having to first run and build all the models that come before them (upstream parents). Refer to [Using defer in dbt Cloud](/docs/cloud/about-cloud-develop-defer#defer-in-the-dbt-cloud-ide) for more info. + +7. **Status button —** The IDE Status button, located on the lower right of the IDE, displays the current IDE status. If there is an error in the status or in the dbt code that stops the project from parsing, the button will turn red and display "Error". If there aren't any errors, the button will display a green "Ready" status. To access the [IDE Status modal](#modals-and-menus), simply click on this button. ## Editing features @@ -72,23 +74,35 @@ The IDE features some delightful tools and layouts to make it easier for you to - **Git Diff View —** Clicking on a file in the **Changes** section of the **Version Control Menu** will open the changed file with Git Diff view. The editor will show the previous version on the left and the in-line changes made on the right. -- **Markdown Preview console tab —** The Markdown Preview console tab shows a preview of your .md file's markdown code in your repository, and updates it automatically as you edit your code. +- **Markdown Preview console tab —** The Markdown Preview console tab shows a preview of your .md file's markdown code in your repository and updates it automatically as you edit your code. 
- **CSV Preview console tab —** The CSV Preview console tab displays the data from your CSV file in a table, which updates automatically as you edit the file in your seed directory. ## Console section + The console section, located below the File editor, includes various console tabs and buttons to help you with tasks such as previewing, compiling, building, and viewing the . Refer to the following sub-bullets for more details on the console tabs and buttons. -1. **Preview button —** When you click on the Preview button, it runs the SQL in the active file editor regardless of whether you have saved it or not, and sends the results to the Results console tab. - * To prevent the IDE from returning too much data and causing browser problems, a limit of 500 is automatically added to queries executed via the Preview Button. However, you can change this by adding `limit your_number` at the end of your SQL statement. For example, `SELECT * FROM` table `limit 100` will return up to 100 rows. Remember that you must write the `limit your_number` explicitly and cannot derive it from a macro. - * The IDE also supports `SELECT TOP #`, which specifies the number of records to return. +1. **Preview button —** When you click on the Preview button, it runs the SQL in the active file editor regardless of whether you have saved it or not and sends the results to the **Results** console tab. You can preview a selected portion of saved or unsaved code by highlighting it and then clicking the **Preview** button. + +
+Row limits in IDE +The dbt Cloud IDE returns default row limits, however, you can also specify the number of records returned. Refer to the following sub-bullets for more info:

+
    +
  • 500-row limit: To prevent the IDE from returning too much data and causing browser problems, dbt automatically sets a 500-row limit when using the Preview Button. You can modify this by adding limit your_number at the end of your SQL statement. For example, SELECT * FROM table limit 100 will return up to 100 rows. Remember that you must write the limit your_number explicitly and cannot derive it from a macro.
  • +
  • Change row limit default: In dbt version 1.6 or higher, you have the ability to change the default limit of 500 rows shown in the Results tab when you run a query. To adjust the setting you can click on Change row display next to the displayed rows. Keep in mind that you can't set it higher than 10,000 rows. If you refresh the page or close your development session, the default limit will go back to 500 rows.
  • +
  • Specify records returned: The IDE also supports SELECT TOP #, which specifies the number of records to return.
  • +
+
-2. **Compile button —** The Compile button compiles the SQL code from the active File Editor, irrespective of its save status, and outputs it to the Compiled Code tab. +2. **Compile button —** The **Compile** button compiles the saved or unsaved SQL code and displays it in the **Compiled Code** tab. -3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. + +Starting from dbt v1.6 or higher, when you save changes to a model, you can compile its code with the model's specific context. This context is similar to what you'd have when building the model and involves useful context variables like `{{ this }} `or `{{ is_incremental() }}`. + +3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource, and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. 3. **Format button —** The editor has a **Format** button that can reformat the contents of your files. For SQL files, it uses either `sqlfmt` or `sqlfluff`, and for Python files, it uses `black`. @@ -106,9 +120,10 @@ The console section, located below the File editor, includes various console tab ## Invocation history -The Invocation History Drawer stores information on dbt invocations in the IDE. When you invoke a command (like execute a dbt command such as `dbt run`), the associated logs are displayed in the Invocation History Drawer. -You can open the drawer multiple ways: +The Invocation History Drawer stores information on dbt invocations in the IDE. When you invoke a command, like executing a dbt command such as `dbt run`, the associated logs are displayed in the Invocation History Drawer. + +You can open the drawer in multiple ways: - Clicking the `^` icon next to the Command bar on the lower left of the page - Typing a dbt command and pressing enter - Or pressing Control-backtick (or Ctrl + `) @@ -117,15 +132,15 @@ You can open the drawer multiple ways: 1. **Invocation History list —** The left-hand panel of the Invocation History Drawer displays a list of previous invocations in the IDE, including the command, branch name, command status, and elapsed time. -2. **Invocation Summary —** The Invocation Summary, located above **System Logs**, displays information about a selected command from the Invocation History list , such as the command, its status (`Running` if it's still running), the git branch that was active during the command, and the time the command was invoked. +2. **Invocation Summary —** The Invocation Summary, located above **System Logs**, displays information about a selected command from the Invocation History list, such as the command, its status (`Running` if it's still running), the git branch that was active during the command, and the time the command was invoked. -3. **System Logs toggle —** The System Logs toggle, located under the Invocation Summary, allows the user to see the full stdout and debug logs for entirety of the invoked command. +3. 
**System Logs toggle —** The System Logs toggle, located under the Invocation Summary, allows the user to see the full stdout and debug logs for the entirety of the invoked command. -4. **Command Control button —** Use the Command Control button, located on the right-side, to control your invocation and cancel or rerun a selected run. +4. **Command Control button —** Use the Command Control button, located on the right side, to control your invocation and cancel or rerun a selected run. -5. **Node Summary tab —** Clicking on the Results Status Tabs will filter the Node Status List based on their corresponding status. The available statuses are Pass (successful invocation of a node), Warn (test executed with warning), Error (database error or test failure), Skip (nodes not run due to upstream error), and Queued (nodes that have not executed yet). +5. **Node Summary tab —** Clicking on the Results Status Tabs will filter the Node Status List based on their corresponding status. The available statuses are Pass (successful invocation of a node), Warn (test executed with a warning), Error (database error or test failure), Skip (nodes not run due to upstream error), and Queued (nodes that have not executed yet). 6. **Node result toggle —** After running a dbt command, information about each executed node can be found in a Node Result toggle, which includes a summary and debug logs. The Node Results List lists every node that was invoked during the command. @@ -135,12 +150,12 @@ You can open the drawer multiple ways: ## Modals and Menus Use menus and modals to interact with IDE and access useful options to help your development workflow. -- **Editor tab menu —** To interact with open editor tabs, right-click any tab to access the helpful options in the file tab menu. +- **Editor tab menu —** To interact with open editor tabs, right-click any tab to access the helpful options in the file tab menu. - **File Search —** You can easily search for and navigate between files using the File Navigation menu, which can be accessed by pressing Command-O or Control-O or clicking on the 🔍 icon in the File Explorer. -- **Global Command Palette—** The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, compile, and preview actions, among others. To open the menu, use Command-P or Control-P. +- **Global Command Palette—** The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, and compile, and preview actions, among others. To open the menu, use Command-P or Control-P. - **IDE Status modal —** The IDE Status modal shows the current error message and debug logs for the server. This also contains an option to restart the IDE. Open this by clicking on the IDE Status button. @@ -159,7 +174,7 @@ Use menus and modals to interact with IDE and access useful options to help your * Toggling between dark or light mode for a better viewing experience * Restarting the IDE - * Fully recloning your repository to refresh your git state and viewing status details + * Fully recloning your repository to refresh your git state and view status details * Viewing status details, including the IDE Status modal. 
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md b/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md index 099641cc22f..f145e76df11 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md @@ -45,7 +45,11 @@ With the dbt Cloud IDE, you can seamlessly use [SQLFluff](https://sqlfluff.com/) - Works with Jinja and SQL, - Comes with built-in [linting rules](https://docs.sqlfluff.com/en/stable/rules.html). You can also [customize](#customize-linting) your own linting rules. - Empowers you to [enable linting](#enable-linting) with options like **Lint** (displays linting errors and recommends actions) or **Fix** (auto-fixes errors in the IDE). -- Displays a **Code Quality** tab to view code errors, and provides code quality visibility and management. +- Displays a **Code Quality** tab to view code errors, and provides code quality visibility and management. + +:::info Ephemeral models not supported +Linting doesn't support ephemeral models in dbt v1.5 and lower. Refer to the [FAQs](#faqs) for more info. +::: ### Enable linting @@ -63,7 +67,7 @@ With the dbt Cloud IDE, you can seamlessly use [SQLFluff](https://sqlfluff.com/) ### Customize linting -SQLFluff is a configurable SQL linter, which means you can configure your own linting rules instead of using the default linting settings in the IDE. +SQLFluff is a configurable SQL linter, which means you can configure your own linting rules instead of using the default linting settings in the IDE. You can exclude files and directories by using a standard `.sqlfluffignore` file. Learn more about the syntax in the [.sqlfluffignore syntax docs](https://docs.sqlfluff.com/en/stable/configuration.html#id2). To configure your own linting rules: @@ -76,7 +80,7 @@ To configure your own linting rules: :::tip Configure dbtonic linting rules -Use the following code example to incorporate well-written dbt code (or dbtonic) to your linting: +Refer to the [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) to add the dbt code (or dbtonic) rules we use for our own projects:
dbtonic config code example provided by dbt Labs @@ -122,6 +126,8 @@ capitalisation_policy = lower group_by_and_order_by_style = implicit ```
+ +For more info on styling best practices, refer to [How we style our SQL](/best-practices/how-we-style/2-how-we-style-our-sql). ::: @@ -221,6 +227,12 @@ Currently, running SQLFluff commands from the terminal isn't supported. Make sure you're on a development branch. Formatting or Linting isn't available on "main" or "read-only" branches. +
+Why is there inconsistent SQLFluff behavior when running outside the dbt Cloud IDE (such as a GitHub Action)? +— Double-check your SQLFluff version matches the one in the dbt Cloud IDE (found in the Code Quality tab after a lint operation).

+— If your lint operation passes despite clear rule violations, confirm you're not linting models that use ephemeral models. Linting doesn't support ephemeral models in dbt v1.5 and lower. +
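+
+For example, a minimal CI lint step along the lines of the following sketch can help keep results consistent. The version number, path, and dialect shown here are placeholders rather than recommendations; pin whatever version the Code Quality tab reports and lint the same paths you lint in the IDE:
+
+```shell
+# Sketch of a CI lint step, not an official workflow. Pin SQLFluff to the
+# version shown in the dbt Cloud IDE's Code Quality tab (2.3.2 is a placeholder).
+pip install "sqlfluff==2.3.2"
+
+# The --dialect flag is an example; omit it if the dialect is already set
+# in your .sqlfluff configuration file.
+sqlfluff lint models/ --dialect snowflake
+```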
+ ## Related docs - [User interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) diff --git a/website/docs/docs/cloud/git/authenticate-azure.md b/website/docs/docs/cloud/git/authenticate-azure.md index 9e755519e67..42028bf993b 100644 --- a/website/docs/docs/cloud/git/authenticate-azure.md +++ b/website/docs/docs/cloud/git/authenticate-azure.md @@ -3,10 +3,11 @@ title: "Authenticate with Azure DevOps" id: "authenticate-azure" description: "dbt Cloud developers need to authenticate with Azure DevOps." sidebar_label: "Authenticate with Azure DevOps" +pagination_next: null --- -If you use the dbt Cloud IDE to collaborate on your team's Azure DevOps dbt repo, you need to [link your dbt Cloud profile to Azure DevOps](#link-your-dbt-cloud-profile-to-azure-devops), which provides an extra layer of authentication. +If you use the dbt Cloud IDE or dbt Cloud CLI to collaborate on your team's Azure DevOps dbt repo, you need to [link your dbt Cloud profile to Azure DevOps](#link-your-dbt-cloud-profile-to-azure-devops), which provides an extra layer of authentication. ## Link your dbt Cloud profile to Azure DevOps @@ -26,3 +27,4 @@ You will be directed back to dbt Cloud, and your profile should be linked. You a ## FAQs + diff --git a/website/docs/docs/cloud/git/connect-azure-devops.md b/website/docs/docs/cloud/git/connect-azure-devops.md index a84e593a1e2..c138e042abc 100644 --- a/website/docs/docs/cloud/git/connect-azure-devops.md +++ b/website/docs/docs/cloud/git/connect-azure-devops.md @@ -1,6 +1,7 @@ --- title: "Connect to Azure DevOps" id: "connect-azure-devops" +pagination_next: "docs/cloud/git/setup-azure" --- @@ -13,7 +14,7 @@ Connect your Azure DevOps cloud account in dbt Cloud to unlock new product exper - Import new Azure DevOps repos with a couple clicks during dbt Cloud project setup. - Clone repos using HTTPS rather than SSH - Enforce user authorization with OAuth 2.0. -- Carry Azure DevOps user repository permissions (read / write access) through to dbt Cloud IDE's git actions. +- Carry Azure DevOps user repository permissions (read / write access) through to dbt Cloud IDE or dbt Cloud CLI's git actions. - Trigger Continuous integration (CI) builds when pull requests are opened in Azure DevOps. @@ -23,3 +24,4 @@ To connect Azure DevOps in dbt Cloud: 2. dbt Cloud developers need to [personally authenticate with Azure DevOps](/docs/cloud/git/authenticate-azure) from dbt Cloud. +If you're a Business Critical customer using [IP restrictions](/docs/cloud/secure/ip-restrictions), ensure you've added the appropriate Azure DevOps CIDRs to your IP restriction rules, or else the Azure DevOps connection will fail. diff --git a/website/docs/docs/cloud/git/connect-github.md b/website/docs/docs/cloud/git/connect-github.md index d5ead96d940..ff0f2fff18f 100644 --- a/website/docs/docs/cloud/git/connect-github.md +++ b/website/docs/docs/cloud/git/connect-github.md @@ -56,7 +56,7 @@ If you are your GitHub organization owner, you can also configure the dbt Cloud ## Personally authenticate with GitHub -Once the dbt Cloud admin has [set up a connection](docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization GitHub account, you need to personally authenticate, which improves the security of dbt Cloud by enabling you to log in using OAuth through GitHub. 
+Once the dbt Cloud admin has [set up a connection](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization GitHub account, you need to personally authenticate, which improves the security of dbt Cloud by enabling you to log in using OAuth through GitHub. :::info GitHub profile connection - dbt Cloud developers on the [Enterprise plan](https://www.getdbt.com/pricing/) must each connect their GitHub profiles to dbt Cloud. This is because the dbt Cloud IDE verifies every developer's read / write access for the dbt repo. @@ -74,9 +74,9 @@ To connect a personal GitHub account: 4. Once you approve authorization, you will be redirected to dbt Cloud, and you should now see your connected account. -The next time you log into dbt Cloud, you will be able to do so via OAuth through GitHub, and if you're on the Enterprise plan, you're ready to use the dbt Cloud IDE. +The next time you log into dbt Cloud, you will be able to do so via OAuth through GitHub, and if you're on the Enterprise plan, you're ready to use the dbt Cloud IDE or dbt Cloud CLI. ## FAQs - + diff --git a/website/docs/docs/cloud/git/connect-gitlab.md b/website/docs/docs/cloud/git/connect-gitlab.md index 1ec8fb08817..e55552e2d86 100644 --- a/website/docs/docs/cloud/git/connect-gitlab.md +++ b/website/docs/docs/cloud/git/connect-gitlab.md @@ -8,7 +8,7 @@ id: "connect-gitlab" Connecting your GitLab account to dbt Cloud provides convenience and another layer of security to dbt Cloud: - Import new GitLab repos with a couple clicks during dbt Cloud project setup. - Clone repos using HTTPS rather than SSH. -- Carry GitLab user permissions through to dbt Cloud IDE's git actions. +- Carry GitLab user permissions through to dbt Cloud or dbt Cloud CLI's git actions. - Trigger [Continuous integration](/docs/deploy/continuous-integration) builds when merge requests are opened in GitLab. The steps to integrate GitLab in dbt Cloud depend on your plan. If you are on: @@ -35,7 +35,7 @@ Once you've accepted, you should be redirected back to dbt Cloud, and you'll see dbt Cloud enterprise customers have the added benefit of bringing their own GitLab OAuth application to dbt Cloud. This tier benefits from extra security, as dbt Cloud will: - Enforce user authorization with OAuth. -- Carry GitLab's user repository permissions (read / write access) through to dbt Cloud IDE's git actions. +- Carry GitLab's user repository permissions (read / write access) through to dbt Cloud or dbt Cloud CLI's git actions. In order to connect GitLab in dbt Cloud, a GitLab account admin must: 1. [Set up a GitLab OAuth application](#setting-up-a-gitlab-oauth-application). @@ -71,6 +71,8 @@ The application form in GitLab should look as follows when completed: Click **Save application** in GitLab, and GitLab will then generate an **Application ID** and **Secret**. These values will be available even if you close the app screen, so this is not the only chance you have to save them. +If you're a Business Critical customer using [IP restrictions](/docs/cloud/secure/ip-restrictions), ensure you've added the appropriate GitLab CIDRs to your IP restriction rules, or else the GitLab connection will fail. + ### Adding the GitLab OAuth application to dbt Cloud After you've created your GitLab application, you need to provide dbt Cloud information about the app. In dbt Cloud, account admins should navigate to **Account Settings**, click on the **Integrations** tab, and expand the GitLab section.
@@ -95,7 +97,7 @@ You will then be redirected to GitLab and prompted to sign into your account. Gi Once you've accepted, you should be redirected back to dbt Cloud, and your integration is ready for developers on your team to [personally authenticate with](#personally-authenticating-with-gitlab). ### Personally authenticating with GitLab -dbt Cloud developers on the Enterprise plan must each connect their GitLab profiles to dbt Cloud, as every developer's read / write access for the dbt repo is checked in the dbt Cloud IDE. +dbt Cloud developers on the Enterprise plan must each connect their GitLab profiles to dbt Cloud, as every developer's read / write access for the dbt repo is checked in the dbt Cloud IDE or dbt Cloud CLI. To connect a personal GitLab account, dbt Cloud developers should navigate to Your Profile settings by clicking the gear icon in the top right, then select **Linked Accounts** in the left menu. @@ -103,7 +105,7 @@ If your GitLab account is not connected, you’ll see "No connected account". Se -Once you approve authorization, you will be redirected to dbt Cloud, and you should see your connected account. You're now ready to start developing in the dbt Cloud IDE. +Once you approve authorization, you will be redirected to dbt Cloud, and you should see your connected account. You're now ready to start developing in the dbt Cloud IDE or dbt Cloud CLI. ## Troubleshooting @@ -122,3 +124,4 @@ If you imported a repository using the dbt Cloud native integration with GitLab, + diff --git a/website/docs/docs/cloud/git/git-configuration-in-dbt-cloud.md b/website/docs/docs/cloud/git/git-configuration-in-dbt-cloud.md new file mode 100644 index 00000000000..fb8c0186236 --- /dev/null +++ b/website/docs/docs/cloud/git/git-configuration-in-dbt-cloud.md @@ -0,0 +1,37 @@ +--- +title: "Git configuration in dbt Cloud" +description: "Learn about the Git providers supported in dbt Cloud" +pagination_next: "docs/cloud/git/import-a-project-by-git-url" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + +
\ No newline at end of file diff --git a/website/docs/docs/cloud/git/import-a-project-by-git-url.md b/website/docs/docs/cloud/git/import-a-project-by-git-url.md index d84eb99dab8..83846bb1f0b 100644 --- a/website/docs/docs/cloud/git/import-a-project-by-git-url.md +++ b/website/docs/docs/cloud/git/import-a-project-by-git-url.md @@ -1,6 +1,8 @@ --- title: "Import a project by git URL" id: "import-a-project-by-git-url" +pagination_next: "docs/cloud/git/connect-github" +pagination_prev: null --- In dbt Cloud, you can import a git repository from any valid git URL that points to a dbt project. There are some important considerations to keep in mind when doing this. @@ -125,3 +127,7 @@ Don't see your git provider here? Please [contact dbt Support](mailto:support@ge ## Limited integration Some features of dbt Cloud require a tight integration with your git host, for example, updating GitHub pull requests with dbt Cloud run statuses. Importing your project by a URL prevents you from using these features. Once you give dbt Cloud access to your repository, you can continue to set up your project by adding a connection and creating and running your first dbt Cloud job. + +## FAQs + + diff --git a/website/docs/docs/cloud/git/setup-azure.md b/website/docs/docs/cloud/git/setup-azure.md index 9eca77d7014..843371be6ea 100644 --- a/website/docs/docs/cloud/git/setup-azure.md +++ b/website/docs/docs/cloud/git/setup-azure.md @@ -93,7 +93,7 @@ Once you connect your Azure AD app and Azure DevOps, you need to provide dbt Clo - **Directory(tenant) ID:** Found in the Azure AD App. -Your Azure AD app should now be added to your dbt Cloud Account. People on your team who want to develop in dbt Cloud's IDE can now personally [authorize Azure DevOps from their profiles](/docs/cloud/git/authenticate-azure). +Your Azure AD app should now be added to your dbt Cloud Account. People on your team who want to develop in the dbt Cloud IDE or dbt Cloud CLI can now personally [authorize Azure DevOps from their profiles](/docs/cloud/git/authenticate-azure). ## Connect a service user diff --git a/website/docs/docs/cloud/manage-access/about-access.md b/website/docs/docs/cloud/manage-access/about-access.md index 9a95d0aeb68..d394c79baa3 100644 --- a/website/docs/docs/cloud/manage-access/about-access.md +++ b/website/docs/docs/cloud/manage-access/about-access.md @@ -2,6 +2,8 @@ title: "About user access in dbt Cloud" description: "Learn how dbt Cloud administrators can use dbt Cloud's permissioning model to control user-level access in a dbt Cloud account." id: "about-user-access" +pagination_next: "docs/cloud/manage-access/seats-and-users" +pagination_prev: null --- :::info "User access" is not "Model access" @@ -121,12 +123,6 @@ set on the _Internal Analytics_ project. ### Manual assignment - - -- New in version 1.1.23 (March, 2021) - - - dbt Cloud administrators can manually assign users to groups independently of IdP attributes. If a dbt Cloud group is configured _without_ any SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust diff --git a/website/docs/docs/cloud/manage-access/audit-log.md b/website/docs/docs/cloud/manage-access/audit-log.md index 818ec553e7b..b90bceef570 100644 --- a/website/docs/docs/cloud/manage-access/audit-log.md +++ b/website/docs/docs/cloud/manage-access/audit-log.md @@ -3,6 +3,8 @@ title: "The audit log for dbt Cloud Enterprise" id: audit-log description: "You can troubleshoot possible issues and provide security audits by reviewing event activity in your organization." 
sidebar_label: "Audit log" +pagination_next: null +pagination_prev: "docs/cloud/manage-access/about-user-access" --- To review actions performed by people in your organization, dbt provides logs of audited user and system events in real time. The audit log appears as events happen and includes details such as who performed the action, what the action was, and when it was performed. You can use these details to troubleshoot access issues, perform security audits, or analyze specific events. @@ -16,13 +18,9 @@ The dbt Cloud audit log stores all the events that occurred in your organization ## Accessing the audit log -To access audit log, click the gear icon in the top right, then click **Audit Log**. +To access the audit log, click the gear icon in the top right, then click **Audit Log**. -
- - - -
+ ## Understanding the audit log @@ -161,19 +159,17 @@ The audit log supports various events for different objects in dbt Cloud. You wi You can search the audit log to find a specific event or actor, which is limited to the ones listed in [Events in audit log](#events-in-audit-log). The audit log successfully lists historical events spanning the last 90 days. You can search for an actor or event using the search bar, and then narrow your results using the time window. -
- + -
## Exporting logs You can use the audit log to export all historical audit results for security, compliance, and analysis purposes: -- For events within 90 days — dbt Cloud will automatically display the 90 days selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. +- For events within 90 days — dbt Cloud will automatically display the 90-day selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. - For events beyond 90 days — Select **Export All**. The Account Admin will receive an email link to download a CSV file of all the events that occurred in your organization. - + diff --git a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md index baa92b5a98f..24c64a5abed 100644 --- a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md +++ b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md @@ -3,25 +3,29 @@ title: "Users and licenses" description: "Learn how dbt Cloud administrators can use licenses and seats to control access in a dbt Cloud account." id: "seats-and-users" sidebar: "Users and licenses" +pagination_next: "docs/cloud/manage-access/self-service-permissions" +pagination_prev: null --- In dbt Cloud, _licenses_ are used to allocate users to your account. There are three different types of licenses in dbt Cloud: - **Developer** — Granted access to the Deployment and [Development](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) functionality in dbt Cloud. -- **Read-Only** — Intended to view the [artifacts](/docs/deploy/artifacts) created in a dbt Cloud account. -- **IT** — Can manage users, groups, and licenses, among other permissions. Available on Enterprise and Team plans only. +- **Read-Only** — Intended to view the [artifacts](/docs/deploy/artifacts) created in a dbt Cloud account. Read-Only users can receive job notifications but not configure them. +- **IT** — Can manage users, groups, and licenses, among other permissions. IT users can receive job notifications but not configure them. Available on Enterprise and Team plans only. The user's assigned license determines the specific capabilities they can access in dbt Cloud. | Functionality | Developer User | Read-Only Users | IT Users* | | ------------- | -------------- | --------------- | -------- | -| Use the Developer IDE | ✅ | ❌ | ❌ | +| Use the dbt Cloud IDE | ✅ | ❌ | ❌ | +| Use the dbt Cloud CLI | ✅ | ❌ | ❌ | | Use Jobs | ✅ | ❌ | ❌ | | Manage Account | ✅ | ❌ | ✅ | | API Access | ✅ | ❌ | ❌ | | Use [Source Freshness](/docs/deploy/source-freshness) | ✅ | ✅ | ❌ | | Use [Docs](/docs/collaborate/build-and-view-your-docs) | ✅ | ✅ | ❌ | -*Available on Enterprise and Team plans only and doesn't count toward seat usage. +| Receive [Job notifications](/docs/deploy/job-notifications) | ✅ | ✅ | ✅ | +*Available on Enterprise and Team plans only and doesn't count toward seat usage. Please note that IT seats are limited to one seat per Team or Enterprise account.
## Licenses diff --git a/website/docs/docs/cloud/manage-access/enterprise-permissions.md b/website/docs/docs/cloud/manage-access/enterprise-permissions.md index cb338b3dc39..dcacda20deb 100644 --- a/website/docs/docs/cloud/manage-access/enterprise-permissions.md +++ b/website/docs/docs/cloud/manage-access/enterprise-permissions.md @@ -3,6 +3,7 @@ title: "Enterprise permissions" id: "enterprise-permissions" description: "Permission sets for Enterprise plans." hide_table_of_contents: true #For the sake of the tables on this page +pagination_next: null --- import Permissions from '/snippets/_enterprise-permissions-table.md'; @@ -21,11 +22,7 @@ The following roles and permission sets are available for assignment in dbt Clou -## Diagram of the Permission Sets - - - -## How to Set Up RBAC Groups in dbt Cloud +## How to set up RBAC Groups in dbt Cloud Role-Based Access Control (RBAC) is helpful for automatically assigning permissions to dbt admins based on their SSO provider group associations. @@ -34,7 +31,7 @@ Role-Based Access Control (RBAC) is helpful for automatically assigning permissi 1. Select an existing group or create a new group to add RBAC. Name the group (this can be any name you like, but it's recommended to keep it consistent with the SSO groups). If you have configured SSO with SAML 2.0, you may have to use the GroupID instead of the name of the group. -2. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case sensitive and must match the source group formatting. +2. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case-sensitive and must match the source group formatting. 3. Configure the permissions for users within those groups by clicking **Add** in the **Access** section of the window. diff --git a/website/docs/docs/cloud/manage-access/licenses-and-groups.md b/website/docs/docs/cloud/manage-access/licenses-and-groups.md index 88d64f2d9a3..83b926c7445 100644 --- a/website/docs/docs/cloud/manage-access/licenses-and-groups.md +++ b/website/docs/docs/cloud/manage-access/licenses-and-groups.md @@ -117,12 +117,6 @@ set on the _Internal Analytics_ project. ### Manual assignment - - -- New in version 1.1.23 (March, 2021) - - - dbt Cloud administrators can manually assign users to groups independently of IdP attributes. 
If a dbt Cloud group is configured _without_ any SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust diff --git a/website/docs/docs/cloud/manage-access/self-service-permissions.md b/website/docs/docs/cloud/manage-access/self-service-permissions.md index 21cc765b76d..d3c9cf8f5ea 100644 --- a/website/docs/docs/cloud/manage-access/self-service-permissions.md +++ b/website/docs/docs/cloud/manage-access/self-service-permissions.md @@ -12,7 +12,8 @@ The permissions afforded to each role are described below: | ------ | ------ | ----- | | View and edit resources | ✅ | ✅ | | Trigger runs | ✅ | ✅ | -| Access the IDE | ✅ | ✅ | +| Access the dbt Cloud IDE | ✅ | ✅ | +| Access the dbt Cloud CLI | ✅ | ✅ | | Invite Members to the account | ✅ | ✅ | | Manage billing | ❌ | ✅ | | Manage team permissions | ❌ | ✅ | diff --git a/website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md b/website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md index 516a340c951..87018b14d56 100644 --- a/website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md +++ b/website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md @@ -1,7 +1,8 @@ --- title: "Set up BigQuery OAuth" -description: "Learn how dbt Cloud administrators can use licenses and seats to control access in a dbt Cloud account." +description: "Learn how dbt Cloud administrators can use BigQuery OAuth to control access in a dbt Cloud account" id: "set-up-bigquery-oauth" +pagination_next: null --- :::info Enterprise Feature @@ -73,3 +74,7 @@ You will then be redirected to BigQuery and asked to approve the drive, cloud pl Select **Allow**. This redirects you back to dbt Cloud. You should now be an authenticated BigQuery user, ready to use the dbt Cloud IDE. + +## FAQs + + diff --git a/website/docs/docs/cloud/manage-access/set-up-databricks-oauth.md b/website/docs/docs/cloud/manage-access/set-up-databricks-oauth.md new file mode 100644 index 00000000000..679133b7844 --- /dev/null +++ b/website/docs/docs/cloud/manage-access/set-up-databricks-oauth.md @@ -0,0 +1,77 @@ +--- +title: "Set up Databricks OAuth" +description: "Learn how dbt Cloud administrators can use Databricks OAuth to control access in a dbt Cloud account." +id: "set-up-databricks-oauth" +--- + +:::info Enterprise Feature + +This guide describes a feature of the dbt Cloud Enterprise plan. If you’re interested in learning more about an Enterprise plan, contact us at sales@getdbt.com. + +::: + +dbt Cloud supports developer OAuth ([OAuth for partner solutions](https://docs.databricks.com/en/integrations/manage-oauth.html)) with Databricks, providing an additional layer of security for dbt enterprise users. When you enable Databricks OAuth for a dbt Cloud project, all dbt Cloud developers must authenticate with Databricks in order to use the dbt Cloud IDE. The project's deployment environments will still leverage the Databricks authentication method set at the environment level. + +:::tip Beta Feature + +Databricks OAuth support in dbt Cloud is a [beta feature](/docs/dbt-versions/product-lifecycles#dbt-cloud) and subject to change without notification. More updates to this feature coming soon. 
+ +Current limitations: +- Databricks OAuth applications are in public preview +- The current experience requires the IDE to be restarted every hour (access tokens expire after 1 hour; see this [workaround](https://docs.databricks.com/en/integrations/manage-oauth.html#override-the-default-token-lifetime-policy-for-dbt-core-power-bi-or-tableau-desktop)) + +::: + +### Configure Databricks OAuth (Databricks admin) + +To get started, you will need to [add dbt as an OAuth application](https://docs.databricks.com/en/integrations/configure-oauth-dbt.html) with Databricks, in two steps: + +1. From your terminal, [authenticate to the Databricks Account API](https://docs.databricks.com/en/integrations/configure-oauth-dbt.html#authenticate-to-the-account-api) with the Databricks CLI. You authenticate using: + - OAuth for users ([prerequisites](https://docs.databricks.com/en/dev-tools/auth.html#oauth-u2m-auth)) + - OAuth for service principals ([prerequisites](https://docs.databricks.com/en/dev-tools/auth.html#oauth-m2m-auth)) + - Username and password (must be account admin) +2. In the same terminal, **add dbt Cloud as an OAuth application** using `curl` and the [OAuth Custom App Integration API](https://docs.databricks.com/api/account/customappintegration/create) + +For the second step, you can use this example `curl` to authenticate with your username and password, replacing values as defined in the following table: + +```shell +curl -u USERNAME:PASSWORD https://accounts.cloud.databricks.com/api/2.0/accounts/ACCOUNT_ID/oauth2/custom-app-integrations -d '{"redirect_urls": ["https://YOUR_ACCESS_URL", "https://YOUR_ACCESS_URL/complete/databricks"], "confidential": true, "name": "NAME", "scopes": ["sql", "offline_access"]}' +``` + +These parameters and descriptions will help you authenticate with your username and password: + +| Parameter | Description | +| ------ | ----- | +| **USERNAME** | Your Databricks username (account admin level) | +| **PASSWORD** | Your Databricks password (account admin level) | +| **ACCOUNT_ID** | Your Databricks [account ID](https://docs.databricks.com/en/administration-guide/account-settings/index.html#locate-your-account-id) | +| **YOUR_ACCESS_URL** | The [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your dbt Cloud account region and plan | +| **NAME** | The integration name (for example, 'databricks-dbt-cloud') | + +After running the `curl`, you'll get an API response that includes the `client_id` and `client_secret` required in the following section. At this time, this is the only way to retrieve the secret. If you lose the secret, then the integration needs to be [deleted](https://docs.databricks.com/api/account/customappintegration/delete) and re-created. + + +### Configure the Connection in dbt Cloud (dbt Cloud project admin) + +Now that you have an OAuth app set up in Databricks, you'll need to add the client ID and secret to dbt Cloud. To do so: + - Go to Settings by clicking the gear in the top right. + - On the left, select **Projects** under **Account Settings**. + - Choose your project from the list. + - Select **Connection** to edit the connection details. + - Add the `OAuth Client ID` and `OAuth Client Secret` from the Databricks OAuth app under the **Optional Settings** section. + + + +### Authenticating to Databricks (dbt Cloud IDE developer) + +Once the Databricks connection via OAuth is set up for a dbt Cloud project, each dbt Cloud user will need to authenticate with Databricks in order to use the IDE.
To do so: + +- Click the gear icon at the top right and select **Profile settings**. +- Select **Credentials**. +- Choose your project from the list +- Select `OAuth` as the authentication method, and click **Save** +- Finalize by clicking the **Connect Databricks Account** button + + + +You will then be redirected to Databricks and asked to approve the connection. This redirects you back to dbt Cloud. You should now be an authenticated Databricks user, ready to use the dbt Cloud IDE. diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md b/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md index f58bceff816..28d20b526db 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md @@ -144,9 +144,9 @@ To complete setup, follow the steps below in the dbt Cloud application. | ----- | ----- | | **Log in with** | Azure AD Single Tenant | | **Client ID** | Paste the **Application (client) ID** recorded in the steps above | -| **Client Secret** | Paste the **Client Secret** (remember to use the Secret Value instead of the Secret ID) recorded in the steps above | +| **Client Secret** | Paste the **Client Secret** (remember to use the Secret Value instead of the Secret ID) recorded in the steps above;
**Note:** When the client secret expires, an Azure AD admin will have to generate a new one to be pasted into dbt Cloud for uninterrupted application access. | | **Tenant ID** | Paste the **Directory (tenant ID)** recorded in the steps above | -| **Domain** | Enter the domain name for your Azure directory (eg. `fishtownanalytics.com`). Only users with accounts in this directory with this primary domain will be able to log into the dbt Cloud application. Optionally, you may specify a CSV of domains which are _all_ authorized to access your dbt Cloud account (eg. `fishtownanalytics.com, fishtowndata.com`) Ensure that the domain(s) match the values configured on user accounts in Azure | +| **Domain** | Enter the domain name for your Azure directory (such as `fishtownanalytics.com`). Only use the primary domain; this won't block access for other domains. | | **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/manage-access/sso-overview#auth0-multi-tenant-uris) for your region and plan. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md index a206d359270..19779baf615 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md @@ -49,7 +49,7 @@ Client Secret for use in dbt Cloud. | **Application type** | internal | required | | **Application name** | dbt Cloud | required | | **Application logo** | Download the logo here | optional | -| **Authorized domains** | `getdbt.com` (US) `dbt.com` (EMEA or AU) | If deploying into a VPC, use the domain for your deployment | +| **Authorized domains** | `getdbt.com` (US multi-tenant) `getdbt.com` and `dbt.com`(US Cell 1) `dbt.com` (EMEA or AU) | If deploying into a VPC, use the domain for your deployment | | **Scopes** | `email, profile, openid` | The default scopes are sufficient | diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md index 0d493bcf29f..4079cc488c4 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md @@ -16,8 +16,6 @@ dbt Cloud Enterprise supports single-sign on via Okta (using SAML). Currently su * Just-in-time provisioning This guide outlines the setup process for authenticating to dbt Cloud with Okta. -If you have any questions during the setup process, please contact support -(support@getdbt.com) for assistance. ## Configuration in Okta @@ -63,7 +61,7 @@ Click **Next** to continue. ### Configure SAML Settings -The SAML Settings page configures how Okta and dbt Cloud communicate. You will want to use an [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. If you aren't sure which values you should use, please contact support (support@getdbt.com). +The SAML Settings page configures how Okta and dbt Cloud communicate. You will want to use an [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. To complete this section, you will need a _login slug_. 
This slug controls the URL where users on your account can log into your application via Okta. Login @@ -95,9 +93,9 @@ Expected **User Attribute Statements**: | Name | Name format | Value | Description | | -------------- | ----------- | -------------------- | -------------------------- | -| `email` | Unspecified | `${user.email}` | _The user's email address_ | -| `first_name` | Unspecified | `${user.firstName}` | _The user's first name_ | -| `last_name` | Unspecified | `${user.lastName}` | _The user's last name_ | +| `email` | Unspecified | `user.email` | _The user's email address_ | +| `first_name` | Unspecified | `user.firstName` | _The user's first name_ | +| `last_name` | Unspecified | `user.lastName` | _The user's last name_ | Expected **Group Attribute Statements**: @@ -173,7 +171,7 @@ configured in the steps above. | **Log in with** | Okta | | **Identity Provider SSO Url** | Paste the **Identity Provider Single Sign-On URL** shown in the Okta setup instructions | | **Identity Provider Issuer** | Paste the **Identity Provider Issuer** shown in the Okta setup instructions | -| **X.509 Certificate** | Paste the **X.509 Certificate** shown in the Okta setup instructions | +| **X.509 Certificate** | Paste the **X.509 Certificate** shown in the Okta setup instructions;
**Note:** When the certificate expires, an Okta admin will have to generate a new one to be pasted into dbt Cloud for uninterrupted application access. | | **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | **Note:** When the certificate expires, an Idp admin will have to generate a new one to be pasted into dbt Cloud for uninterrupted application access. | | Slug | Enter your desired login slug. | diff --git a/website/docs/docs/cloud/manage-access/sso-overview.md b/website/docs/docs/cloud/manage-access/sso-overview.md index 7e44859c73a..f613df7907e 100644 --- a/website/docs/docs/cloud/manage-access/sso-overview.md +++ b/website/docs/docs/cloud/manage-access/sso-overview.md @@ -1,7 +1,8 @@ --- -title: "SSO Overview" +title: "Single sign-on (SSO) Overview" id: "sso-overview" - +pagination_next: "docs/cloud/manage-access/set-up-sso-saml-2.0" +pagination_prev: null --- This overview explains how users are provisioned in dbt Cloud via Single Sign-On (SSO). diff --git a/website/docs/docs/cloud/secure/about-privatelink.md b/website/docs/docs/cloud/secure/about-privatelink.md index 77ee8a6af7a..b31e4c08a26 100644 --- a/website/docs/docs/cloud/secure/about-privatelink.md +++ b/website/docs/docs/cloud/secure/about-privatelink.md @@ -5,6 +5,10 @@ description: "Configuring PrivateLink for AWS" sidebar_label: "About PrivateLink" --- +import SetUpPages from '/snippets/_available-tiers-privatelink.md'; + + + PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on AWS using [AWS PrivateLink](https://aws.amazon.com/privatelink/) technology. PrivateLink allows dbt Cloud customers to meet security and compliance controls as it allows connectivity between dbt Cloud and your data platform without traversing the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability. ### Cross-region PrivateLink @@ -15,6 +19,7 @@ dbt Labs has a worldwide network of regional VPCs. These VPCs are specifically u dbt Cloud supports the following data platforms for use with the PrivateLink feature. Instructions for enabling PrivateLink for the various data platform providers are unique. The following guides will walk you through the necessary steps, including working with [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support) to complete the connection in the dbt private network and setting up the endpoint in dbt Cloud. 
-- [Redshift](/docs/cloud/secure/redshift-privatelink) - [Snowflake](/docs/cloud/secure/snowflake-privatelink) - [Databricks](/docs/cloud/secure/databricks-privatelink) +- [Redshift](/docs/cloud/secure/redshift-privatelink) +- [Postgres](/docs/cloud/secure/postgres-privatelink) diff --git a/website/docs/docs/cloud/secure/databricks-privatelink.md b/website/docs/docs/cloud/secure/databricks-privatelink.md index c136cd8a0f9..2311bdf2e6e 100644 --- a/website/docs/docs/cloud/secure/databricks-privatelink.md +++ b/website/docs/docs/cloud/secure/databricks-privatelink.md @@ -3,8 +3,13 @@ title: "Configuring Databricks PrivateLink" id: databricks-privatelink description: "Configuring PrivateLink for Databricks" sidebar_label: "PrivateLink for Databricks" +pagination_next: null --- +import SetUpPages from '/snippets/_available-tiers-privatelink.md'; + + + The following steps will walk you through the setup of a Databricks AWS PrivateLink endpoint in the dbt Cloud multi-tenant environment. ## Configure PrivateLink diff --git a/website/docs/docs/cloud/secure/ip-restrictions.md b/website/docs/docs/cloud/secure/ip-restrictions.md index dacd0c885c4..034b3a6c144 100644 --- a/website/docs/docs/cloud/secure/ip-restrictions.md +++ b/website/docs/docs/cloud/secure/ip-restrictions.md @@ -3,6 +3,8 @@ title: "Configuring IP restrictions" id: ip-restrictions description: "Configuring IP restrictions to outside traffic from accessing your dbt Cloud environment" sidebar_label: "IP restrictions" +pagination_next: "docs/cloud/secure/about-privatelink" +pagination_prev: null --- import SetUpPages from '/snippets/_available-tiers-iprestrictions.md'; @@ -19,7 +21,9 @@ To configure IP restrictions, go to **Account Settings** → **IP Restrictions** - Deny IPs flagged by the Security team - Allow only VPN traffic but make an exception for contractors’ IP addresses -IP restrictions will block all user requests done via the API (via personal user token) and the UI. Service tokens are exempt from IP restrictions and can still make requests to dbt Cloud API. +IP restrictions will block all service tokens, user requests done via the API (via personal user token), and the UI if they come from blocked IP addresses. + +For any version control system integrations (GitHub, GitLab, ADO, etc.) that make inbound requests to dbt Cloud, ensure their IP addresses are added to the allowed list. ### Allowing IPs @@ -32,7 +36,7 @@ To add an IP to the allowlist, from the **IP Restrictions** page: 4. Select **Allow** 5. Add the ranges in the CIDR notation - For example, 1.1.1.1/8 - - You can add multiple ranges followed by commas + - You can add multiple ranges in the same rule. 6. Click **Save** Note that simply adding the IP Ranges will not enforce IP restrictions. For more information, see the section “Enabling Restrictions.” diff --git a/website/docs/docs/cloud/secure/postgres-privatelink.md b/website/docs/docs/cloud/secure/postgres-privatelink.md new file mode 100644 index 00000000000..ef07d15c128 --- /dev/null +++ b/website/docs/docs/cloud/secure/postgres-privatelink.md @@ -0,0 +1,79 @@ +--- +title: "Configure AWS PrivateLink for Postgres" +id: postgres-privatelink +description: "Configuring PrivateLink for Postgres" +sidebar_label: "PrivateLink for Postgres" +--- +import SetUpPages from '/snippets/_available-tiers-privatelink.md'; + + + +A Postgres database, hosted either in AWS or in a properly connected on-prem data center, can be accessed through a private network connection using AWS Interface-type PrivateLink.
The type of Target Group connected to the Network Load Balancer (NLB) may vary based on the location and type of Postgres instance being connected, as explained in the following steps. + +## Configuring Postgres interface-type PrivateLink + +### 1. Provision AWS resources + +Creating an Interface VPC PrivateLink connection requires creating multiple AWS resources in the account containing, or connected to, the Postgres instance: + +- **Security Group (AWS hosted only)** — If you are connecting to an existing Postgres instance, this likely already exists, however, you may need to add or modify Security Group rules to accept traffic from the Network Load Balancer (NLB) created for this Endpoint Service. +- **Target Group** — The Target Group will be attached to the NLB to tell it where to route requests. There are various target types available for NLB Target Groups, so choose the one appropriate for your Postgres setup. + + - Target Type: + + - _[Amazon RDS for PostgreSQL](https://aws.amazon.com/rds/postgresql/)_ - **IP** + + - Find the IP address of your RDS instance using a command line tool such as `nslookup ` or `dig +short ` with your RDS DNS endpoint + + - _Note_: With RDS Multi-AZ failover capabilities the IP address of your RDS instance can change, at which point your Target Group would need to be updated. See [this AWS blog post](https://aws.amazon.com/blogs/database/access-amazon-rds-across-vpcs-using-aws-privatelink-and-network-load-balancer/) for more details and a possible solution. + + - _On-prem Postgres server_ - **IP** + + - Use the IP address of the on-prem Postgres server linked to AWS through AWS Direct Connect or a Site-to-Site VPN connection + + - _Postgres on EC2_ - **Instance/ASG** (or **IP**) + + - If your Postgres instance is hosted on EC2 the _instance_ Target Group type (or ideally [using the instance type to connect to an auto-scaling group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/attach-load-balancer-asg.html)) can be used to attach the instance without needing a static IP address + + - The IP type can also be used, with the understanding that the IP of the EC2 instance can change if the instance is relaunched for any reason + + - Target Group protocol: **TCP** + +- **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5432` +- **VPC Endpoint Service** — Attach to the newly created NLB. + - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + +### 2. Grant dbt AWS account access to the VPC Endpoint Service + +On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. + + - Principal: `arn:aws:iam::346425330055:role/MTPL_Admin` + + + +### 3. Obtain VPC Endpoint Service Name + +Once the VPC Endpoint Service is provisioned, you can find the service name in the AWS console by navigating to **VPC** → **Endpoint Services** and selecting the appropriate endpoint service. You can copy the service name field value and include it in your communication to dbt Cloud support. + + + +### 4. 
Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Postgres Interface-type +- VPC Endpoint Service Name: +- Postgres server AWS Region (e.g., us-east-1, eu-west-2): +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` + +dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the endpoint is available. + +## Create Connection in dbt Cloud + +Once dbt Cloud support completes the configuration, you can start creating new connections using PrivateLink. + +1. Navigate to **settings** → **Create new project** → select **PostgreSQL** +2. You will see two radio buttons: **Public** and **Private.** Select **Private**. +3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). +4. Configure the remaining data platform details. +5. Test your connection and save it. diff --git a/website/docs/docs/cloud/secure/redshift-privatelink.md b/website/docs/docs/cloud/secure/redshift-privatelink.md index fc0ceeea334..c42c703556b 100644 --- a/website/docs/docs/cloud/secure/redshift-privatelink.md +++ b/website/docs/docs/cloud/secure/redshift-privatelink.md @@ -5,6 +5,10 @@ description: "Configuring PrivateLink for Redshift" sidebar_label: "PrivateLink for Redshift" --- +import SetUpPages from '/snippets/_available-tiers-privatelink.md'; + + + AWS provides two different ways to create a PrivateLink VPC endpoint for a Redshift cluster that is running in another VPC: - [Redshift-managed PrivateLink Endpoints](https://docs.aws.amazon.com/redshift/latest/mgmt/managing-cluster-cross-vpc.html) - [Redshift Interface-type PrivateLink Endpoints](https://docs.aws.amazon.com/redshift/latest/mgmt/security-private-link.html) @@ -79,7 +83,7 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. - - Principal: `arn:aws:iam::346425330055:root` + - Principal: `arn:aws:iam::346425330055:role/MTPL_Admin` diff --git a/website/docs/docs/cloud/secure/secure-your-tenant.md b/website/docs/docs/cloud/secure/secure-your-tenant.md new file mode 100644 index 00000000000..95cb8adffba --- /dev/null +++ b/website/docs/docs/cloud/secure/secure-your-tenant.md @@ -0,0 +1,49 @@ +--- +title: "Secure your tenant" +description: "Learn how to secure your tenant for dbt Cloud" +pagination_next: "docs/cloud/secure/ip-restrictions" +pagination_prev: null +--- + +
+ + + + + + + +
+
+
+ + + + + + +
\ No newline at end of file diff --git a/website/docs/docs/cloud/secure/snowflake-privatelink.md b/website/docs/docs/cloud/secure/snowflake-privatelink.md index bbbdf04ddf0..dd046259e4e 100644 --- a/website/docs/docs/cloud/secure/snowflake-privatelink.md +++ b/website/docs/docs/cloud/secure/snowflake-privatelink.md @@ -5,6 +5,10 @@ description: "Configuring PrivateLink for Snowflake" sidebar_label: "PrivateLink for Snowflake" --- +import SetUpPages from '/snippets/_available-tiers-privatelink.md'; + + + The following steps will walk you through the setup of a Snowflake AWS PrivateLink endpoint in the dbt Cloud multi-tenant environment. :::note Snowflake SSO with PrivateLink diff --git a/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md b/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md index 36f4781bfde..b387c64788f 100644 --- a/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md +++ b/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md @@ -2,6 +2,7 @@ title: "Build and view your docs with dbt Cloud" id: "build-and-view-your-docs" description: "Automatically generate project documentation as you run jobs." +pagination_next: null --- dbt enables you to generate documentation for your project and data warehouse, and renders the documentation in a website. For more information, see [Documentation](/docs/collaborate/documentation). @@ -39,16 +40,17 @@ To create and schedule documentation-only jobs at the end of your production job You configure project documentation to generate documentation when the job you set up in the previous section runs. In the project settings, specify the job that generates documentation artifacts for that project. Once you configure this setting, subsequent runs of the job will automatically include a step to generate documentation. 1. Click the gear icon in the top right. -2. Select **Projects** and click the project that needs documentation. -3. Click **Edit**. -4. Under "Artifacts," select the job that should generate docs when it runs. +2. Select **Account Settings**. +3. Navigate to **Projects** and select the project that needs documentation. +4. Click **Edit**. +5. Under **Artifacts**, select the job that should generate docs when it runs. -5. Click **Save**. +6. Click **Save**. ## Generating documentation -To generate documentation in the IDE, run the `dbt docs generate` command in the -Command Bar in the IDE. This command will generate the Docs for your dbt project as it exists in development in your IDE session. +To generate documentation in the dbt Cloud IDE, run the `dbt docs generate` command in the +Command Bar in the dbt Cloud IDE. This command will generate the Docs for your dbt project as it exists in development in your IDE session. diff --git a/website/docs/docs/collaborate/collaborate-with-others.md b/website/docs/docs/collaborate/collaborate-with-others.md new file mode 100644 index 00000000000..7875a8044b6 --- /dev/null +++ b/website/docs/docs/collaborate/collaborate-with-others.md @@ -0,0 +1,38 @@ +--- +title: "Collaborate with others" +description: "Learn how dbt Cloud makes it easier to collaborate with others" +pagination_next: "docs/collaborate/explore-projects" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + + +
\ No newline at end of file diff --git a/website/docs/docs/collaborate/documentation.md b/website/docs/docs/collaborate/documentation.md index b613fd7a5ef..16a4e610c70 100644 --- a/website/docs/docs/collaborate/documentation.md +++ b/website/docs/docs/collaborate/documentation.md @@ -2,6 +2,8 @@ title: "About documentation" description: "Learn how good documentation for your dbt models helps stakeholders discover and understand your datasets." id: "documentation" +pagination_next: "docs/collaborate/build-and-view-your-docs" +pagination_prev: null --- ## Related documentation @@ -9,7 +11,7 @@ id: "documentation" * [Declaring properties](/reference/configs-and-properties) * [`dbt docs` command](/reference/commands/cmd-docs) * [`doc` Jinja function](/reference/dbt-jinja-functions) -* If you're new to dbt, we recommend that you check out our [quickstart guide](/quickstarts) to build your first dbt project, complete with documentation. +* If you're new to dbt, we recommend that you check out our [quickstart guide](/guides) to build your first dbt project, complete with documentation. ## Assumed knowledge @@ -147,7 +149,6 @@ as well as the repo for this project \[here](https://github.com/dbt-labs/mrr-pla ### Custom project-level overviews -New in v0.18.0 You can set different overviews for each dbt project/package included in your documentation site by creating a docs block named `__[project_name]__`. For example, in order to define diff --git a/website/docs/docs/collaborate/explore-projects.md b/website/docs/docs/collaborate/explore-projects.md new file mode 100644 index 00000000000..282ef566356 --- /dev/null +++ b/website/docs/docs/collaborate/explore-projects.md @@ -0,0 +1,236 @@ +--- +title: "Explore your dbt projects" +sidebar_label: "Explore dbt projects" +description: "Learn about dbt Explorer and how to interact with it to understand, improve, and leverage your data pipelines." +pagination_next: null +pagination_prev: null +--- + +With dbt Explorer, you can view your project's [resources](/docs/build/projects) (such as models, tests, and metrics) and their lineage to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources. + +:::tip Public preview + +Try dbt Explorer! It's available in [Public Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud) as of October 17, 2023 for dbt Cloud customers. More updates coming soon. + +::: + +## Prerequisites + +- You have a [multi-tenant](/docs/cloud/about-cloud/tenancy#multi-tenant) or AWS single-tenant dbt Cloud account on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You have set up a [production deployment environment](/docs/deploy/deploy-environments#set-as-production-environment) for each project you want to explore. + - There has been at least one successful job run in the production deployment environment. +- You are on the dbt Explorer page. To do this, select **Explore** from the top navigation bar in dbt Cloud. + + +## Generate metadata + +dbt Explorer uses the metadata provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-api) to display the details about [the state of your project](/docs/dbt-cloud-apis/project-state). The metadata that's available depends on the [deployment environment](/docs/deploy/deploy-environments) you've designated as _production_ in your dbt Cloud project. 
dbt Explorer automatically retrieves the metadata updates after each job run in the production deployment environment so it always has the latest results for your project. + +To view a resource and its metadata, you must define the resource in your project and run a job in the production environment. The resulting metadata depends on the [commands executed by the jobs](/docs/deploy/job-commands). + +For a richer experience with dbt Explorer, you must: + +- Run [dbt run](/reference/commands/run) or [dbt build](/reference/commands/build) on a given model within a job in the environment to update model details or results. +- Run [dbt docs generate](/reference/commands/cmd-docs) within a job in the environment to view catalog statistics and columns for models, sources, and snapshots. +- Run [dbt test](/reference/commands/test) or [dbt build](/reference/commands/build) within a job in the environment to view test results. +- Run [dbt source freshness](/reference/commands/source#dbt-source-freshness) within a job in the environment to view source freshness data. +- Run [dbt snapshot](/reference/commands/snapshot) or [dbt build](/reference/commands/build) within a job in the environment to view snapshot details. + +Richer and more timely metadata will become available as dbt, the Discovery API, and the underlying dbt Cloud platform evolves. + +## Explore your project's lineage graph {#project-lineage} + +dbt Explorer provides a visualization of your project’s DAG that you can interact with. To access the project's full lineage graph, select **Overview** in the left sidebar and click the **Explore Lineage** button on the main (center) section of the page. + +If you don't see the project lineage graph immediately, click **Render Lineage**. It can take some time for the graph to render depending on the size of your project and your computer’s available memory. The graph of very large projects might not render so you can select a subset of nodes by using selectors, instead. + +The nodes in the lineage graph represent the project’s resources and the edges represent the relationships between the nodes. Nodes are color-coded and include iconography according to their resource type. + +To explore the lineage graphs of tests and macros, view [their resource details pages](#view-resource-details). By default, dbt Explorer excludes these resources from the full lineage graph unless a search query returns them as results. + +To interact with the full lineage graph, you can: + +- Hover over any item in the graph to display the resource’s name and type. +- Zoom in and out on the graph by mouse-scrolling. +- Grab and move the graph and the nodes. +- Select a resource to highlight its relationship with other resources in your project. A panel opens on the graph’s right-hand side that displays a high-level summary of the resource’s details. The side panel includes a **General** tab for information like description, materialized type, and other details. + - Click the Share icon in the side panel to copy the graph’s link to your clipboard. + - Click the View Resource icon in the side panel to [view the resource details](#view-resource-details). +- [Search and select specific resources](#search-resources) or a subset of the DAG using selectors and graph operators. 
For example: + - `+[RESOURCE_NAME]` — Displays all parent nodes of the resource + - `resource_type:model [RESOURCE_NAME]` — Displays all models matching the name search + +- [View resource details](#view-resource-details) by selecting a node (double-clicking) in the graph. +- Click the List view icon in the graph's upper right corner to return to the main **Explore** page. + + + + +## Search for resources {#search-resources} +With the search bar (on the upper left corner of the page or in a lineage graph), you can search with keywords or by using [node selection syntax](/reference/node-selection/syntax). The resources that match your search criteria will display as a lineage graph and a table in the main section of the page. + +Select a node (single-click) in the lineage graph to highlight its relationship with your other search results and to display which project contains the resource's definition. When you choose a node (double-click) in the lineage graph or when you select a resource in the table, dbt Explorer displays the [resource's details page](#view-resource-details). + +### Search with keywords +When searching with keywords, dbt Explorer searches through your resource metadata (such as resource type, resource name, column name, source name, tags, schema, database, version, alias/identifier, and package name) and returns any matches. + +### Search with selector methods + +You can search with [selector methods](/reference/node-selection/methods). Below are the selectors currently available in dbt Explorer: + +- `fqn:` — Find resources by [file or fully qualified name](/reference/node-selection/methods#the-fqn-method). This selector is the search bar's default. If you want to use the default, it's unnecessary to add `fqn:` before the search term. +- `source:` — Find resources by a specified [source](/reference/node-selection/methods#the-source-method). +- `resource_type:` — Find resources by their [type](/reference/node-selection/methods#the-resource_type-method). +- `package:` — Find resources by the [dbt package](/reference/node-selection/methods#the-package-method) that defines them. +- `tag:` — Find resources by a specified [tag](/reference/node-selection/methods#the-tag-method). + + + +- `group:` — Find models defined within a specified [group](/reference/node-selection/methods#the-group-method). +- `access:` — Find models based on their [access](/reference/node-selection/methods#the-access-method) property. + + + +### Search with graph operators + +You can use [graph operators](/reference/node-selection/graph-operators) on keywords or selector methods. For example, `+orders` returns all the parents of `orders`. + +### Search with set operators + +You can use multiple selector methods in your search query with [set operators](/reference/node-selection/set-operators). A space implies a union set operator and a comma for an intersection. For example: +- `resource_type:metric,tag:nightly` — Returns metrics with the tag `nightly` +- `+snowplow_sessions +fct_orders` — Returns resources that are parent nodes of either `snowplow_sessions` or `fct_orders` + +### Search with both keywords and selector methods + +You can use keyword search to highlight results that are filtered by the selector search. For example, if you don't have a resource called `customers`, then `resource_type:metric customers` returns all the metrics in your project and highlights those that are related to the term `customers` in the name, in a column, tagged as customers, and so on. 
+
+When searching in this way, the selectors behave as filters that you can use to narrow the search, and keywords as a way to find matches within those filtered results.
+
+
+
+## Browse with the sidebar
+
+By default, the catalog sidebar lists all your project’s resources. Select any resource type in the list, and all those resources in the project will display as a table in the main section of the page. For a description of the different resource types (like models, metrics, and so on), refer to [About dbt projects](/docs/build/projects).
+
+To browse using a different view, you can choose one of these options from the **View by** dropdown:
+
+- **Resources** (default) — All resources in the project organized by type.
+- **Packages** — All resources in the project organized by the dbt package in which they are defined.
+- **File Tree** — All resources in the project organized by the file in which they are defined. This mirrors the file tree in your dbt project repository.
+- **Database** — All resources in the project organized by the database and schema in which they are built. This mirrors your data platform's structure that represents the [applied state](/docs/dbt-cloud-apis/project-state) of your project.
+
+
+
+## View model versions
+
+If models in the project are versioned, you can see which [version of the model](/docs/collaborate/govern/model-versions) is being applied — `prerelease`, `latest`, and `old` — in the title of the model’s details page and in the model list from the sidebar.
+
+## View resource details {#view-resource-details}
+You can view the definition and latest run results of any resource in your project. To find a resource and view its details, you can interact with the lineage graph, use search, or browse the catalog.
+
+The details (metadata) available to you depend on the resource’s type, its definition, and the [commands](/docs/deploy/job-commands) that run within jobs in the production environment.
+
+
+
+
+### Example of model details
+
+An example of the details you might get for a model:
+
+- Status bar (below the page title) — Information on the last time the model ran, whether the run was successful, how the data is materialized, number of rows, and the size of the model.
+- **General** tab includes:
+  - **Lineage** graph — The model’s lineage graph that you can interact with. The graph includes one parent node and one child node from the model. Click the Expand icon in the graph's upper right corner to view the model in full lineage graph mode.
+  - **Description** section — A [description of the model](/docs/collaborate/documentation#adding-descriptions-to-your-project).
+  - **Recent** section — Information on the last time the model ran, how long it ran for, whether the run was successful, the job ID, and the run ID.
+  - **Tests** section — [Tests](/docs/build/tests) for the model.
+  - **Details** section — Key properties like the model’s relation name (for example, how it’s represented and how you can query it in the data platform: `database.schema.identifier`); model governance attributes like access, group, and whether it's contracted; and more.
+  - **Relationships** section — The nodes the model **Depends On**, is **Referenced by**, and (if applicable) is **Used by** for projects that have declared the model's project as a dependency.
+- **Code** tab — The source code and compiled code for the model.
+- **Columns** tab — The available columns in the model. This tab also shows test results (if any) that you can select to view the test's details page.
A :white_check_mark: denotes a passing test.
+
+
+### Example of exposure details
+
+An example of the details you might get for an exposure:
+
+- Status bar (below the page title) — Information on the last time the exposure was updated.
+- **General** tab includes:
+  - **Status** section — The status of data freshness and data quality.
+  - **Lineage** graph — The exposure’s lineage graph. Click the Expand icon in the graph's upper right corner to view the exposure in full lineage graph mode.
+  - **Description** section — A description of the exposure.
+  - **Details** section — Details like exposure type, maturity, owner information, and more.
+  - **Relationships** section — The nodes the exposure **Depends On**.
+
+### Example of test details
+
+An example of the details you might get for a test:
+
+- Status bar (below the page title) — Information on the last time the test ran, whether the test passed, test name, test target, and column name.
+- **General** tab includes:
+  - **Lineage** graph — The test’s lineage graph that you can interact with. The graph includes one parent node and one child node from the test resource. Click the Expand icon in the graph's upper right corner to view the test in full lineage graph mode.
+  - **Description** section — A description of the test.
+  - **Recent** section — Information on the last time the test ran, how long it ran for, whether the test passed, the job ID, and the run ID.
+  - **Details** section — Details like schema, severity, package, and more.
+  - **Relationships** section — The nodes the test **Depends On**.
+- **Code** tab — The source code and compiled code for the test.
+
+
+### Example of source details
+
+An example of the details you might get for each source table within a source collection:
+
+- Status bar (below the page title) — Information on the last time the source was updated and the number of tables the source uses.
+- **General** tab includes:
+  - **Lineage** graph — The source’s lineage graph that you can interact with. The graph includes one parent node and one child node from the source. Click the Expand icon in the graph's upper right corner to view the source in full lineage graph mode.
+  - **Description** section — A description of the source.
+  - **Source freshness** section — Information on whether refreshing the data was successful, the last time the source was loaded, the timestamp of when a run generated data, and the run ID.
+  - **Details** section — Details like database, schema, and more.
+  - **Relationships** section — A table that lists all the sources used with their freshness status, the timestamp of when freshness was last checked, and the timestamp of when the source was last loaded.
+- **Columns** tab — The available columns in the source. This tab also shows test results (if any) that you can select to view the test's details page. A :white_check_mark: denotes a passing test.
+
+## About project-level lineage
+To gain a better understanding of your cross-project resources, you can also view all the projects and public models in the account, where the public models are defined, and how they are used.
+
+When viewing the resource-level lineage graph for a given project that uses cross-project references, you can see cross-project relationships represented in the DAG. The iconography is slightly different depending on whether you're viewing the lineage of an upstream producer project or a downstream consumer project.
+
+When viewing an upstream (parent) project that produces public models that are imported by downstream (child) projects, public models will have a counter icon in their upper right corner that indicates the number of projects that declare the current project as a dependency. Selecting that model expands the lineage to show the specific projects that depend on it. Projects show up in this counter if they declare the parent project as a dependency in their `dependencies.yml` file (see the example at the end of this section), regardless of whether there's a direct `{{ ref() }}` against the public model. Selecting a project node from a public model opens the resource-level lineage graph for that project, which is subject to your permissions.
+
+
+
+When viewing a downstream (child) project that imports and refs public models from upstream (parent) projects, public models will show up in the lineage graph and display an icon on the graph edge that indicates the relationship to a model from another project. Hovering over this icon shows the specific dbt Cloud project that produces that model. Double-clicking on a model from another project opens the resource-level lineage graph of the parent project, which is subject to your permissions.
+
+
+
+
+### Explore the project-level lineage graph
+
+For cross-project collaboration, you can interact with the DAG in all the same ways as described in [Explore your project's lineage](#project-lineage), but you can also interact with it at the project level and view the details.
+
+To get a list view of all the projects, select the account name at the top of the **Explore** page near the navigation bar. This view includes a public model list, project list, and a search bar for project searches. You can also view the project-level lineage graph by clicking the Lineage view icon in the page's upper right corner.
+
+If you have permissions for a project in the account, you can view all public models used across the entire account. However, you can only view full public model details and private models if you have permissions for a project where the models are defined.
+
+From the project-level lineage graph, you can:
+
+- Click the Lineage view icon (in the graph’s upper right corner) to view the cross-project lineage graph.
+- Click the List view icon (in the graph’s upper right corner) to view the project list.
+  - Select a project from the **Projects** tab to switch to that project’s main **Explore** page.
+  - Select a model from the **Public Models** tab to view the [model’s details page](#view-resource-details).
+  - Perform searches on your projects with the search bar.
+- Select a project node in the graph (double-clicking) to switch to that particular project’s lineage graph.
+
+When you select a project node in the graph, a project details panel opens on the graph’s right-hand side where you can:
+
+- View counts of the resources defined in the project.
+- View a list of its public models, if any.
+- View a list of other projects that use the project, if any.
+- Click **Open Project Lineage** to switch to the project’s lineage graph.
+- Click the Share icon to copy the project panel link to your clipboard so you can share the graph with someone.
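+
+For reference, a downstream project shows up in that counter when it lists the upstream project in its `dependencies.yml`. A minimal sketch, assuming a consumer project that depends on an upstream project named `jaffle_finance`:
+
+```yaml
+# dependencies.yml in the downstream (consumer) project
+projects:
+  - name: jaffle_finance   # the upstream ("producer") project
+```
+
+Declaring the project this way is enough for it to be counted, even before any model takes a direct `{{ ref() }}` dependency on a specific public model.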
+ + + +## Related content +- [Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions) +- [About model governance](/docs/collaborate/govern/about-model-governance) +- [What is data mesh?](https://www.getdbt.com/blog/what-is-data-mesh-the-definition-and-importance-of-data-mesh) blog diff --git a/website/docs/docs/collaborate/git-version-control.md b/website/docs/docs/collaborate/git-version-control.md index 4444f381bb5..392e2c3baa5 100644 --- a/website/docs/docs/collaborate/git-version-control.md +++ b/website/docs/docs/collaborate/git-version-control.md @@ -3,6 +3,8 @@ title: "About git" id: git-version-control description: "Git overview" sidebar_label: "About git" +pagination_next: "docs/collaborate/git/version-control-basics" +pagination_prev: null --- A [version control](https://en.wikipedia.org/wiki/Version_control) system allows you and your teammates to work collaboratively, safely, and simultaneously on a single project. Version control helps you track all the code changes made in your dbt project. @@ -22,3 +24,4 @@ When you develop in the command line interface (CLI) or Cloud integrated develo - [Merge conflicts](/docs/collaborate/git/merge-conflicts) - [Connect to GitHub](/docs/cloud/git/connect-github) - [Connect to GitLab](/docs/cloud/git/connect-gitlab) +- [Connect to Azure DevOps](/docs/cloud/git/connect-azure-devops) diff --git a/website/docs/docs/collaborate/git/managed-repository.md b/website/docs/docs/collaborate/git/managed-repository.md index d7beb38c4f5..db8e9840ccd 100644 --- a/website/docs/docs/collaborate/git/managed-repository.md +++ b/website/docs/docs/collaborate/git/managed-repository.md @@ -15,6 +15,6 @@ To set up a project with a managed repository: 6. Click **Create**. -dbt Cloud will host and manage this repository for you. If in the future you choose to host this repository yourself, you can contact support to have the contents of your repo transferred to you. +dbt Cloud will host and manage this repository for you. If in the future you choose to host this repository elsewhere, you can export the information from dbt Cloud at any time. ** We do not recommend using a managed repository in a production environment. You will not be able to use git features like pull requests which are part of our recommended version control best practices. diff --git a/website/docs/docs/collaborate/git/merge-conflicts.md b/website/docs/docs/collaborate/git/merge-conflicts.md index b109cacb511..c3c19b1e2a1 100644 --- a/website/docs/docs/collaborate/git/merge-conflicts.md +++ b/website/docs/docs/collaborate/git/merge-conflicts.md @@ -1,6 +1,7 @@ --- title: "Merge conflicts" id: "merge-conflicts" +pagination_next: null --- [Merge conflicts](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/addressing-merge-conflicts/about-merge-conflicts) in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) often occur when multiple users are simultaneously making edits to the same section in the same file. This makes it difficult for Git to decide what changes to incorporate in the final merge. 
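+
+When a conflict does happen, Git marks the competing edits directly in the file. As a purely hypothetical sketch (the model and branch names are made up), a conflicted section of a `schema.yml` might look like this, and everything between the markers must be resolved by hand before you can commit:
+
+```yaml
+models:
+  - name: customers   # hypothetical model
+<<<<<<< HEAD
+    description: "One row per customer."
+=======
+    description: "All customers, including those without a completed order."
+>>>>>>> feature/update-customer-docs
+```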
diff --git a/website/docs/docs/collaborate/git/pr-template.md b/website/docs/docs/collaborate/git/pr-template.md index 83d620b7af9..ddb4948dad9 100644 --- a/website/docs/docs/collaborate/git/pr-template.md +++ b/website/docs/docs/collaborate/git/pr-template.md @@ -72,7 +72,7 @@ https://gitlab.com///-/merge_requests/new?merge_request[source_branch ### BitBucket ``` -https://bitbucket.org///pull-requests/new?source={{source}} +https://bitbucket.org///pull-requests/new?source={{source}}&dest={{destination}} ``` ### AWS CodeCommit diff --git a/website/docs/docs/collaborate/govern/about-model-governance.md b/website/docs/docs/collaborate/govern/about-model-governance.md index efeb2836bc6..bbc430845d2 100644 --- a/website/docs/docs/collaborate/govern/about-model-governance.md +++ b/website/docs/docs/collaborate/govern/about-model-governance.md @@ -2,6 +2,8 @@ title: "About model governance" id: about-model-governance description: "Information about new features related to model governance" +pagination_next: "docs/collaborate/govern/model-access" +pagination_prev: null --- diff --git a/website/docs/docs/collaborate/govern/model-access.md b/website/docs/docs/collaborate/govern/model-access.md index 970f25ef87f..76eb8bd6f6d 100644 --- a/website/docs/docs/collaborate/govern/model-access.md +++ b/website/docs/docs/collaborate/govern/model-access.md @@ -25,17 +25,17 @@ The two concepts will be closely related, as we develop multi-project collaborat ## Related documentation * [`groups`](/docs/build/groups) -* [`access`](/reference/resource-properties/access) +* [`access`](/reference/resource-configs/access) ## Groups -Models can be grouped under a common designation with a shared owner. For example, you could group together all models owned by a particular team, related to modeling a specific data source (`github`), or +Models can be grouped under a common designation with a shared owner. For example, you could group together all models owned by a particular team, or related to modeling a specific data source (`github`). Why define model `groups`? There are two reasons: - It turns implicit relationships into an explicit grouping, with a defined owner. By thinking about the interface boundaries _between_ groups, you can have a cleaner (less entangled) DAG. In the future, those interface boundaries could be appropriate as the interfaces between separate projects. - It enables you to designate certain models as having "private" access—for use exclusively within that group. Other models will be restricted from referencing (taking a dependency on) those models. In the future, they won't be visible to other teams taking a dependency on your project—only "public" models will be. -If you follow our [best practices for structuring a dbt project](/guides/best-practices/how-we-structure/1-guide-overview), you're probably already using subdirectories to organize your dbt project. It's easy to apply a `group` label to an entire subdirectory at once: +If you follow our [best practices for structuring a dbt project](/best-practices/how-we-structure/1-guide-overview), you're probably already using subdirectories to organize your dbt project. 
It's easy to apply a `group` label to an entire subdirectory at once: diff --git a/website/docs/docs/collaborate/govern/model-contracts.md b/website/docs/docs/collaborate/govern/model-contracts.md index 97667996194..342d86c1a77 100644 --- a/website/docs/docs/collaborate/govern/model-contracts.md +++ b/website/docs/docs/collaborate/govern/model-contracts.md @@ -86,6 +86,91 @@ When building a model with a defined contract, dbt will do two things differentl 1. dbt will run a "preflight" check to ensure that the model's query will return a set of columns with names and data types matching the ones you have defined. This check is agnostic to the order of columns specified in your model (SQL) or YAML spec. 2. dbt will include the column names, data types, and constraints in the DDL statements it submits to the data platform, which will be enforced while building or updating the model's table. +## Platform constraint support + +Select the adapter-specific tab for more information on [constraint](/reference/resource-properties/constraints) support across platforms. Constraints fall into three categories based on support and platform enforcement: + +- **Supported and enforced** — The model won't build if it violates the constraint. +- **Supported and not enforced** — The platform supports specifying the type of constraint, but a model can still build even if building the model violates the constraint. This constraint exists for metadata purposes only. This is common for modern cloud data warehouses and less common for legacy databases. +- **Not supported and not enforced** — You can't specify the type of constraint for the platform. + + + + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:-----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ❌ Not supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:--------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ✅ Enforced | +| foreign_key | ✅ Supported | ✅ Enforced | +| unique | ✅ Supported | ✅ Enforced | +| check | ✅ Supported | ✅ Enforced | + + + + +Currently, `not_null` and `check` constraints are supported and enforced only after a model builds. Because of this platform limitation, dbt considers these constraints `supported` but `not enforced`, which means they're not part of the "model contract" since these constraints can't be enforced at build time. This table will change as the features evolve. 
+ +| Constraint type | Support | Platform enforcement | +|:----------------|:------------|:---------------------| +| not_null | ✅ Supported | ❌ Not enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ✅ Supported | ❌ Not enforced | + + + + +Currently, `not_null` and `check` constraints are supported and enforced only after a model builds. Because of this platform limitation, dbt considers these constraints `supported` but `not enforced`, which means they're not part of the "model contract" since these constraints can't be enforced at build time. This table will change as the features evolve. + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ❌ Not enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ✅ Supported | ❌ Not enforced | + + + + + ## FAQs ### Which models should have contracts? @@ -98,7 +183,7 @@ Any model meeting the criteria described above _can_ define a contract. We recom A model's contract defines the **shape** of the returned dataset. If the model's logic or input data doesn't conform to that shape, the model does not build. -[Tests](docs/build/tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the test. Tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). +[Tests](/docs/build/tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the test. Tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). In some cases, you can replace a test with its equivalent constraint. This has the advantage of guaranteeing the validation at build time, and it probably requires less compute (cost) in your data platform. The prerequisites for replacing a test with a constraint are: - Making sure that your data platform can support and enforce the constraint that you need. Most platforms only enforce `not_null`. @@ -107,8 +192,21 @@ In some cases, you can replace a test with its equivalent constraint. This has t **Why aren't tests part of the contract?** In a parallel for software APIs, the structure of the API response is the contract. Quality and reliability ("uptime") are also very important attributes of an API's quality, but they are not part of the contract per se. When the contract changes in a backwards-incompatible way, it is a breaking change that requires a bump in major version. -### Can I define a "partial" contract? +### Do I need to define every column for a contract? Currently, dbt contracts apply to **all** columns defined in a model, and they require declaring explicit expectations about **all** of those columns. 
The explicit declaration of a contract is not an accident—it's very much the intent of this feature. -We are investigating the feasibility of supporting "inferred" or "partial" contracts in the future. This would enable you to define constraints and strict data typing for a subset of columns, while still detecting breaking changes on other columns by comparing against the same model in production. If you're interested, please upvote or comment on [dbt-core#7432](https://github.com/dbt-labs/dbt-core/issues/7432). +At the same time, for models with many columns, we understand that this can mean a _lot_ of yaml. We are investigating the feasibility of supporting "inferred" contracts. This would enable you to define constraints and strict data typing for a subset of columns, while still detecting breaking changes on other columns by comparing against the same model in production. This isn't the same as a "partial" contract, because all columns in the model are still checked at runtime, and matched up with what's defined _explicitly_ in your yaml contract or _implicitly_ with the comparison state. If you're interested in "inferred" contract, please upvote or comment on [dbt-core#7432](https://github.com/dbt-labs/dbt-core/issues/7432). + + +### How are breaking changes handled? + +When comparing to a previous project state, dbt will look for breaking changes that could impact downstream consumers. If breaking changes are detected, dbt will present a contract error. + +Breaking changes include: +- Removing an existing column +- Changing the `data_type` of an existing column +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) + +More details are available in the [contract reference](/reference/resource-configs/contract#detecting-breaking-changes). + diff --git a/website/docs/docs/collaborate/govern/model-versions.md b/website/docs/docs/collaborate/govern/model-versions.md index 12599d0b65f..49ed65f9a36 100644 --- a/website/docs/docs/collaborate/govern/model-versions.md +++ b/website/docs/docs/collaborate/govern/model-versions.md @@ -3,20 +3,28 @@ title: "Model versions" id: model-versions sidebar_label: "Model versions" description: "Version models to help with lifecycle management" +keyword: governance, model version, model versioning, dbt model versioning --- + :::info New functionality This functionality is new in v1.5 — if you have thoughts, participate in [the discussion on GitHub](https://github.com/dbt-labs/dbt-core/discussions/6736)! ::: + + +import VersionsCallout from '/snippets/_version-callout.md'; + + + Versioning APIs is a hard problem in software engineering. The root of the challenge is that the producers and consumers of an API have competing incentives: - Producers of an API need the ability to modify its logic and structure. There is a real cost to maintaining legacy endpoints forever, but losing the trust of downstream users is far costlier. - Consumers of an API need to trust in its stability: their queries will keep working, and won't break without warning. Although migrating to a newer API version incurs an expense, an unplanned migration is far costlier. When sharing a final dbt model with other teams or systems, that model is operating like an API. When the producer of that model needs to make significant changes, how can they avoid breaking the queries of its users downstream? -Model versioning is a tool to tackle this problem, thoughtfully and head-on. 
The goal of is not to make the problem go away entirely, nor to pretend it's easier or simpler than it is.
+Model versioning is a tool to tackle this problem, thoughtfully and head-on. The goal is not to make the problem go away entirely, nor to pretend it's easier or simpler than it is.
 
 ## Related documentation
 - [`versions`](/reference/resource-properties/versions)
diff --git a/website/docs/docs/collaborate/govern/project-dependencies.md b/website/docs/docs/collaborate/govern/project-dependencies.md
index 158c405e4a7..174e4572890 100644
--- a/website/docs/docs/collaborate/govern/project-dependencies.md
+++ b/website/docs/docs/collaborate/govern/project-dependencies.md
@@ -3,12 +3,17 @@ title: "Project dependencies"
 id: project-dependencies
 sidebar_label: "Project dependencies"
 description: "Reference public models across dbt projects"
+pagination_next: null
 ---
 
-:::caution Closed Beta - dbt Cloud Enterprise
-"Project" dependencies and cross-project `ref` are features of dbt Cloud Enterprise, currently in Closed Beta. To access these features while they are in beta, please contact your account team at dbt Labs.
+:::info Available in Public Preview for dbt Cloud Enterprise accounts
+
+Project dependencies and cross-project `ref` are features available in [dbt Cloud Enterprise](https://www.getdbt.com/pricing), currently in [Public Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud).
+
+Enterprise users can use these features by designating a [public model](/docs/collaborate/govern/model-access) and adding a [cross-project ref](#how-to-use-ref).
 :::
+
 For a long time, dbt has supported code reuse and extension by installing other projects as [packages](/docs/build/packages). When you install another project as a package, you are pulling in its full source code, and adding it to your own. This enables you to call macros and run models defined in that other project.
 
 While this is a great way to reuse code, share utility macros, and establish a starting point for common transformations, it's not a great way to enable collaboration across teams and at scale, especially at larger organizations.
 
@@ -17,6 +22,33 @@ This year, dbt Labs is introducing an expanded notion of `dependencies` across m
 - **Packages** — Familiar and pre-existing type of dependency. You take this dependency by installing the package's full source code (like a software library).
 - **Projects** — A _new_ way to take a dependency on another project. Using a metadata service that runs behind the scenes, dbt Cloud resolves references on-the-fly to public models defined in other projects. You don't need to parse or run those upstream models yourself. Instead, you treat your dependency on those models as an API that returns a dataset. The maintainer of the public model is responsible for guaranteeing its quality and stability.
 
+
+Starting in dbt v1.6 or higher, `packages.yml` has been renamed to `dependencies.yml`. However, if you need to use Jinja within your packages config, such as an environment variable for your private package, you need to keep using `packages.yml` for your packages for now. Refer to the [FAQs](#faqs) for more info.
+
+## Prerequisites
+
+In order to add project dependencies and resolve cross-project `ref`, you must:
+- Use dbt v1.6 or higher for **both** the upstream ("producer") project and the downstream ("consumer") project.
+- Have a deployment environment in the upstream ("producer") project [that is set to be your production environment](/docs/deploy/deploy-environments#set-as-production-environment) +- Have a successful run of the upstream ("producer") project +- Have a multi-tenant or single-tenant [dbt Cloud Enterprise](https://www.getdbt.com/pricing) account (Azure ST is not supported but coming soon) + + ## Example As an example, let's say you work on the Marketing team at the Jaffle Shop. The name of your team's project is `jaffle_marketing`: @@ -30,7 +62,7 @@ name: jaffle_marketing As part of your modeling of marketing data, you need to take a dependency on two other projects: -- `dbt_utils` as a [package](#packages-use-case): An collection of utility macros that you can use while writing the SQL for your own models. This package is, open-source public, and maintained by dbt Labs. +- `dbt_utils` as a [package](#packages-use-case): A collection of utility macros that you can use while writing the SQL for your own models. This package is, open-source public, and maintained by dbt Labs. - `jaffle_finance` as a [project use-case](#projects-use-case): Data models about the Jaffle Shop's revenue. This project is private and maintained by your colleagues on the Finance team. You want to select from some of this project's final models, as a starting point for your own work. @@ -60,7 +92,7 @@ When you're building on top of another team's work, resolving the references in - You don't need to mirror any conditional configuration of the upstream project such as `vars`, environment variables, or `target.name`. You can reference them directly wherever the Finance team is building their models in production. Even if the Finance team makes changes like renaming the model, changing the name of its schema, or [bumping its version](/docs/collaborate/govern/model-versions), your `ref` would still resolve successfully. - You eliminate the risk of accidentally building those models with `dbt run` or `dbt build`. While you can select those models, you can't actually build them. This prevents unexpected warehouse costs and permissions issues. This also ensures proper ownership and cost allocation for each team's models. -### Usage +### How to use ref **Writing `ref`:** Models referenced from a `project`-type dependency must use [two-argument `ref`](/reference/dbt-jinja-functions/ref#two-argument-variant), including the project name: @@ -81,6 +113,8 @@ with monthly_revenue as ( **Cycle detection:** Currently, "project" dependencies can only go in one direction, meaning that the `jaffle_finance` project could not add a new model that depends, in turn, on `jaffle_marketing.roi_by_channel`. dbt will check for cycles across projects and raise errors if any are detected. We are considering support for this pattern in the future, whereby dbt would still check for node-level cycles while allowing cycles at the project level. +For more guidance on how to use dbt Mesh, refer to the dedicated [dbt Mesh guide](/best-practices/how-we-mesh/mesh-1-intro). + ### Comparison If you were to instead install the `jaffle_finance` project as a `package` dependency, you would instead be pulling down its full source code and adding it to your runtime environment. 
This means: @@ -93,4 +127,16 @@ There are a few cases where installing another internal project as a package can - Unified deployments — In a production environment, if the central data platform team of Jaffle Shop wanted to schedule the deployment of models across both `jaffle_finance` and `jaffle_marketing`, they could use dbt's [selection syntax](/reference/node-selection/syntax) to create a new "passthrough" project that installed both projects as packages. - Coordinated changes — In development, if you wanted to test the effects of a change to a public model in an upstream project (`jaffle_finance.monthly_revenue`) on a downstream model (`jaffle_marketing.roi_by_channel`) _before_ introducing changes to a staging or production environment, you can install the `jaffle_finance` package as a package within `jaffle_marketing`. The installation can point to a specific git branch, however, if you find yourself frequently needing to perform end-to-end testing across both projects, we recommend you re-examine if this represents a stable interface boundary. -These are the exceptions, rather than the rule. Installing another team's project as a package adds complexity, latency, and risk of unnecessary costs. By defining clear interface boundaries across teams, by serving one team's public models as "APIs" to another, and by enabling practitioners to develop with a more narrowly-defined scope, we can enable more people to contribute, with more confidence, while requiring less context upfront. +These are the exceptions, rather than the rule. Installing another team's project as a package adds complexity, latency, and risk of unnecessary costs. By defining clear interface boundaries across teams, by serving one team's public models as "APIs" to another, and by enabling practitioners to develop with a more narrowly defined scope, we can enable more people to contribute, with more confidence, while requiring less context upfront. + +## FAQs + +
+Can I define private packages in the dependencies.yml file? + +If you're using private packages with the [git token method](/docs/build/packages#git-token-method), you must define them in the `packages.yml` file instead of the `dependencies.yml` file. This is because conditional rendering (like Jinja-in-yaml) is not supported. +
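+
+For reference, a minimal sketch of what that might look like in `packages.yml` (the environment variable name, repository URL, and revision here are placeholders):
+
+```yaml
+packages:
+  - git: "https://{{env_var('DBT_ENV_SECRET_GIT_CREDENTIAL')}}@github.com/your-org/your_private_package.git"   # placeholder URL
+    revision: "1.0.0"   # placeholder tag or branch
+```
+
+Because the `env_var` call has to be rendered with Jinja, this entry needs to stay in `packages.yml` rather than `dependencies.yml`.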
+ + +## Related docs +- Refer to the [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro) guide for more guidance on how to use dbt Mesh. diff --git a/website/docs/docs/community-adapters.md b/website/docs/docs/community-adapters.md index 6569a78459b..444ea0e04b4 100644 --- a/website/docs/docs/community-adapters.md +++ b/website/docs/docs/community-adapters.md @@ -11,10 +11,10 @@ Community adapters are adapter plugins contributed and maintained by members of | [Clickhouse](/docs/core/connect-data-platform/clickhouse-setup) | [Hive](/docs/core/connect-data-platform/hive-setup) | [Rockset](/docs/core/connect-data-platform/rockset-setup) | | [IBM DB2](/docs/core/connect-data-platform/ibmdb2-setup) | [Impala](/docs/core/connect-data-platform/impala-setup) | [SingleStore](/docs/core/connect-data-platform/singlestore-setup) | | [Doris & SelectDB](/docs/core/connect-data-platform/doris-setup) | [Infer](/docs/core/connect-data-platform/infer-setup) | [SQLite](/docs/core/connect-data-platform/sqlite-setup) | -| [DuckDB](/docs/core/connect-data-platform/duckdb-setup) | [iomete](/docs/core/connect-data-platform/iomete-setup) | [SQL Server & Azure SQL](/docs/core/connect-data-platform/mssql-setup) | -| [Dremio](/docs/core/connect-data-platform/dremio-setup) | [Layer](/docs/core/connect-data-platform/layer-setup) | [Teradata](/docs/core/connect-data-platform/teradata-setup) | -| [Exasol Analytics](/docs/core/connect-data-platform/exasol-setup) | [Materialize](/docs/core/connect-data-platform/materialize-setup) | [TiDB](/docs/core/connect-data-platform/tidb-setup) | -| [Firebolt](/docs/core/connect-data-platform/firebolt-setup) | [MindsDB](/docs/core/connect-data-platform/mindsdb-setup) | [Vertica](/docs/core/connect-data-platform/vertica-setup) | -| [AWS Glue](/docs/core/connect-data-platform/glue-setup) | [MySQL](/docs/core/connect-data-platform/mysql-setup)| | -| [Databend Cloud](/docs/core/connect-data-platform/databend-setup) | [fal - Python models](/docs/core/connect-data-platform/fal-setup) | | +| [Starrocks](/docs/core/connect-data-platform/starrocks-setup) | [DuckDB](/docs/core/connect-data-platform/duckdb-setup) | [iomete](/docs/core/connect-data-platform/iomete-setup) +| [SQL Server & Azure SQL](/docs/core/connect-data-platform/mssql-setup) | [Dremio](/docs/core/connect-data-platform/dremio-setup) | [Layer](/docs/core/connect-data-platform/layer-setup) +| [Teradata](/docs/core/connect-data-platform/teradata-setup) | [Exasol Analytics](/docs/core/connect-data-platform/exasol-setup) | [Materialize](/docs/core/connect-data-platform/materialize-setup) +| [TiDB](/docs/core/connect-data-platform/tidb-setup) | [Firebolt](/docs/core/connect-data-platform/firebolt-setup) | [MindsDB](/docs/core/connect-data-platform/mindsdb-setup) +| [Vertica](/docs/core/connect-data-platform/vertica-setup) | [AWS Glue](/docs/core/connect-data-platform/glue-setup) | [MySQL](/docs/core/connect-data-platform/mysql-setup) | +| [Upsolver](/docs/core/connect-data-platform/upsolver-setup) | [Databend Cloud](/docs/core/connect-data-platform/databend-setup) | [fal - Python models](/docs/core/connect-data-platform/fal-setup) | diff --git a/website/docs/docs/connect-adapters.md b/website/docs/docs/connect-adapters.md index 5632fb3793e..e301cfc237e 100644 --- a/website/docs/docs/connect-adapters.md +++ b/website/docs/docs/connect-adapters.md @@ -3,34 +3,20 @@ title: "How to connect to adapters" id: "connect-adapters" --- -Adapters are an essential component of dbt. 
At their most basic level, they are how dbt connects with the various supported data platforms. At a higher-level, adapters strive to give analytics engineers more transferrable skills as well as standardize how analytics projects are structured. Gone are the days where you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt — for more detail, read the [What are adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) guide. +Adapters are an essential component of dbt. At their most basic level, they are how dbt connects with the various supported data platforms. At a higher-level, adapters strive to give analytics engineers more transferrable skills as well as standardize how analytics projects are structured. Gone are the days where you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt — for more detail, refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide. -This section provides more details on different ways you can connect dbt to an adapter, and explains what a maintainer is. +This section provides more details on different ways you can connect dbt to an adapter, and explains what a maintainer is. ### Set up in dbt Cloud -Explore the fastest and most reliable way to deploy dbt using dbt Cloud, a hosted architecture that runs dbt Core across your organization. dbt Cloud lets you seamlessly [connect](/docs/cloud/about-cloud-setup) with a variety of [verified](/docs/supported-data-platforms) data platform providers directly in the dbt Cloud UI. +Explore the fastest and most reliable way to deploy dbt using dbt Cloud, a hosted architecture that runs dbt Core across your organization. dbt Cloud lets you seamlessly [connect](/docs/cloud/about-cloud-setup) with a variety of [verified](/docs/supported-data-platforms) data platform providers directly in the dbt Cloud UI. -dbt Cloud supports data platforms that are verified and [maintained](#maintainers) by dbt Labs or partners. This level of support ensures that users can trust certain adapters for use in production. +### Install with dbt Core -### Install using the CLI - -Install dbt Core, which is an open-source tool, locally using the CLI. dbt communicates with a number of different data platforms by using a dedicated adapter plugin for each. When you install dbt Core, you'll also need to install the specific adapter for your database, [connect to dbt Core](/docs/core/about-core-setup), and set up a `profiles.yml` file. - -Data platforms supported in dbt Core may be verified or unverified, and are [maintained](#maintainers) by dbt Labs, partners, or community members. +Install dbt Core, an open-source tool, locally using the command line. dbt communicates with a number of different data platforms by using a dedicated adapter plugin for each. When you install dbt Core, you'll also need to install the specific adapter for your database, [connect to dbt Core](/docs/core/about-core-setup), and set up a `profiles.yml` file. With a few exceptions [^1], you can install all [Verified adapters](/docs/supported-data-platforms) from PyPI using `pip install adapter-name`. For example to install Snowflake, use the command `pip install dbt-snowflake`. The installation will include `dbt-core` and any other required dependencies, which may include both other dependencies and even other adapter plugins. 
Read more about [installing dbt](/docs/core/installation). - -## Maintainers - -Who made and maintains an adapter is certainly relevant, but we recommend using an adapter's verification status to determine the quality and health of an adapter. So far there are three categories of maintainers: - -| Supported by | Maintained By | -| ------------ | ---------------- | -| dbt Labs | dbt Labs maintains a set of adapter plugins for some of the most common databases, warehouses, and platforms. As for why particular data platforms were chosen, see ["Why Verify an Adapter"](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter#why-verify-an-adapter) | -| Partner | These adapter plugins are built and maintained by the same people who build and maintain the complementary data technology. | -| Community | These adapter plugins are contributed and maintained by members of the community. 🌱 | [^1]: Here are the two different adapters. Use the PyPI package name when installing with `pip` | Adapter repo name | PyPI package name | diff --git a/website/docs/docs/contribute-core-adapters.md b/website/docs/docs/contribute-core-adapters.md index 6e66a5d28ff..d3b1edf2a38 100644 --- a/website/docs/docs/contribute-core-adapters.md +++ b/website/docs/docs/contribute-core-adapters.md @@ -1,6 +1,7 @@ --- title: "Contribute to adapters" id: "contribute-core-adapters" +pagination_next: null --- The dbt Community exists to allow analytics practitioners share their knowledge, help others and collectively to drive forward the discipline of analytics engineering. There are opportunities here for everyone to contribute whether you're at the beginning your analytics engineering journey or you are a seasoned data professional. @@ -16,6 +17,6 @@ Community-supported plugins are works in progress, and anyone is welcome to cont ### Create a new adapter -If you see something missing from the lists above, and you're interested in developing an integration, read more about adapters and how they're developed in the [Adapter Development](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) section. +If you see something missing from the lists above, and you're interested in developing an integration, read more about adapters and how they're developed in the [Build, test, document, and promote adapters](/guides/adapter-creation). -If you have a new adapter, please add it to this list using a pull request! See [Documenting your adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) for more information. +If you have a new adapter, please add it to this list using a pull request! You can refer to [Build, test, document, and promote adapters](/guides/adapter-creation) for more information on documenting your adapter. diff --git a/website/docs/docs/core/about-core-setup.md b/website/docs/docs/core/about-core-setup.md index 0408e529b2d..64e7694b793 100644 --- a/website/docs/docs/core/about-core-setup.md +++ b/website/docs/docs/core/about-core-setup.md @@ -3,15 +3,17 @@ title: About dbt Core setup id: about-core-setup description: "Configuration settings for dbt Core." sidebar_label: "About dbt Core setup" +pagination_next: "docs/core/about-dbt-core" +pagination_prev: null --- dbt Core is an [open-source](https://github.com/dbt-labs/dbt-core) tool that enables data teams to transform data using analytics engineering best practices. You can install dbt locally in your environment and use dbt Core on the command line. It can communicate with databases through adapters. 
This section of our docs will guide you through various settings to get started: -- [About the CLI](/docs/core/about-the-cli) +- [About dbt Core](/docs/core/about-dbt-core) - [Installing dbt](/docs/core/installation) - [Connecting to a data platform](/docs/core/connect-data-platform/profiles.yml) - [How to run your dbt projects](/docs/running-a-dbt-project/run-your-dbt-projects) -If you need a more detailed first-time setup guide for specific data platforms, read our [quickstart guides](https://docs.getdbt.com/quickstarts). +If you need a more detailed first-time setup guide for specific data platforms, read our [quickstart guides](https://docs.getdbt.com/guides). diff --git a/website/docs/docs/core/about-dbt-core.md b/website/docs/docs/core/about-dbt-core.md new file mode 100644 index 00000000000..a35d92420f3 --- /dev/null +++ b/website/docs/docs/core/about-dbt-core.md @@ -0,0 +1,25 @@ +--- +title: "About dbt Core" +id: "about-dbt-core" +sidebar_label: "About dbt Core" +--- + +[dbt Core](https://github.com/dbt-labs/dbt-core) is an open sourced project where you can develop from the command line and run your dbt project. + +To use dbt Core, your workflow generally looks like: + +1. **Build your dbt project in a code editor —** popular choices include VSCode and Atom. + +2. **Run your project from the command line —** macOS ships with a default Terminal program, however you can also use iTerm or the command line prompt within a code editor to execute dbt commands. + +:::info How we set up our computers for working on dbt projects + +We've written a [guide](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) for our recommended setup when running dbt projects using dbt Core. + +::: + +If you're using the command line, we recommend learning some basics of your terminal to help you work more effectively. In particular, it's important to understand `cd`, `ls` and `pwd` to be able to navigate through the directory structure of your computer easily. + +You can find more information on installing and setting up the dbt Core [here](/docs/core/installation). + +**Note** — dbt supports a dbt Cloud CLI and dbt Core, both command line interface tools that enable you to run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). diff --git a/website/docs/docs/core/about-the-cli.md b/website/docs/docs/core/about-the-cli.md deleted file mode 100644 index d05fb514dfa..00000000000 --- a/website/docs/docs/core/about-the-cli.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "About the CLI" -id: "about-the-cli" -sidebar_label: "About the CLI" ---- - -dbt ships with a command line interface (CLI) for running your dbt project. This way of running dbt and a dbt project is free and open source. - -To use the CLI, your workflow generally looks like: -1. **Build your dbt project in a code editor —** popular choices include VSCode and Atom. - -1. **Run your project from the command line —** macOS ships with a default Terminal program, however you can also use iTerm or the command line prompt within a code editor to execute dbt commands. - -:::info How we set up our computers for working on dbt projects - -We've written a [guide](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) for our recommended setup when running dbt projects using the CLI. 
- -::: - -If you're using the CLI, we recommend learning some basics of your terminal to help you work more effectively. In particular, it's important to understand `cd`, `ls` and `pwd` to be able to navigate through the directory structure of your computer easily. - -You can find more information on installing and setting up the dbt CLI [here](/dbt-cli/cli-overview). diff --git a/website/docs/docs/core/connect-data-platform/about-core-connections.md b/website/docs/docs/core/connect-data-platform/about-core-connections.md index 802e197514c..492e5ae878a 100644 --- a/website/docs/docs/core/connect-data-platform/about-core-connections.md +++ b/website/docs/docs/core/connect-data-platform/about-core-connections.md @@ -4,6 +4,8 @@ id: "about-core-connections" description: "Information about data platform connections in dbt Core" sidebar_label: "About data platform connections in dbt Core" hide_table_of_contents: true +pagination_next: "docs/core/connect-data-platform/profiles.yml" +pagination_prev: null --- dbt Core can connect with a variety of data platform providers including: @@ -20,7 +22,7 @@ dbt communicates with a number of different data platforms by using a dedicated Data platforms supported in dbt Core may be verified or unverified, and maintained by dbt Labs, partners, or community members. -These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/docs/quickstarts/overview) +These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/docs/guides) ## Connection profiles diff --git a/website/docs/docs/core/connect-data-platform/alloydb-setup.md b/website/docs/docs/core/connect-data-platform/alloydb-setup.md index c3f3ee9cfca..c01ba06d887 100644 --- a/website/docs/docs/core/connect-data-platform/alloydb-setup.md +++ b/website/docs/docs/core/connect-data-platform/alloydb-setup.md @@ -3,7 +3,7 @@ title: "AlloyDB setup" meta: maintained_by: Community? authors: 'dbt-labs' - github_repo: 'dbt-labs/dbt-postgres' + github_repo: 'dbt-labs/dbt-core' pypi_package: 'dbt-postgres' min_core_version: 'v1.0.0' cloud_support: Not Supported diff --git a/website/docs/docs/core/connect-data-platform/bigquery-setup.md b/website/docs/docs/core/connect-data-platform/bigquery-setup.md index b0fc9fa7cf0..96eafadea3b 100644 --- a/website/docs/docs/core/connect-data-platform/bigquery-setup.md +++ b/website/docs/docs/core/connect-data-platform/bigquery-setup.md @@ -11,7 +11,7 @@ meta: min_supported_version: 'n/a' slack_channel_name: '#db-bigquery' slack_channel_link: 'https://getdbt.slack.com/archives/C99SNSRTK' - platform_name: 'Big Query' + platform_name: 'BigQuery' config_page: '/reference/resource-configs/bigquery-configs' --- @@ -74,32 +74,23 @@ my-bigquery-db: dev: type: bigquery method: oauth - project: [GCP project id] - dataset: [the name of your dbt dataset] # You can also use "schema" here - threads: [1 or more] - [](#optional-configurations): + project: GCP_PROJECT_ID + dataset: DBT_DATASET_NAME # You can also use "schema" here + threads: 4 # Must be a value of 1 or greater + [OPTIONAL_CONFIG](#optional-configurations): VALUE ```
**Default project** -New in dbt v0.19.0 - If you do not specify a `project`/`database` and are using the `oauth` method, dbt will use the default `project` associated with your user, as defined by `gcloud config set`. ### OAuth Token-Based See [docs](https://developers.google.com/identity/protocols/oauth2) on using OAuth 2.0 to access Google APIs. - - - +#### Refresh token Using the refresh token and client information, dbt will mint new access tokens as necessary. @@ -112,21 +103,19 @@ my-bigquery-db: dev: type: bigquery method: oauth-secrets - project: [GCP project id] - dataset: [the name of your dbt dataset] # You can also use "schema" here - threads: [1 or more] - refresh_token: [token] - client_id: [client id] - client_secret: [client secret] - token_uri: [redirect URI] - [](#optional-configurations): + project: GCP_PROJECT_ID + dataset: DBT_DATASET_NAME # You can also use "schema" here + threads: 4 # Must be a value of 1 or greater + refresh_token: TOKEN + client_id: CLIENT_ID + client_secret: CLIENT_SECRET + token_uri: REDIRECT_URI + [OPTIONAL_CONFIG](#optional-configurations): VALUE ``` - - - +#### Temporary token dbt will use the one-time access token, no questions asked. This approach makes sense if you have an external deployment process that can mint new access tokens and update the profile file accordingly. @@ -139,18 +128,15 @@ my-bigquery-db: dev: type: bigquery method: oauth-secrets - project: [GCP project id] - dataset: [the name of your dbt dataset] # You can also use "schema" here - threads: [1 or more] - token: [temporary access token] # refreshed + updated by external process - [](#optional-configurations): + project: GCP_PROJECT_ID + dataset: DBT_DATASET_NAME # You can also use "schema" here + threads: 4 # Must be a value of 1 or greater + token: TEMPORARY_ACCESS_TOKEN # refreshed + updated by external process + [OPTIONAL_CONFIG](#optional-configurations): VALUE ``` - - - ### Service Account File @@ -163,11 +149,11 @@ my-bigquery-db: dev: type: bigquery method: service-account - project: [GCP project id] - dataset: [the name of your dbt dataset] - threads: [1 or more] - keyfile: [/path/to/bigquery/keyfile.json] - [](#optional-configurations): + project: GCP_PROJECT_ID + dataset: DBT_DATASET_NAME + threads: 4 # Must be a value of 1 or greater + keyfile: /PATH/TO/BIGQUERY/keyfile.json + [OPTIONAL_CONFIG](#optional-configurations): VALUE ``` @@ -191,10 +177,10 @@ my-bigquery-db: dev: type: bigquery method: service-account-json - project: [GCP project id] - dataset: [the name of your dbt dataset] - threads: [1 or more] - [](#optional-configurations): + project: GCP_PROJECT_ID + dataset: DBT_DATASET_NAME + threads: 4 # Must be a value of 1 or greater + [OPTIONAL_CONFIG](#optional-configurations): VALUE # These fields come from the service account json keyfile keyfile_json: @@ -233,8 +219,6 @@ my-profile: ### Timeouts and Retries - - The `dbt-bigquery` plugin uses the BigQuery Python client library to submit queries. Each query requires two steps: 1. Job creation: Submit the query job to BigQuery, and receive its job ID. 2. Job execution: Wait for the query job to finish executing, and receive its result. @@ -251,11 +235,17 @@ In older versions of `dbt-bigquery`, this same config was called `timeout_second ::: -No timeout is set by default. (For historical reasons, some query types use a default of 300 seconds when the `job_execution_timeout_seconds` configuration is not set.) 
When `job_execution_timeout_seconds` is set, if any dbt query, including a model's SQL transformation, takes longer than 300 seconds to complete, BigQuery might cancel the query and issue the following error: +No timeout is set by default. (For historical reasons, some query types use a default of 300 seconds when the `job_execution_timeout_seconds` configuration is not set). When you do set the `job_execution_timeout_seconds`, if any dbt query takes more than 300 seconds to finish, the dbt-bigquery adapter will run into an exception: ``` Operation did not complete within the designated timeout. ``` + +:::caution Note + +The `job_execution_timeout_seconds` represents the number of seconds to wait for the [underlying HTTP transport](https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result). It _doesn't_ represent the maximum allowable time for a BigQuery job itself. So, if dbt-bigquery ran into an exception at 300 seconds, the actual BigQuery job could still be running for the time set in BigQuery's own timeout settings. + +::: You can change the timeout seconds for the job execution step by configuring `job_execution_timeout_seconds` in the BigQuery profile: @@ -315,57 +305,6 @@ my-profile: - - - - -BigQuery supports query timeouts. By default, the timeout is set to 300 seconds. If a dbt model takes longer than this timeout to complete, then BigQuery may cancel the query and issue the following error: - -``` - Operation did not complete within the designated timeout. -``` - -To change this timeout, use the `timeout_seconds` configuration: - - - -```yaml -my-profile: - target: dev - outputs: - dev: - type: bigquery - method: oauth - project: abc-123 - dataset: my_dataset - timeout_seconds: 600 # 10 minutes -``` - - - -The `retries` profile configuration designates the number of times dbt should retry queries that result in unhandled server errors. This configuration is only specified for BigQuery targets. Example: - - - -```yaml -# This example target will retry BigQuery queries 5 -# times with a delay. If the query does not succeed -# after the fifth attempt, then dbt will raise an error - -my-profile: - target: dev - outputs: - dev: - type: bigquery - method: oauth - project: abc-123 - dataset: my_dataset - retries: 5 -``` - - - - ### Dataset locations @@ -387,12 +326,6 @@ my-profile: ### Maximum Bytes Billed - - -- New in dbt v0.17.0 - - - When a `maximum_bytes_billed` value is configured for a BigQuery profile, queries executed by dbt will fail if they exceed the configured maximum bytes threshhold. This configuration should be supplied as an integer number @@ -439,7 +372,6 @@ my-profile: ``` ### Service Account Impersonation -New in v0.18.0 This feature allows users authenticating via local OAuth to access BigQuery resources based on the permissions of a service account. @@ -461,7 +393,6 @@ For a general overview of this process, see the official docs for [Creating Shor ### Execution project -New in v0.21.0 By default, dbt will use the specified `project`/`database` as both: 1. 
The location to materialize resources (models, seeds, snapshots, etc), unless they specify a custom `project`/`database` config @@ -524,6 +455,7 @@ my-profile: dataproc_region: us-central1 submission_method: serverless dataproc_batch: + batch_id: MY_CUSTOM_BATCH_ID # Supported in v1.7+ environment_config: execution_config: service_account: dbt@abc-123.iam.gserviceaccount.com @@ -533,7 +465,7 @@ my-profile: role: dev runtime_config: properties: - spark.executor.instances: 3 + spark.executor.instances: "3" spark.driver.memory: 1g ``` diff --git a/website/docs/docs/core/connect-data-platform/databricks-setup.md b/website/docs/docs/core/connect-data-platform/databricks-setup.md index 0d24a3b04aa..caf52d09de3 100644 --- a/website/docs/docs/core/connect-data-platform/databricks-setup.md +++ b/website/docs/docs/core/connect-data-platform/databricks-setup.md @@ -31,8 +31,6 @@ meta:
  • Minimum data platform version: {frontMatter.meta.min_supported_version}
  • -## Installation and Distribution -

    Installing {frontMatter.meta.pypi_package}

    @@ -48,17 +46,27 @@ pip is the easiest way to install the adapter:

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    -`dbt-databricks` is the recommend adapter for Databricks - -`dbt-databricks` includes features not available in `dbt-spark`: +`dbt-databricks` is the recommended adapter for Databricks. It includes features not available in `dbt-spark`, such as: - Unity Catalog support - No need to install additional drivers or dependencies for use on the CLI - Use of Delta Lake for all models out of the box - SQL macros that are optimized to run with [Photon](https://docs.databricks.com/runtime/photon.html) -### Set up a Databricks Target +## Connecting to Databricks + +To connect to a data platform with dbt Core, create the appropriate _profile_ and _target_ YAML keys/values in the `profiles.yml` configuration file for your Databricks SQL Warehouse/cluster. This dbt YAML file lives in the `.dbt/` directory of your user/home directory. For more info, refer to [Connection profiles](/docs/core/connect-data-platform/connection-profiles) and [profiles.yml](/docs/core/connect-data-platform/profiles.yml). + +`dbt-databricks` can connect to Databricks SQL Warehouses and all-purpose clusters. Databricks SQL Warehouses is the recommended way to get started with Databricks. + +Refer to the [Databricks docs](https://docs.databricks.com/dev-tools/dbt.html#) for more info on how to obtain the credentials for configuring your profile. + +### Examples {#examples} + +You can use either token-based authentication or OAuth client-based authentication to connect to Databricks. Refer to the following examples for more info on how to configure your profile for each type of authentication. + + -dbt-databricks can connect to the Databricks SQL Warehouses and all-purpose clusters. Databricks SQL Warehouses is the recommended way to get started with Databricks. + @@ -69,19 +77,76 @@ your_profile_name: dev: type: databricks catalog: [optional catalog name if you are using Unity Catalog] - schema: [schema name] - host: [yourorg.databrickshost.com] - http_path: [/sql/your/http/path] - token: [dapiXXXXXXXXXXXXXXXXXXXXXXX] # Personal Access Token (PAT) - threads: [1 or more] # optional, default 1 + schema: [schema name] # Required + host: [yourorg.databrickshost.com] # Required + http_path: [/sql/your/http/path] # Required + token: [dapiXXXXXXXXXXXXXXXXXXXXXXX] # Required Personal Access Token (PAT) if using token-based authentication + threads: [1 or more] # Optional, default 1 ``` -See the [Databricks documentation](https://docs.databricks.com/dev-tools/dbt.html#) on how -to obtain the credentials for configuring your profile. + + + + + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: databricks + catalog: [optional catalog name if you are using Unity Catalog] + schema: [schema name] # Required + host: [yourorg.databrickshost.com] # Required + http_path: [/sql/your/http/path] # Required + auth_type: oauth # Required if using OAuth-based authentication + client_id: [OAuth-Client-ID] # The ID of your OAuth application. Required if using OAuth-based authentication + client_secret: [XXXXXXXXXXXXXXXXXXXXXXXXXXX] # OAuth client secret. # Required if using OAuth-based authentication + threads: [1 or more] # Optional, default 1 +``` + + + + + + +## Host parameters + +The following profile fields are always required. + +| Field | Description | Example | +| --------- | ------- | ----------- | +| `host` | The hostname of your cluster.

    Don't include the `http://` or `https://` prefix. | `yourorg.databrickshost.com` | +| `http_path` | The HTTP path to your SQL Warehouse or all-purpose cluster. | `/sql/your/http/path` | +| `schema` | The name of a schema within your cluster's catalog.

    It's _not recommended_ to use schema names that have upper case or mixed case letters. | `my_schema` | + +## Authentication parameters + +The `dbt-databricks` adapter supports both [token-based authentication](/docs/core/connect-data-platform/databricks-setup?tokenoauth=token#examples) and [OAuth client-based authentication](/docs/core/connect-data-platform/databricks-setup?tokenoauth=oauth#examples). + +Refer to the following **required** parameters to configure your profile for each type of authentication: + +| Field | Authentication type | Description | Example | +| --------- | ------- | ----------- | ---- | +| `token` | Token-based | The Personal Access Token (PAT) to connect to Databricks. | `dapiXXXXXXXXX`
    `XXXXXXXXXXXXXX` | +| `client_id` | OAuth-based | The client ID for your Databricks OAuth application.
    | `` | +| `client_secret` | OAuth-based | The client secret for your Databricks OAuth application.
    | `XXXXXXXXXXXXX`
    `XXXXXXXXXXXXXX` | +| `auth_type` | OAuth-based | The type of authorization needed to connect to Databricks.
    | `oauth` | + +## Additional parameters +The following profile fields are optional to set up. They help you configure how your cluster's session and dbt work for your connection. +| Profile field | Description | Example | +| ------------- | ------------------- | --------------- | +| `threads` | The number of threads dbt should use (default is `1`) |`8` | +| `connect_retries` | The number of times dbt should retry the connection to Databricks (default is `1`) |`3` | +| `connect_timeout` | How many seconds before the connection to Databricks should timeout (default behavior is no timeouts) | `1000` | +| `session_properties` | This sets the Databricks session properties used in the connection. Execute `SET -v` to see available options |`ansi_mode: true` | ## Supported Functionality diff --git a/website/docs/docs/core/connect-data-platform/duckdb-setup.md b/website/docs/docs/core/connect-data-platform/duckdb-setup.md index 7896e4abeae..a3fee5a5164 100644 --- a/website/docs/docs/core/connect-data-platform/duckdb-setup.md +++ b/website/docs/docs/core/connect-data-platform/duckdb-setup.md @@ -4,7 +4,7 @@ description: "Read this guide to learn about the DuckDB warehouse setup in dbt." meta: maintained_by: Community authors: 'Josh Wills (https://github.com/jwills)' - github_repo: 'jwills/dbt-duckdb' + github_repo: 'duckdb/dbt-duckdb' pypi_package: 'dbt-duckdb' min_core_version: 'v1.0.1' cloud_support: Not Supported diff --git a/website/docs/docs/core/connect-data-platform/fabric-setup.md b/website/docs/docs/core/connect-data-platform/fabric-setup.md index aa7784d96ec..ef5a748552d 100644 --- a/website/docs/docs/core/connect-data-platform/fabric-setup.md +++ b/website/docs/docs/core/connect-data-platform/fabric-setup.md @@ -15,9 +15,9 @@ meta: :::info -Below is a guide for use with "Synapse Data Warehouse" a new product within Microsoft Fabric (preview) ([more info](https://learn.microsoft.com/en-us/fabric/data-warehouse/data-warehousing#synapse-data-warehouse)) +Below is a guide for use with [Synapse Data Warehouse](https://learn.microsoft.com/en-us/fabric/data-warehouse/data-warehousing#synapse-data-warehouse), a new product within Microsoft Fabric. -To learn how to set up dbt with Azure Synapse Dedicated Pools, see [Microsoft Azure Synapse DWH setup](/docs/core/connect-data-platform/azuresynapse-setup) +To learn how to set up dbt with Azure Synapse Dedicated Pools, refer to [Microsoft Azure Synapse DWH setup](/docs/core/connect-data-platform/azuresynapse-setup). ::: diff --git a/website/docs/docs/core/connect-data-platform/glue-setup.md b/website/docs/docs/core/connect-data-platform/glue-setup.md index e0fb9556853..e56e5bcd902 100644 --- a/website/docs/docs/core/connect-data-platform/glue-setup.md +++ b/website/docs/docs/core/connect-data-platform/glue-setup.md @@ -58,15 +58,14 @@ For further (and more likely up-to-date) info, see the [README](https://github.c ### Configuring your AWS profile for Glue Interactive Session There are two IAM principals used with interactive sessions. -- Client principal: The princpal (either user or role) calling the AWS APIs (Glue, Lake Formation, Interactive Sessions) -from the local client. This is the principal configured in the AWS CLI and likely the same. +- Client principal: The principal (either user or role) calling the AWS APIs (Glue, Lake Formation, Interactive Sessions) +from the local client. This is the principal configured in the AWS CLI and is likely the same. - Service role: The IAM role that AWS Glue uses to execute your session. 
This is the same as AWS Glue ETL. Read [this documentation](https://docs.aws.amazon.com/glue/latest/dg/glue-is-security.html) to configure these principals. - -You will find bellow a least privileged policy to enjoy all features of **`dbt-glue`** adapter. +You will find below a least privileged policy to enjoy all features of **`dbt-glue`** adapter. Please to update variables between **`<>`**, here are explanations of these arguments: @@ -74,7 +73,7 @@ Please to update variables between **`<>`**, here are explanations of these argu |---|---| |region|The region where your Glue database is stored | |AWS Account|The AWS account where you run your pipeline| -|dbt output database|The database updated by dbt (this is the database configured in the profile.yml of your dbt environment)| +|dbt output database|The database updated by dbt (this is the schema configured in the profile.yml of your dbt environment)| |dbt source database|All databases used as source| |dbt output bucket|The bucket name where the data will be generated by dbt (the location configured in the profile.yml of your dbt environment)| |dbt source bucket|The bucket name of source databases (if they are not managed by Lake Formation)| @@ -113,9 +112,19 @@ Please to update variables between **`<>`**, here are explanations of these argu "glue:BatchDeleteTableVersion", "glue:BatchDeleteTable", "glue:DeletePartition", + "glue:GetUserDefinedFunctions", "lakeformation:ListResources", "lakeformation:BatchGrantPermissions", - "lakeformation:ListPermissions" + "lakeformation:ListPermissions", + "lakeformation:GetDataAccess", + "lakeformation:GrantPermissions", + "lakeformation:RevokePermissions", + "lakeformation:BatchRevokePermissions", + "lakeformation:AddLFTagsToResource", + "lakeformation:RemoveLFTagsFromResource", + "lakeformation:GetResourceLFTags", + "lakeformation:ListLFTags", + "lakeformation:GetLFTag", ], "Resource": [ "arn:aws:glue:::catalog", @@ -189,7 +198,7 @@ Please to update variables between **`<>`**, here are explanations of these argu ### Configuration of the local environment -Because **`dbt`** and **`dbt-glue`** adapter are compatible with Python versions 3.8, and 3.9, check the version of Python: +Because **`dbt`** and **`dbt-glue`** adapters are compatible with Python versions 3.7, 3.8, and 3.9, check the version of Python: ```bash $ python3 --version @@ -212,12 +221,17 @@ $ unzip awscliv2.zip $ sudo ./aws/install ``` -Configure the aws-glue-session package +Install boto3 package ```bash $ sudo yum install gcc krb5-devel.x86_64 python3-devel.x86_64 -y $ pip3 install —upgrade boto3 -$ pip3 install —upgrade aws-glue-sessions +``` + +Install the package: + +```bash +$ pip3 install dbt-glue ``` ### Example config @@ -232,7 +246,6 @@ workers: 2 worker_type: G.1X idle_timeout: 10 schema: "dbt_demo" -database: "dbt_demo" session_provisioning_timeout_in_seconds: 120 location: "s3://dbt_demo_bucket/dbt_demo_data" ``` @@ -241,24 +254,788 @@ location: "s3://dbt_demo_bucket/dbt_demo_data" The table below describes all the options. -|Option |Description | Mandatory | -|---|---|---| -|project_name |The dbt project name. This must be the same as the one configured in the dbt project. |yes| -|type |The driver to use. |yes| -|query-comment |A string to inject as a comment in each query that dbt runs. |no| -|role_arn |The ARN of the interactive session role created as part of the CloudFormation template. |yes| -|region |The AWS Region where you run the data pipeline. 
|yes| -|workers |The number of workers of a defined workerType that are allocated when a job runs. |yes| -|worker_type |The type of predefined worker that is allocated when a job runs. Accepts a value of Standard, G.1X, or G.2X. |yes| -|schema |The schema used to organize data stored in Amazon S3. |yes| -|database |The database in Lake Formation. The database stores metadata tables in the Data Catalog. |yes| -|session_provisioning_timeout_in_seconds |The timeout in seconds for AWS Glue interactive session provisioning. |yes| -|location |The Amazon S3 location of your target data. |yes| -|idle_timeout |The AWS Glue session idle timeout in minutes. (The session stops after being idle for the specified amount of time.) |no| -|glue_version |The version of AWS Glue for this session to use. Currently, the only valid options are 2.0 and 3.0. The default value is 2.0. |no| -|security_configuration |The security configuration to use with this session. |no| -|connections |A comma-separated list of connections to use in the session. |no| +| Option | Description | Mandatory | +|-----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| +| project_name | The dbt project name. This must be the same as the one configured in the dbt project. | yes | +| type | The driver to use. | yes | +| query-comment | A string to inject as a comment in each query that dbt runs. | no | +| role_arn | The ARN of the glue interactive session IAM role. | yes | +| region | The AWS Region where you run the data pipeline. | yes | +| workers | The number of workers of a defined workerType that are allocated when a job runs. | yes | +| worker_type | The type of predefined worker that is allocated when a job runs. Accepts a value of Standard, G.1X, or G.2X. | yes | +| schema | The schema used to organize data stored in Amazon S3.Additionally, is the database in AWS Lake Formation that stores metadata tables in the Data Catalog. | yes | +| session_provisioning_timeout_in_seconds | The timeout in seconds for AWS Glue interactive session provisioning. | yes | +| location | The Amazon S3 location of your target data. | yes | +| query_timeout_in_minutes | The timeout in minutes for a single query. Default is 300 | no | +| idle_timeout | The AWS Glue session idle timeout in minutes. (The session stops after being idle for the specified amount of time) | no | +| glue_version | The version of AWS Glue for this session to use. Currently, the only valid options are 2.0 and 3.0. The default value is 3.0. | no | +| security_configuration | The security configuration to use with this session. | no | +| connections | A comma-separated list of connections to use in the session. | no | +| conf | Specific configuration used at the startup of the Glue Interactive Session (arg --conf) | no | +| extra_py_files | Extra python Libs that can be used by the interactive session. | no | +| delta_athena_prefix | A prefix used to create Athena-compatible tables for Delta tables (if not specified, then no Athena-compatible table will be created) | no | +| tags | The map of key-value pairs (tags) belonging to the session. 
Ex: `KeyName1=Value1,KeyName2=Value2` | no | +| seed_format | By default `parquet`, can be Spark format compatible like `csv` or `json` | no | +| seed_mode | By default `overwrite`, the seed data will be overwritten, you can set it to `append` if you just want to add new data in your dataset | no | +| default_arguments | The map of key-value pairs parameters belonging to the session. More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html). Ex: `--enable-continuous-cloudwatch-log=true,--enable-continuous-log-filter=true` | no | +| glue_session_id | re-use the glue-session to run multiple dbt run commands: set a glue session id you need to use | no | +| glue_session_reuse | Reuse the glue-session to run multiple dbt run commands: If set to true, the glue session will not be closed for re-use. If set to false, the session will be closed | no | +| datalake_formats | The ACID data lake format that you want to use if you are doing merge, can be `hudi`, `ìceberg` or `delta` |no| + +## Configs + +### Configuring tables + +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-spark plugin, in addition to the standard [model configs](/reference/model-configs). + +| Option | Description | Required? | Example | +|---------|----------------------------------------------------|-------------------------|--------------------------| +| file_format | The file format to use when creating tables (`parquet`, `csv`, `json`, `text`, `jdbc` or `orc`). | Optional | `parquet`| +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | +| custom_location | By default, the adapter will store your data in the following path: `location path`/`schema`/`table`. If you don't want to follow that default behaviour, you can use this parameter to set your own custom location on S3 | No | `s3://mycustombucket/mycustompath` | +| hudi_options | When using file_format `hudi`, gives the ability to overwrite any of the default configuration options. | Optional | `{'hoodie.schema.on.read.enable': 'true'}` | +## Incremental models + +dbt seeks to offer useful and intuitive modeling abstractions by means of its built-in configurations and materializations. + +For that reason, the dbt-glue plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values: + - **`append`** (default): Insert new records without updating or overwriting any existing data. + - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data. + - **`merge`** (Apache Hudi and Apache Iceberg only): Match records based on a `unique_key`; update old records, and insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + +Each of these strategies has its pros and cons, which we'll discuss below. 
As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. + +**Notes:** +The default strategy is **`insert_overwrite`** + +### The `append` strategy + +Following the `append` strategy, dbt will perform an `insert into` statement with all new data. The appeal of this strategy is that it is straightforward and functional across all platforms, file types, connection methods, and Apache Spark versions. However, this strategy _cannot_ update, overwrite, or delete existing data, so it is likely to insert duplicate records for many data sources. + +#### Source code +```sql +{{ config( + materialized='incremental', + incremental_strategy='append', +) }} + +-- All rows returned by this query will be appended to the existing table + +select * from {{ ref('events') }} +{% if is_incremental() %} + where event_ts > (select max(event_ts) from {{ this }}) +{% endif %} +``` +#### Run Code +```sql +create temporary view spark_incremental__dbt_tmp as + + select * from analytics.events + + where event_ts >= (select max(event_ts) from {{ this }}) + +; + +insert into table analytics.spark_incremental + select `date_day`, `users` from spark_incremental__dbt_tmp +``` + +### The `insert_overwrite` strategy + +This strategy is most effective when specified alongside a `partition_by` clause in your model config. dbt will run an [atomic `insert overwrite` statement](https://spark.apache.org/docs/latest/sql-ref-syntax-dml-insert-overwrite-table.html) that dynamically replaces all partitions included in your query. Be sure to re-select _all_ of the relevant data for a partition when using this incremental strategy. + +If no `partition_by` is specified, then the `insert_overwrite` strategy will atomically replace all contents of the table, overriding all existing data with only the new records. The column schema of the table remains the same, however. This can be desirable in some limited circumstances since it minimizes downtime while the table contents are overwritten. The operation is comparable to running `truncate` + `insert` on other databases. For atomic replacement of Delta-formatted tables, use the `table` materialization (which runs `create or replace`) instead. + +#### Source Code +```sql +{{ config( + materialized='incremental', + partition_by=['date_day'], + file_format='parquet' +) }} + +/* + Every partition returned by this query will be overwritten + when this model runs +*/ + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + date_day, + count(*) as users + +from events +group by 1 +``` + +#### Run Code + +```sql +create temporary view spark_incremental__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + date_day, + count(*) as users + + from events + group by 1 + +; + +insert overwrite table analytics.spark_incremental + partition (date_day) + select `date_day`, `users` from spark_incremental__dbt_tmp +``` + +Specifying `insert_overwrite` as the incremental strategy is optional since it's the default strategy used when none is specified. + +### The `merge` strategy + +**Compatibility:** +- Hudi : OK +- Delta Lake : OK +- Iceberg : OK +- Lake Formation Governed Tables : On going + +NB: + +- For Glue 3: you have to set up a [Glue connectors](https://docs.aws.amazon.com/glue/latest/ug/connectors-chapter.html). 
+ +- For Glue 4: use the `datalake_formats` option in your profile.yml + +When using a connector be sure that your IAM role has these policies: +``` +{ + "Sid": "access_to_connections", + "Action": [ + "glue:GetConnection", + "glue:GetConnections" + ], + "Resource": [ + "arn:aws:glue:::catalog", + "arn:aws:glue:::connection/*" + ], + "Effect": "Allow" +} +``` +and that the managed policy `AmazonEC2ContainerRegistryReadOnly` is attached. +Be sure that you follow the getting started instructions [here](https://docs.aws.amazon.com/glue/latest/ug/setting-up.html#getting-started-min-privs-connectors). + + +This [blog post](https://aws.amazon.com/blogs/big-data/part-1-integrate-apache-hudi-delta-lake-apache-iceberg-datasets-at-scale-aws-glue-studio-notebook/) also explains how to set up and works with Glue Connectors + +#### Hudi + +**Usage notes:** The `merge` with Hudi incremental strategy requires: +- To add `file_format: hudi` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: hudi` + - Alternatively, to add a connection in your profile: `connections: name_of_your_hudi_connector` +- To add Kryo serializer in your Interactive Session Config (in your profile): `conf: spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false` + +dbt will run an [atomic `merge` statement](https://hudi.apache.org/docs/writing_data#spark-datasource-writer) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy). + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + conf: spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false + datalake_formats: hudi +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key='user_id', + file_format='hudi', + hudi_options={ + 'hoodie.datasource.write.precombine.field': 'eventtime', + } +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + +#### Delta + +You can also use Delta Lake to be able to use merge feature on tables. 
+ +**Usage notes:** The `merge` with Delta incremental strategy requires: +- To add `file_format: delta` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: delta` + - Alternatively, to add a connection in your profile: `connections: name_of_your_delta_connector` +- To add the following config in your Interactive Session Config (in your profile): `conf: "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` + +**Athena:** Athena is not compatible by default with delta tables, but you can configure the adapter to create Athena tables on top of your delta table. To do so, you need to configure the two following options in your profile: +- For Delta Lake 2.1.0 supported natively in Glue 4.0: `extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-2.1.0.jar"` +- For Delta Lake 1.0.0 supported natively in Glue 3.0: `extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-1.0.0.jar"` +- `delta_athena_prefix: "the_prefix_of_your_choice"` +- If your table is partitioned, then the addition of new partition is not automatic, you need to perform an `MSCK REPAIR TABLE your_delta_table` after each new partition adding + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + datalake_formats: delta + conf: "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" + extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-2.1.0.jar" + delta_athena_prefix: "delta" +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key='user_id', + partition_by=['dt'], + file_format='delta' +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen, + current_date() as dt + +from events +group by 1 +``` + +#### Iceberg + +**Usage notes:** The `merge` with Iceberg incremental strategy requires: +- To attach the AmazonEC2ContainerRegistryReadOnly Manged policy to your execution role : +- To add the following policy to your execution role to enable commit locking in a dynamodb table (more info [here](https://iceberg.apache.org/docs/latest/aws/#dynamodb-lock-manager)). Note that the DynamoDB table specified in the resource field of this policy should be the one that is mentioned in your dbt profiles (`--conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable`). By default, this table is named `myGlueLockTable` and is created automatically (with On-Demand Pricing) when running a dbt-glue model with Incremental Materialization and Iceberg file format. If you want to name the table differently or to create your own table without letting Glue do it on your behalf, please provide the `iceberg_glue_commit_lock_table` parameter with your table name (eg. `MyDynamoDbTable`) in your dbt profile. 
+```yaml +iceberg_glue_commit_lock_table: "MyDynamoDbTable" +``` +- the latest connector for iceberg in AWS marketplace uses Ver 0.14.0 for Glue 3.0, and Ver 1.2.1 for Glue 4.0 where Kryo serialization fails when writing iceberg, use "org.apache.spark.serializer.JavaSerializer" for spark.serializer instead, more info [here](https://github.com/apache/iceberg/pull/546) + +Make sure you update your conf with `--conf spark.sql.catalog.glue_catalog.lock.table=` and, you change the below iam permission with your correct table name. +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CommitLockTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:BatchGetItem", + "dynamodb:BatchWriteItem", + "dynamodb:ConditionCheckItem", + "dynamodb:PutItem", + "dynamodb:DescribeTable", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan", + "dynamodb:Query", + "dynamodb:UpdateItem" + ], + "Resource": "arn:aws:dynamodb:::table/myGlueLockTable" + } + ] +} +``` +- To add `file_format: Iceberg` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: iceberg` + - Alternatively, to add connections in your profile: `connections: name_of_your_iceberg_connector` ( + - For Athena version 3: + - The adapter is compatible with the Iceberg Connector from AWS Marketplace with Glue 3.0 as Fulfillment option and 0.14.0 (Oct 11, 2022) as Software version) + - the latest connector for iceberg in AWS marketplace uses Ver 0.14.0 for Glue 3.0, and Ver 1.2.1 for Glue 4.0 where Kryo serialization fails when writing iceberg, use "org.apache.spark.serializer.JavaSerializer" for spark.serializer instead, more info [here](https://github.com/apache/iceberg/pull/546) + - For Athena version 2: The adapter is compatible with the Iceberg Connector from AWS Marketplace with Glue 3.0 as Fulfillment option and 0.12.0-2 (Feb 14, 2022) as Software version) +- To add the following config in your Interactive Session Config (in your profile): +```--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer + --conf spark.sql.warehouse=s3:// + --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog + --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog + --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO + --conf spark.sql.catalog.glue_catalog.lock-impl=org.apache.iceberg.aws.dynamodb.DynamoDbLockManager + --conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +``` + - For Glue 3.0, set `spark.sql.catalog.glue_catalog.lock-impl` to `org.apache.iceberg.aws.glue.DynamoLockManager` instead + +dbt will run an [atomic `merge` statement](https://iceberg.apache.org/docs/latest/spark-writes/) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. You need to provide a `unique_key` to perform merge operation otherwise it will fail. This key is to provide in a Python list format and can contains multiple column name to create a composite unique_key. + +##### Notes +- When using a custom_location in Iceberg, avoid to use final trailing slash. Adding a final trailing slash lead to an un-proper handling of the location, and issues when reading the data from query engines like Trino. The issue should be fixed for Iceberg version > 0.13. 
The related GitHub issue can be found [here](https://github.com/apache/iceberg/issues/4582). +- Iceberg also supports `insert_overwrite` and `append` strategies. +- The `warehouse` conf must be provided, but it's overwritten by the adapter `location` in your profile or `custom_location` in the model configuration. +- By default, this materialization has `iceberg_expire_snapshots` set to 'True'; if you need historical, auditable changes, set `iceberg_expire_snapshots='False'`. +- Currently, due to dbt internals, the Iceberg catalog used when running Glue interactive sessions with dbt-glue has the hardcoded name `glue_catalog`. This name is an alias pointing to the AWS Glue Catalog but is specific to each session. If you want to interact with your data in another session without using dbt-glue (from a Glue Studio notebook, for example), you can configure another alias (that is, another name for the Iceberg catalog). To illustrate this concept, you can set the following in your configuration file: +``` +--conf spark.sql.catalog.RandomCatalogName=org.apache.iceberg.spark.SparkCatalog +``` +And then run a session in an AWS Glue Studio notebook with the following config: +``` +--conf spark.sql.catalog.AnotherRandomCatalogName=org.apache.iceberg.spark.SparkCatalog +``` +In both cases, the underlying catalog would be the AWS Glue Catalog, unique in your AWS account and Region, and you would be able to work with the exact same data. Also make sure that if you change the name of the Glue Catalog alias, you change it in all the other `--conf` settings where it's used: +``` + --conf spark.sql.catalog.RandomCatalogName=org.apache.iceberg.spark.SparkCatalog + --conf spark.sql.catalog.RandomCatalogName.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog + ... + --conf spark.sql.catalog.RandomCatalogName.lock-impl=org.apache.iceberg.aws.glue.DynamoLockManager +``` +- A full reference to `table_properties` can be found [here](https://iceberg.apache.org/docs/latest/configuration/). +- Iceberg tables are natively supported by Athena, so you can query tables created and operated with the dbt-glue adapter from Athena. +- Incremental materialization with the Iceberg file format supports dbt snapshots: you can run a dbt snapshot command that queries an Iceberg table and creates a dbt-fashioned snapshot of it.
+ +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + datalake_formats: iceberg + conf: --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.warehouse=s3://aws-dbt-glue-datalake-1234567890-eu-west-1/dbt_test_project --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.lock-impl=org.apache.iceberg.aws.dynamodb.DynamoDbLockManager --conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key=['user_id'], + file_format='iceberg', + iceberg_expire_snapshots='False', + partition_by=['status'] + table_properties={'write.target-file-size-bytes': '268435456'} +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` +#### Iceberg Snapshot source code example +```sql + +{% snapshot demosnapshot %} + +{{ + config( + strategy='timestamp', + target_schema='jaffle_db', + updated_at='dt', + file_format='iceberg' +) }} + +select * from {{ ref('customers') }} + +{% endsnapshot %} + +``` + +## Monitoring your Glue Interactive Session + +Monitoring is an important part of maintaining the reliability, availability, +and performance of AWS Glue and your other AWS solutions. AWS provides monitoring +tools that you can use to watch AWS Glue, identify the required number of workers +required for your Glue Interactive Session, report when something is wrong and +take action automatically when appropriate. AWS Glue provides Spark UI, +and CloudWatch logs and metrics for monitoring your AWS Glue jobs. +More information on: [Monitoring AWS Glue Spark jobs](https://docs.aws.amazon.com/glue/latest/dg/monitor-spark.html) + +**Usage notes:** Monitoring requires: +- To add the following IAM policy to your IAM role: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudwatchMetrics", + "Effect": "Allow", + "Action": "cloudwatch:PutMetricData", + "Resource": "*", + "Condition": { + "StringEquals": { + "cloudwatch:namespace": "Glue" + } + } + }, + { + "Sid": "CloudwatchLogs", + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "logs:CreateLogStream", + "logs:CreateLogGroup", + "logs:PutLogEvents" + ], + "Resource": [ + "arn:aws:logs:*:*:/aws-glue/*", + "arn:aws:s3:::bucket-to-write-sparkui-logs/*" + ] + } + ] +} +``` + +- To add monitoring parameters in your Interactive Session Config (in your profile). 
+More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + default_arguments: "--enable-metrics=true, --enable-continuous-cloudwatch-log=true, --enable-continuous-log-filter=true, --enable-spark-ui=true, --spark-event-logs-path=s3://bucket-to-write-sparkui-logs/dbt/" +``` + +If you want to use the Spark UI, you can launch the Spark history server using a +AWS CloudFormation template that hosts the server on an EC2 instance, +or launch locally using Docker. More information on [Launching the Spark history server](https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-history.html#monitor-spark-ui-history-local) + +## Enabling AWS Glue Auto Scaling +Auto Scaling is available since AWS Glue version 3.0 or later. More information +on the following AWS blog post: ["Introducing AWS Glue Auto Scaling: Automatically resize serverless computing resources for lower cost with optimized Apache Spark"](https://aws.amazon.com/blogs/big-data/introducing-aws-glue-auto-scaling-automatically-resize-serverless-computing-resources-for-lower-cost-with-optimized-apache-spark/) + +With Auto Scaling enabled, you will get the following benefits: + +* AWS Glue automatically adds and removes workers from the cluster depending on the parallelism at each stage or microbatch of the job run. + +* It removes the need for you to experiment and decide on the number of workers to assign for your AWS Glue Interactive sessions. + +* Once you choose the maximum number of workers, AWS Glue will choose the right size resources for the workload. +* You can see how the size of the cluster changes during the Glue Interactive sessions run by looking at CloudWatch metrics. +More information on [Monitoring your Glue Interactive Session](#Monitoring-your-Glue-Interactive-Session). + +**Usage notes:** AWS Glue Auto Scaling requires: +- To set your AWS Glue version 3.0 or later. +- To set the maximum number of workers (if Auto Scaling is enabled, the `workers` +parameter sets the maximum number of workers) +- To set the `--enable-auto-scaling=true` parameter on your Glue Interactive Session Config (in your profile). +More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "3.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + default_arguments: "--enable-auto-scaling=true" +``` + +## Access Glue catalog in another AWS account +In many cases, you may need to run you dbt jobs to read from another AWS account. 
+ +Review the following link https://repost.aws/knowledge-center/glue-tables-cross-accounts to set up access policies in source and target accounts + +Add the following `"spark.hadoop.hive.metastore.glue.catalogid="` to your conf in the DBT profile, as such, you can have multiple outputs for each of the accounts that you have access to. + +Note: The access cross-accounts need to be within the same AWS Region +#### Profile config example +```yaml +test_project: + target: dev + outputsAccountB: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "3.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + conf: "--conf hive.metastore.client.factory.class=com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory + --conf spark.hadoop.hive.metastore.glue.catalogid=" +``` + +## Persisting model descriptions + +Relation-level docs persistence is supported since dbt v0.17.0. For more +information on configuring docs persistence, see [the docs](/reference/resource-configs/persist_docs). + +When the `persist_docs` option is configured appropriately, you'll be able to +see model descriptions in the `Comment` field of `describe [table] extended` +or `show table extended in [database] like '*'`. + +## Always `schema`, never `database` + +Apache Spark uses the terms "schema" and "database" interchangeably. dbt understands +`database` to exist at a higher level than `schema`. As such, you should _never_ +use or set `database` as a node config or in the target profile when running dbt-glue. + +If you want to control the schema/database in which dbt will materialize models, +use the `schema` config and `generate_schema_name` macro _only_. +For more information, check the dbt documentation about [custom schemas](https://docs.getdbt.com/docs/build/custom-schemas). + +## AWS Lakeformation integration +The adapter supports AWS Lake Formation tags management enabling you to associate existing tags defined out of dbt-glue to database objects built by dbt-glue (database, table, view, snapshot, incremental models, seeds). + +- You can enable or disable lf-tags management via config, at model and dbt-project level (disabled by default) +- If enabled, lf-tags will be updated on every dbt run. There are table level lf-tags configs and column-level lf-tags configs. +- You can specify that you want to drop existing database, table column Lake Formation tags by setting the drop_existing config field to True (False by default, meaning existing tags are kept) +- Please note that if the tag you want to associate with the table does not exist, the dbt-glue execution will throw an error + +The adapter also supports AWS Lakeformation data cell filtering. +- You can enable or disable data-cell filtering via config, at model and dbt-project level (disabled by default) +- If enabled, data_cell_filters will be updated on every dbt run. +- You can specify that you want to drop existing table data-cell filters by setting the drop_existing config field to True (False by default, meaning existing filters are kept) +- You can leverage excluded_columns_names **OR** columns config fields to perform Column level security as well. **Please note that you can use one or the other but not both**. 
+- By default, if you don't specify any column or excluded_columns, dbt-glue does not perform Column level filtering and let the principal access all the columns. + +The below configuration let the specified principal (lf-data-scientist IAM user) access rows that have a customer_lifetime_value > 15 and all the columns specified ('customer_id', 'first_order', 'most_recent_order', 'number_of_orders') + +```sql +lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'column_names': ['customer_id', 'first_order', 'most_recent_order', 'number_of_orders'] + } + }, + } + } +``` +The below configuration let the specified principal (lf-data-scientist IAM user) access rows that have a customer_lifetime_value > 15 and all the columns *except* the one specified ('first_name') + +```sql +lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'excluded_column_names': ['first_name'] + } + }, + } + } +``` + +See below some examples of how you can integrate LF Tags management and data cell filtering to your configurations : + +#### At model level +This way of defining your Lakeformation rules is appropriate if you want to handle the tagging and filtering policy at object level. Remember that it overrides any configuration defined at dbt-project level. + +```sql +{{ config( + materialized='incremental', + unique_key="customer_id", + incremental_strategy='append', + lf_tags_config={ + 'enabled': true, + 'drop_existing' : False, + 'tags_database': + { + 'name_of_my_db_tag': 'value_of_my_db_tag' + }, + 'tags_table': + { + 'name_of_my_table_tag': 'value_of_my_table_tag' + }, + 'tags_columns': { + 'name_of_my_lf_tag': { + 'value_of_my_tag': ['customer_id', 'customer_lifetime_value', 'dt'] + }}}, + lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'excluded_column_names': ['first_name'] + } + }, + } + } +) }} + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order, + customer_orders.most_recent_order, + customer_orders.number_of_orders, + customer_payments.total_amount as customer_lifetime_value, + current_date() as dt + + from customers + + left join customer_orders using (customer_id) + + left join customer_payments using (customer_id) + +``` + +#### At dbt-project level +This way you can specify tags and data filtering policy for a particular path in your dbt project (eg. models, seeds, models/model_group1, etc.) +This is especially useful for seeds, for which you can't define configuration in the file directly. + +```yml +seeds: + +lf_tags_config: + enabled: true + tags_table: + name_of_my_table_tag: 'value_of_my_table_tag' + tags_database: + name_of_my_database_tag: 'value_of_my_database_tag' +models: + +lf_tags_config: + enabled: true + drop_existing: True + tags_database: + name_of_my_database_tag: 'value_of_my_database_tag' + tags_table: + name_of_my_table_tag: 'value_of_my_table_tag' +``` + +## Tests + +To perform a functional test: +1. 
Install dev requirements: +```bash +$ pip3 install -r dev-requirements.txt +``` + +2. Install dev locally +```bash +$ python3 setup.py build && python3 setup.py install_lib +``` + +3. Export variables +```bash +$ export DBT_S3_LOCATION=s3://mybucket/myprefix +$ export DBT_ROLE_ARN=arn:aws:iam::1234567890:role/GlueInteractiveSessionRole +``` + +4. Run the test +```bash +$ python3 -m pytest tests/functional +``` + +For more information, check the dbt documentation about [testing a new adapter](https://docs.getdbt.com/docs/contributing/testing-a-new-adapter). ## Caveats @@ -269,6 +1046,7 @@ Most dbt Core functionality is supported, but some features are only available w Apache Hudi-only features: 1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](/reference/resource-configs/glue-configs#the-merge-strategy)) + Some dbt features, available on the core adapters, are not yet supported on Glue: 1. [Persisting](/reference/resource-configs/persist_docs) column-level descriptions as database comments 2. [Snapshots](/docs/build/snapshots) diff --git a/website/docs/docs/core/connect-data-platform/oracle-setup.md b/website/docs/docs/core/connect-data-platform/oracle-setup.md index f601709654b..b1195fbd0a0 100644 --- a/website/docs/docs/core/connect-data-platform/oracle-setup.md +++ b/website/docs/docs/core/connect-data-platform/oracle-setup.md @@ -455,27 +455,6 @@ dbt_test:
    - - - -```yaml -dbt_test: - target: "{{ env_var('DBT_TARGET', 'dev') }}" - outputs: - dev: - type: oracle - user: "{{ env_var('DBT_ORACLE_USER') }}" - pass: "{{ env_var('DBT_ORACLE_PASSWORD') }}" - protocol: "tcps" - host: "{{ env_var('DBT_ORACLE_HOST') }}" - port: 1522 - service: "{{ env_var('DBT_ORACLE_SERVICE') }}" - database: "{{ env_var('DBT_ORACLE_DATABASE') }}" - schema: "{{ env_var('DBT_ORACLE_SCHEMA') }}" - threads: 4 -``` - -
    diff --git a/website/docs/docs/core/connect-data-platform/postgres-setup.md b/website/docs/docs/core/connect-data-platform/postgres-setup.md index 5d7467c786d..f56d3f22576 100644 --- a/website/docs/docs/core/connect-data-platform/postgres-setup.md +++ b/website/docs/docs/core/connect-data-platform/postgres-setup.md @@ -88,33 +88,23 @@ The `search_path` config controls the Postgres "search path" that dbt configures #### role - Added in v0.16.0 - The `role` config controls the Postgres role that dbt assumes when opening new connections to the database. #### sslmode - Added in v0.16.0 - The `sslmode` config controls how dbt connectes to Postgres databases using SSL. See [the Postgres docs](https://www.postgresql.org/docs/9.1/libpq-ssl.html) on `sslmode` for usage information. When unset, dbt will connect to databases using the Postgres default, `prefer`, as the `sslmode`. #### sslcert - Added in v0.21.0 - The `sslcert` config controls the location of the certificate file used to connect to Postgres when using client SSL connections. To use a certificate file that is not in the default location, set that file path using this value. Without this config set, dbt uses the Postgres default locations. See [Client Certificates](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT) in the Postgres SSL docs for the default paths. #### sslkey - Added in v0.21.0 - The `sslkey` config controls the location of the private key for connecting to Postgres using client SSL connections. If this config is omitted, dbt uses the default key location for Postgres. See [Client Certificates](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT) in the Postgres SSL docs for the default locations. #### sslrootcert - Added in v0.21.0 - When connecting to a Postgres server using a client SSL connection, dbt verifies that the server provides an SSL certificate signed by a trusted root certificate. These root certificates are in the `~/.postgresql/root.crt` file by default. To customize the location of this file, set the `sslrootcert` config value to a new file path. ### `keepalives_idle` diff --git a/website/docs/docs/core/connect-data-platform/profiles.yml.md b/website/docs/docs/core/connect-data-platform/profiles.yml.md index 67b0eb15fbe..97254dda1c4 100644 --- a/website/docs/docs/core/connect-data-platform/profiles.yml.md +++ b/website/docs/docs/core/connect-data-platform/profiles.yml.md @@ -3,7 +3,7 @@ title: "About profiles.yml" id: profiles.yml --- -If you're using dbt from the [command line (CLI)](/docs/core/about-the-cli), you'll need a `profiles.yml` file that contains the connection details for your data platform. When you run dbt from the CLI, it reads your `dbt_project.yml` file to find the `profile` name, and then looks for a profile with the same name in your `profiles.yml` file. This profile contains all the information dbt needs to connect to your data platform. +If you're using [dbt Core](/docs/core/about-dbt-core), you'll need a `profiles.yml` file that contains the connection details for your data platform. When you run dbt Core from the command line, it reads your `dbt_project.yml` file to find the `profile` name, and then looks for a profile with the same name in your `profiles.yml` file. This profile contains all the information dbt needs to connect to your data platform. For detailed info, you can refer to the [Connection profiles](/docs/core/connect-data-platform/connection-profiles). 
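+As a minimal sketch of how the two files line up (the project/profile name `jaffle_shop` and the Redshift connection values below are illustrative placeholders, not defaults):
+
+```yaml
+# dbt_project.yml (abbreviated)
+name: jaffle_shop
+profile: jaffle_shop  # dbt looks up this name in profiles.yml
+```
+
+```yaml
+# profiles.yml
+jaffle_shop:
+  target: dev
+  outputs:
+    dev:
+      type: redshift
+      host: hostname.region.redshift.amazonaws.com
+      user: username
+      password: password1
+      port: 5439
+      dbname: analytics
+      schema: analytics
+      threads: 4
+```
+
+The key point is that the value of `profile` in `dbt_project.yml` must match a top-level key in `profiles.yml`; the fields under each output depend on the adapter you're connecting with.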
diff --git a/website/docs/docs/core/connect-data-platform/redshift-setup.md b/website/docs/docs/core/connect-data-platform/redshift-setup.md index a86bc7df849..006f026ea94 100644 --- a/website/docs/docs/core/connect-data-platform/redshift-setup.md +++ b/website/docs/docs/core/connect-data-platform/redshift-setup.md @@ -46,10 +46,59 @@ pip is the easiest way to install the adapter:

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}.

    +## Configurations -## Authentication Methods +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `type` | redshift | The type of data warehouse you are connecting to| +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `port` | 5439 | | +| `dbname` | my_db | Database name| +| `schema` | my_schema | Schema name| +| `connect_timeout` | `None` or 30 | Number of seconds before connection times out| +| `sslmode` | prefer | optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. For more information on `sslmode`, see Redshift note below| +| `role` | None | Optional, user identifier of the current session| +| `autocreate` | false | Optional, default false. Creates user if they do not exist | +| `db_groups` | ['ANALYSTS'] | Optional. A list of existing database group names that the DbUser joins for the current session | +| `ra3_node` | true | Optional, default False. Enables cross-database sources| +| `autocommit` | true | Optional, default True. Enables autocommit after each statement| +| `retries` | 1 | Number of retries | -### Password-based authentication + +## Authentication Parameters + +The authentication methods that dbt Core supports are: + +- `database` — Password-based authentication (default, will be used if `method` is not provided) +- `IAM` — IAM + +For dbt Cloud users, log in using the default **Database username** and **password**. This is necessary because dbt Cloud does not support `IAM` authentication. + +Click on one of these authentication methods for further details on how to configure your connection profile. Each tab also includes an example `profiles.yml` configuration file for you to review. + + + + + +The following table contains the parameters for the database (password-based) connection method. + + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` | database| Leave this parameter unconfigured, or set this to database | +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `user` | username | Account username to log into your cluster | +| `password` | password1 | Password for authentication | + +
    + +#### Example profiles.yml for database authentication @@ -62,26 +111,29 @@ company-name: host: hostname.region.redshift.amazonaws.com user: username password: password1 - port: 5439 dbname: analytics schema: analytics + port: 5439 + + # Optional Redshift configs: + sslmode: prefer + role: None + ra3_node: true + autocommit: true threads: 4 - connect_timeout: None # optional, number of seconds before connection times out - # search_path: public # optional, not recommended - sslmode: prefer # optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. - role: # optional - ra3_node: true # enables cross-database sources - autocommit: true # enables autocommit after each statement - region: # optional + connect_timeout: None + ``` -### IAM Authentication +
    -To set up a Redshift profile using IAM Authentication, set the `method` -parameter to `iam` as shown below. Note that a password is not required when -using IAM Authentication. For more information on this type of authentication, + + +The following table lists the authentication parameters to use IAM authentication. + +To set up a Redshift profile using IAM Authentication, set the `method` parameter to `iam` as shown below. Note that a password is not required when using IAM Authentication. For more information on this type of authentication, consult the [Redshift Documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/generating-user-credentials.html) and [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.get_cluster_credentials) @@ -92,10 +144,25 @@ Authentication, then your aws credentials are likely misconfigured. Try running `aws configure` to set up AWS access keys, and pick a default region. If you have any questions, please refer to the official AWS documentation on [Configuration and credential file settings](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` |IAM| use IAM to authenticate | +| `iam_profile` | analyst | dbt will use the specified profile from your ~/.aws/config file | +| `cluster_id` | CLUSTER_ID| Required for IAM | +| `user` | username | Account user to log into your cluster | +| `region` | us-east-1 | Required for IAM authentication | + + +
    + + +#### Example profiles.yml for IAM + ```yaml -my-redshift-db: + my-redshift-db: target: dev outputs: dev: @@ -104,26 +171,32 @@ my-redshift-db: cluster_id: CLUSTER_ID host: hostname.region.redshift.amazonaws.com user: alice - iam_profile: data_engineer # optional - autocreate: true # optional - db_groups: ['ANALYSTS'] # optional - - # Other Redshift configs: - port: 5439 + iam_profile: analyst dbname: analytics schema: analytics + port: 5439 + + # Optional Redshift configs: threads: 4 - connect_timeout: None # optional, number of seconds before connection times out - [retries](#retries): 1 # default 1 retry on error/timeout when opening connections - role: # optional - sslmode: prefer # optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. - ra3_node: true # enables cross-database sources - autocommit: true # optional, enables autocommit after each statement - region: # optional + connect_timeout: None + [retries](#retries): 1 + role: None + sslmode: prefer + ra3_node: true + autocommit: true + region: us-east-1 + autocreate: true + db_groups: ['ANALYSTS'] + ``` +
    + +
    + + ### Specifying an IAM Profile When the `iam_profile` configuration is set, dbt will use the specified profile from your `~/.aws/config` file instead of using the profile name `default` diff --git a/website/docs/docs/core/connect-data-platform/snowflake-setup.md b/website/docs/docs/core/connect-data-platform/snowflake-setup.md index 6bc9c980922..98bcf447fed 100644 --- a/website/docs/docs/core/connect-data-platform/snowflake-setup.md +++ b/website/docs/docs/core/connect-data-platform/snowflake-setup.md @@ -124,7 +124,7 @@ Along with adding the `authenticator` parameter, be sure to run `alter account s To use key pair authentication, omit a `password` and instead provide a `private_key_path` and, optionally, a `private_key_passphrase` in your target. **Note:** Versions of dbt before 0.16.0 required that private keys were encrypted and a `private_key_passphrase` was provided. This behavior was changed in dbt v0.16.0. -Starting from [dbt v1.5.0](/docs/dbt-versions/core), you have the option to use a `private_key` string instead of a `private_key_path`. The `private_key` string should be in Base64-encoded DER format, representing the key bytes. Refer to [Snowflake documentation](https://docs.snowflake.com/developer-guide/python-connector/python-connector-example#using-key-pair-authentication-key-pair-rotation) for more info on how they generate the key. +Starting from [dbt v1.5.0](/docs/dbt-versions/core), you have the option to use a `private_key` string instead of a `private_key_path`. The `private_key` string should be in either Base64-encoded DER format, representing the key bytes, or a plain-text PEM format. Refer to [Snowflake documentation](https://docs.snowflake.com/developer-guide/python-connector/python-connector-example#using-key-pair-authentication-key-pair-rotation) for more info on how they generate the key. @@ -163,9 +163,13 @@ my-snowflake-db: ### SSO Authentication -To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. `authenticator` can be one of 'externalbrowser' or a valid Okta URL. +To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. +`authenticator` can be one of 'externalbrowser' or a valid Okta URL. -**Note**: By default, every connection that dbt opens will require you to re-authenticate in a browser. The Snowflake connector package supports caching your session token, but it [currently only supports Windows and Mac OS](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-use.html#optional-using-connection-caching-to-minimize-the-number-of-prompts-for-authentication). See [the Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for how to enable this feature in your account. 
+Refer to the following tabs for more info and examples: + + + @@ -175,15 +179,15 @@ my-snowflake-db: outputs: dev: type: snowflake - account: [account id] - user: [username] - role: [user role] + account: [account id] # Snowflake + user: [username] # Snowflake username + role: [user role] # Snowflake user role # SSO config authenticator: externalbrowser - database: [database name] - warehouse: [warehouse name] + database: [database name] # Snowflake database name + warehouse: [warehouse name] # Snowflake warehouse name schema: [dbt schema] threads: [between 1 and 8] client_session_keep_alive: False @@ -199,6 +203,50 @@ my-snowflake-db: + + + + + + +```yaml +my-snowflake-db: + target: dev + outputs: + dev: + type: snowflake + account: [account id] # Snowflake + user: [username] # Snowflake username + role: [user role] # Snowflake user role + + # SSO config -- The three following fields are REQUIRED + authenticator: [Okta account URL] + username: [Okta username] + password: [Okta password] + + database: [database name] # Snowflake database name + warehouse: [warehouse name] # Snowflake warehouse name + schema: [dbt schema] + threads: [between 1 and 8] + client_session_keep_alive: False + query_tag: [anything] + + # optional + connect_retries: 0 # default 0 + connect_timeout: 10 # default: 10 + retry_on_database_errors: False # default: false + retry_all: False # default: false + reuse_connections: False # default: false +``` + + + + + + +**Note**: By default, every connection that dbt opens will require you to re-authenticate in a browser. The Snowflake connector package supports caching your session token, but it [currently only supports Windows and Mac OS](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-use.html#optional-using-connection-caching-to-minimize-the-number-of-prompts-for-authentication). + +Refer to the [Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for info on how to enable this feature in your account. ## Configurations @@ -224,7 +272,7 @@ The "base" configs for Snowflake targets are shown below. Note that you should a | reuse_connections | No | A boolean flag indicating whether to reuse idle connections to help reduce total connections opened. Default is `False`. | ### account -For AWS accounts in the US West default region, you can use `abc123` (without any other segments). For some AWS accounts you will have to append the region and/or cloud platform. For example, `abc123.eu-west-1` or `abc123.eu-west-2.aws`. For GCP and Azure-based accounts, you have to append the region and cloud platform, such as `gcp` or `azure`, respectively. For example, `abc123.us-central1.gcp`. For details, see Snowflake's documentation: "[Specifying Region Information in Your Account Hostname](https://docs.snowflake.com/en/user-guide/intro-regions.html#specifying-region-information-in-your-account-hostname)" and "[Account Identifier Formats by Cloud Platform and Region](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#account-identifier-formats-by-cloud-platform-and-region)". +For AWS accounts in the US West default region, you can use `abc123` (without any other segments). For some AWS accounts you will have to append the region and/or cloud platform. For example, `abc123.eu-west-1` or `abc123.eu-west-2.aws`. For GCP and Azure-based accounts, you have to append the region and cloud platform, such as `gcp` or `azure`, respectively. For example, `abc123.us-central1.gcp`. 
For details, see Snowflake's documentation: "[Specifying Region Information in Your Account Hostname](https://docs.snowflake.com/en/user-guide/intro-regions.html#specifying-region-information-in-your-account-hostname)". Please also note that the Snowflake account name should include only the account name itself, without the prefixed organization name. Relevant documentation: "[Account Identifier Formats by Cloud Platform and Region](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#account-identifier-formats-by-cloud-platform-and-region)". ### client_session_keep_alive diff --git a/website/docs/docs/core/connect-data-platform/spark-setup.md b/website/docs/docs/core/connect-data-platform/spark-setup.md index 2e3b5a66de8..895f0559953 100644 --- a/website/docs/docs/core/connect-data-platform/spark-setup.md +++ b/website/docs/docs/core/connect-data-platform/spark-setup.md @@ -57,15 +57,11 @@ $ pip install "dbt-spark[ODBC]" $ pip install "dbt-spark[PyHive]" ``` - - ```zsh # session connections $ pip install "dbt-spark[session]" ``` - -

    Configuring {frontMatter.meta.pypi_package}

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    @@ -80,7 +76,6 @@ dbt-spark can connect to Spark clusters by three different methods: - [`thrift`](#thrift) connects directly to the lead node of a cluster, either locally hosted / on premise or in the cloud (e.g. Amazon EMR). - [`http`](#http) is a more generic method for connecting to a managed service that provides an HTTP endpoint. Currently, this includes connections to a Databricks interactive cluster. - - [`session`](#session) connects to a pySpark session, running locally or on a remote machine. @@ -88,12 +83,9 @@ dbt-spark can connect to Spark clusters by three different methods: The `session` connection method is intended for advanced users and experimental dbt development. This connection method is not supported by dbt Cloud. ::: - ### ODBC -New in v0.18.1 - Use the `odbc` connection method if you are connecting to a Databricks SQL endpoint or interactive cluster via ODBC driver. (Download the latest version of the official driver [here](https://databricks.com/spark/odbc-driver-download).) @@ -119,9 +111,7 @@ your_profile_name: port: [port] # default 443 user: [user] server_side_parameters: - # cluster configuration parameters, otherwise applied via `SET` statements - # for example: - # "spark.databricks.delta.schema.autoMerge.enabled": True + "spark.driver.memory": "4g" ``` @@ -148,6 +138,8 @@ your_profile_name: auth: [e.g. KERBEROS] kerberos_service_name: [e.g. hive] use_ssl: [true|false] # value of hive.server2.use.SSL, default false + server_side_parameters: + "spark.driver.memory": "4g" ```
    @@ -176,6 +168,8 @@ your_profile_name: user: [user] connect_timeout: 60 # default 10 connect_retries: 5 # default 0 + server_side_parameters: + "spark.driver.memory": "4g" ``` @@ -184,8 +178,6 @@ Databricks interactive clusters can take several minutes to start up. You may include the optional profile configs `connect_timeout` and `connect_retries`, and dbt will periodically retry the connection. - - ### Session Use the `session` method if you want to run `dbt` against a pySpark session. @@ -201,14 +193,12 @@ your_profile_name: method: session schema: [database/schema name] host: NA # not used, but required by `dbt-core` + server_side_parameters: + "spark.driver.memory": "4g" ``` - - - - ## Optional configurations ### Retries @@ -227,6 +217,12 @@ connect_retries: 3 + + + +### Server side configuration + +Spark can be customized using [Application Properties](https://spark.apache.org/docs/latest/configuration.html). Using these properties the execution can be customized, for example, to allocate more memory to the driver process. Also, the Spark SQL runtime can be set through these properties. For example, this allows the user to [set a Spark catalogs](https://spark.apache.org/docs/latest/configuration.html#spark-sql). ## Caveats diff --git a/website/docs/docs/core/connect-data-platform/starrocks-setup.md b/website/docs/docs/core/connect-data-platform/starrocks-setup.md new file mode 100644 index 00000000000..e5c1abac037 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/starrocks-setup.md @@ -0,0 +1,103 @@ +--- +title: "Starrocks setup" +description: "Read this guide to learn about the Starrocks warehouse setup in dbt." +id: "starrocks-setup" +meta: + maintained_by: Starrocks + authors: Astralidea + github_repo: 'StarRocks/starrocks/tree/main/contrib/dbt-connector' + pypi_package: 'dbt-starrocks' + min_core_version: 'v1.6.2' + min_supported_version: 'Starrocks 2.5' + cloud_support: Not Supported + slack_channel_name: '#db-starrocks' + slack_channel_link: 'https://www.getdbt.com/community' + platform_name: 'Starrocks' + config_page: '/reference/resource-configs/starrocks-configs' +--- + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
• Maintained by: {frontMatter.meta.maintained_by}
• Authors: {frontMatter.meta.authors}
• GitHub repo: {frontMatter.meta.github_repo}
• PyPI package: {frontMatter.meta.pypi_package}
• Slack channel: {frontMatter.meta.slack_channel_name}
• Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
• dbt Cloud support: {frontMatter.meta.cloud_support}
• Minimum data platform version: {frontMatter.meta.min_supported_version}
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + + +## Authentication Methods + +### User / Password Authentication + +Starrocks can be configured using basic user/password authentication as shown below. + + + +```yaml +my-starrocks-db: + target: dev + outputs: + dev: + type: starrocks + host: localhost + port: 9030 + schema: analytics + + # User/password auth + username: your_starrocks_username + password: your_starrocks_password +``` + + + +#### Description of Profile Fields +| Option | Description | Required? | Example | +|----------|--------------------------------------------------------|-----------|--------------------------------| +| type | The specific adapter to use | Required | `starrocks` | +| host | The hostname to connect to | Required | `192.168.100.28` | +| port | The port to use | Required | `9030` | +| schema | Specify the schema (database) to build models into | Required | `analytics` | +| username | The username to use to connect to the server | Required | `dbt_admin` | +| password | The password to use for authenticating to the server | Required | `correct-horse-battery-staple` | +| version | Let Plugin try to go to a compatible starrocks version | Optional | `3.1.0` | + +## Supported features + +| Starrocks <= 2.5 | Starrocks 2.5 ~ 3.1 | Starrocks >= 3.1 | Feature | +|:----------------:|:--------------------:|:-----------------:|:---------------------------------:| +| ✅ | ✅ | ✅ | Table materialization | +| ✅ | ✅ | ✅ | View materialization | +| ❌ | ❌ | ✅ | Materialized View materialization | +| ❌ | ✅ | ✅ | Incremental materialization | +| ❌ | ✅ | ✅ | Primary Key Model | +| ✅ | ✅ | ✅ | Sources | +| ✅ | ✅ | ✅ | Custom data tests | +| ✅ | ✅ | ✅ | Docs generate | +| ❌ | ❌ | ❌ | Kafka | + +### Notice +1. When StarRocks Version < 2.5, `Create table as` can only set engine='OLAP' and table_type='DUPLICATE' +2. When StarRocks Version >= 2.5, `Create table as` supports table_type='PRIMARY' +3. When StarRocks Version < 3.1 distributed_by is required + +It is recommended to use the latest starrocks version and dbt-starrocks version for the best experience. \ No newline at end of file diff --git a/website/docs/docs/core/connect-data-platform/teradata-setup.md b/website/docs/docs/core/connect-data-platform/teradata-setup.md index 1fe33ff8929..85767edee72 100644 --- a/website/docs/docs/core/connect-data-platform/teradata-setup.md +++ b/website/docs/docs/core/connect-data-platform/teradata-setup.md @@ -4,7 +4,7 @@ description: "Read this guide to learn about the Teradata warehouse setup in dbt id: "teradata-setup" meta: maintained_by: Teradata - authors: Doug Beatty and Adam Tworkiewicz + authors: Teradata github_repo: 'Teradata/dbt-teradata' pypi_package: 'dbt-teradata' min_core_version: 'v0.21.0' @@ -41,6 +41,29 @@ pip is the easiest way to install the adapter:

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    +

    Python compatibility

    + +| Plugin version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | Python 3.11 | +| -------------- | ----------- | ----------- | ----------- | ----------- | ----------- | ------------ | +| 0.19.0.x | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ +| 0.20.0.x | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ +| 0.21.1.x | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ +| 1.0.0.x | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ +|1.1.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ +|1.2.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ +|1.3.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ +|1.4.x.x | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ +|1.5.x | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ +|1.6.x | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ + +

    dbt dependent packages version compatibility

    + +| dbt-teradata | dbt-core | dbt-teradata-util | dbt-util | +|--------------|------------|-------------------|----------------| +| 1.2.x | 1.2.x | 0.1.0 | 0.9.x or below | +| 1.6.7 | 1.6.7 | 1.1.1 | 1.1.1 | + +
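To make the pairing above concrete, a project pinned to dbt-teradata 1.6.7 might declare the `teradata/teradata_utils` hub package (covered later on this page) in its `packages.yml`. This is an illustrative sketch only; match the version to the row of the table that applies to your install.

```yaml
# packages.yml -- illustrative pin only, based on the compatibility table above.
packages:
  - package: teradata/teradata_utils
    version: 1.1.1
```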

    Configuring {frontMatter.meta.pypi_package}

For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration
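The parameter tables below add a number of optional connection settings. As a rough, hypothetical sketch (placeholder profile name, host, and credentials; illustrative values, not recommendations), a few of them might appear in a target like this:

```yaml
# Hypothetical Teradata target -- only fields discussed in the parameter tables below are shown.
my_teradata_profile:
  target: dev
  outputs:
    dev:
      type: teradata
      host: mydbhost.example.com
      user: dbt_user
      password: "{{ env_var('DBT_TERADATA_PASSWORD') }}"
      schema: dbt_demo
      threads: 4
      # Optional connection parameters (see the tables below)
      port: "1025"
      logmech: LDAP
      sslmode: "PREFER"
      retries: 3
```

Any parameter you omit falls back to the default listed in the tables below.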

    @@ -88,11 +111,15 @@ The plugin also supports the following optional connection parameters: Parameter | Default | Type | Description ----------------------- | ----------- | -------------- | --- `account` | | string | Specifies the database account. Equivalent to the Teradata JDBC Driver `ACCOUNT` connection parameter. +`browser` | | string | Specifies the command to open the browser for Browser Authentication, when logmech is BROWSER. Browser Authentication is supported for Windows and macOS. Equivalent to the Teradata JDBC Driver BROWSER connection parameter. +`browser_tab_timeout` | `"5"` | quoted integer | Specifies the number of seconds to wait before closing the browser tab after Browser Authentication is completed. The default is 5 seconds. The behavior is under the browser's control, and not all browsers support automatic closing of browser tabs. +`browser_timeout` | `"180"` | quoted integer | Specifies the number of seconds that the driver will wait for Browser Authentication to complete. The default is 180 seconds (3 minutes). `column_name` | `"false"` | quoted boolean | Controls the behavior of cursor `.description` sequence `name` items. Equivalent to the Teradata JDBC Driver `COLUMN_NAME` connection parameter. False specifies that a cursor `.description` sequence `name` item provides the AS-clause name if available, or the column name if available, or the column title. True specifies that a cursor `.description` sequence `name` item provides the column name if available, but has no effect when StatementInfo parcel support is unavailable. `connect_failure_ttl` | `"0"` | quoted integer | Specifies the time-to-live in seconds to remember the most recent connection failure for each IP address/port combination. The driver subsequently skips connection attempts to that IP address/port for the duration of the time-to-live. The default value of zero disables this feature. The recommended value is half the database restart time. Equivalent to the Teradata JDBC Driver `CONNECT_FAILURE_TTL` connection parameter. +`connect_timeout` | `"10000"` | quoted integer | Specifies the timeout in milliseconds for establishing a TCP socket connection. Specify 0 for no timeout. The default is 10 seconds (10000 milliseconds). `cop` | `"true"` | quoted boolean | Specifies whether COP Discovery is performed. Equivalent to the Teradata JDBC Driver `COP` connection parameter. `coplast` | `"false"` | quoted boolean | Specifies how COP Discovery determines the last COP hostname. Equivalent to the Teradata JDBC Driver `COPLAST` connection parameter. When `coplast` is `false` or omitted, or COP Discovery is turned off, then no DNS lookup occurs for the coplast hostname. When `coplast` is `true`, and COP Discovery is turned on, then a DNS lookup occurs for a coplast hostname. -`dbs_port` | `"1025"` | quoted integer | Specifies the database port number. Equivalent to the Teradata JDBC Driver `DBS_PORT` connection parameter. +`port` | `"1025"` | quoted integer | Specifies the database port number. Equivalent to the Teradata JDBC Driver `DBS_PORT` connection parameter. `encryptdata` | `"false"` | quoted boolean | Controls encryption of data exchanged between the driver and the database. Equivalent to the Teradata JDBC Driver `ENCRYPTDATA` connection parameter. `fake_result_sets` | `"false"` | quoted boolean | Controls whether a fake result set containing statement metadata precedes each real result set. `field_quote` | `"\""` | string | Specifies a single character string used to quote fields in a CSV file. 
@@ -102,11 +129,18 @@ Parameter | Default | Type | Description `lob_support` | `"true"` | quoted boolean | Controls LOB support. Equivalent to the Teradata JDBC Driver `LOB_SUPPORT` connection parameter. `log` | `"0"` | quoted integer | Controls debug logging. Somewhat equivalent to the Teradata JDBC Driver `LOG` connection parameter. This parameter's behavior is subject to change in the future. This parameter's value is currently defined as an integer in which the 1-bit governs function and method tracing, the 2-bit governs debug logging, the 4-bit governs transmit and receive message hex dumps, and the 8-bit governs timing. Compose the value by adding together 1, 2, 4, and/or 8. `logdata` | | string | Specifies extra data for the chosen logon authentication method. Equivalent to the Teradata JDBC Driver `LOGDATA` connection parameter. +`logon_timeout` | `"0"` | quoted integer | Specifies the logon timeout in seconds. Zero means no timeout. `logmech` | `"TD2"` | string | Specifies the logon authentication method. Equivalent to the Teradata JDBC Driver `LOGMECH` connection parameter. Possible values are `TD2` (the default), `JWT`, `LDAP`, `KRB5` for Kerberos, or `TDNEGO`. `max_message_body` | `"2097000"` | quoted integer | Specifies the maximum Response Message size in bytes. Equivalent to the Teradata JDBC Driver `MAX_MESSAGE_BODY` connection parameter. `partition` | `"DBC/SQL"` | string | Specifies the database partition. Equivalent to the Teradata JDBC Driver `PARTITION` connection parameter. +`request_timeout` | `"0"` | quoted integer | Specifies the timeout for executing each SQL request. Zero means no timeout. +`retries` | `0` | integer | Allows an adapter to automatically try again when the attempt to open a new connection on the database has a transient, infrequent error. This option can be set using the retries configuration. Default value is 0. The default wait period between connection attempts is one second. retry_timeout (seconds) option allows us to adjust this waiting period. +`runstartup` | "false" | quoted boolean | Controls whether the user's STARTUP SQL request is executed after logon. For more information, refer to User STARTUP SQL Request. Equivalent to the Teradata JDBC Driver RUNSTARTUP connection parameter. If retries is set to 3, the adapter will try to establish a new connection three times if an error occurs. +`sessions` | | quoted integer | Specifies the number of data transfer connections for FastLoad or FastExport. The default (recommended) lets the database choose the appropriate number of connections. Equivalent to the Teradata JDBC Driver SESSIONS connection parameter. `sip_support` | `"true"` | quoted boolean | Controls whether StatementInfo parcel is used. Equivalent to the Teradata JDBC Driver `SIP_SUPPORT` connection parameter. +`sp_spl` | `"true"` | quoted boolean | Controls whether stored procedure source code is saved in the database when a SQL stored procedure is created. Equivalent to the Teradata JDBC Driver SP_SPL connection parameter. `sslca` | | string | Specifies the file name of a PEM file that contains Certificate Authority (CA) certificates for use with `sslmode` values `VERIFY-CA` or `VERIFY-FULL`. Equivalent to the Teradata JDBC Driver `SSLCA` connection parameter. +`sslcrc` | `"ALLOW"` | string | Equivalent to the Teradata JDBC Driver SSLCRC connection parameter. Values are case-insensitive.
    • ALLOW provides "soft fail" behavior such that communication failures are ignored during certificate revocation checking.
    • REQUIRE mandates that certificate revocation checking must succeed. `sslcapath` | | string | Specifies a directory of PEM files that contain Certificate Authority (CA) certificates for use with `sslmode` values `VERIFY-CA` or `VERIFY-FULL`. Only files with an extension of `.pem` are used. Other files in the specified directory are not used. Equivalent to the Teradata JDBC Driver `SSLCAPATH` connection parameter. `sslcipher` | | string | Specifies the TLS cipher for HTTPS/TLS connections. Equivalent to the Teradata JDBC Driver `SSLCIPHER` connection parameter. `sslmode` | `"PREFER"` | string | Specifies the mode for connections to the database. Equivalent to the Teradata JDBC Driver `SSLMODE` connection parameter.
    • `DISABLE` disables HTTPS/TLS connections and uses only non-TLS connections.
    • `ALLOW` uses non-TLS connections unless the database requires HTTPS/TLS connections.
    • `PREFER` uses HTTPS/TLS connections unless the database does not offer HTTPS/TLS connections.
    • `REQUIRE` uses only HTTPS/TLS connections.
    • `VERIFY-CA` uses only HTTPS/TLS connections and verifies that the server certificate is valid and trusted.
    • `VERIFY-FULL` uses only HTTPS/TLS connections, verifies that the server certificate is valid and trusted, and verifies that the server certificate matches the database hostname. @@ -124,6 +158,91 @@ For the full description of the connection parameters see https://github.com/Ter * `ephemeral` * `incremental` +#### Incremental Materialization +The following incremental materialization strategies are supported: +* `append` (default) +* `delete+insert` +* `merge` + +To learn more about dbt incremental strategies please check [the dbt incremental strategy documentation](https://docs.getdbt.com/docs/build/incremental-models#about-incremental_strategy). + ### Commands All dbt commands are supported. + +## Support for model contracts +Model contracts are not yet supported with dbt-teradata. + +## Support for `dbt-utils` package +`dbt-utils` package is supported through `teradata/teradata_utils` dbt package. The package provides a compatibility layer between `dbt_utils` and `dbt-teradata`. See [teradata_utils](https://hub.getdbt.com/teradata/teradata_utils/latest/) package for install instructions. + +### Cross DB macros +Starting with release 1.3, some macros were migrated from [teradata-dbt-utils](https://github.com/Teradata/dbt-teradata-utils) dbt package to the connector. See the table below for the macros supported from the connector. + +For using cross DB macros, teradata-utils as a macro namespace will not be used, as cross DB macros have been migrated from teradata-utils to Dbt-Teradata. + + +#### Compatibility + +| Macro Group | Macro Name | Status | Comment | +|:---------------------:|:-----------------------------:|:---------------------:|:----------------------------------------------------------------------:| +| Cross-database macros | current_timestamp | :white_check_mark: | custom macro provided | +| Cross-database macros | dateadd | :white_check_mark: | custom macro provided | +| Cross-database macros | datediff | :white_check_mark: | custom macro provided, see [compatibility note](#datediff) | +| Cross-database macros | split_part | :white_check_mark: | custom macro provided | +| Cross-database macros | date_trunc | :white_check_mark: | custom macro provided | +| Cross-database macros | hash | :white_check_mark: | custom macro provided, see [compatibility note](#hash) | +| Cross-database macros | replace | :white_check_mark: | custom macro provided | +| Cross-database macros | type_string | :white_check_mark: | custom macro provided | +| Cross-database macros | last_day | :white_check_mark: | no customization needed, see [compatibility note](#last_day) | +| Cross-database macros | width_bucket | :white_check_mark: | no customization + + +#### examples for cross DB macros + ##### replace + {{ dbt.replace("string_text_column", "old_chars", "new_chars") }} + {{ replace('abcgef', 'g', 'd') }} + + ##### date_trunc + {{ dbt.date_trunc("date_part", "date") }} + {{ dbt.date_trunc("DD", "'2018-01-05 12:00:00'") }} + + ##### datediff + `datediff` macro in teradata supports difference between dates. Differece between timestamps is not supported. + + ##### hash + + `Hash` macro needs an `md5` function implementation. Teradata doesn't support `md5` natively. You need to install a User Defined Function (UDF): + 1. Download the md5 UDF implementation from Teradata (registration required): https://downloads.teradata.com/download/extensibility/md5-message-digest-udf. + 1. Unzip the package and go to `src` directory. + 1. Start up `bteq` and connect to your database. + 1. 
Create database `GLOBAL_FUNCTIONS` that will host the UDF. You can't change the database name as it's hardcoded in the macro: + ```sql + CREATE DATABASE GLOBAL_FUNCTIONS AS PERMANENT = 60e6, SPOOL = 120e6; + ``` + 1. Create the UDF. Replace `` with your current database user: + ```sql + GRANT CREATE FUNCTION ON GLOBAL_FUNCTIONS TO ; + DATABASE GLOBAL_FUNCTIONS; + .run file = hash_md5.btq + ``` + 1. Grant permissions to run the UDF with grant option. + ```sql + GRANT EXECUTE FUNCTION ON GLOBAL_FUNCTIONS TO PUBLIC WITH GRANT OPTION; + ``` + ##### last_day + + `last_day` in `teradata_utils`, unlike the corresponding macro in `dbt_utils`, doesn't support `quarter` datepart. + +## Limitations + +### Transaction mode +Only ANSI transaction mode is supported. + +## Credits + +The adapter was originally created by [Doug Beatty](https://github.com/dbeatty10). Teradata took over the adapter in January 2022. We are grateful to Doug for founding the project and accelerating the integration of dbt + Teradata. + +## License + +The adapter is published using Apache-2.0 License. Refer to the [terms and conditions](https://github.com/dbt-labs/dbt-core/blob/main/License.md) to understand items such as creating derivative work and the support model. diff --git a/website/docs/docs/core/connect-data-platform/trino-setup.md b/website/docs/docs/core/connect-data-platform/trino-setup.md index 396634dc6e6..39d8ed8ab3f 100644 --- a/website/docs/docs/core/connect-data-platform/trino-setup.md +++ b/website/docs/docs/core/connect-data-platform/trino-setup.md @@ -83,7 +83,7 @@ The following profile fields are optional to set up. They let you configure your | Profile field | Example | Description | | ----------------------------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------- | | `threads` | `8` | How many threads dbt should use (default is `1`) | -| `roles` | `system: analyst` | Catalog roles | +| `roles` | `system: analyst` | Catalog roles can be set under the optional `roles` parameter using the following format: `catalog: role`. | | `session_properties` | `query_max_run_time: 4h` | Sets Trino session properties used in the connection. Execute `SHOW SESSION` to see available options | | `prepared_statements_enabled` | `true` or `false` | Enable usage of Trino prepared statements (used in `dbt seed` commands) (default: `true`) | | `retries` | `10` | Configure how many times all database operation is retried when connection issues arise (default: `3`) | diff --git a/website/docs/docs/core/connect-data-platform/upsolver-setup.md b/website/docs/docs/core/connect-data-platform/upsolver-setup.md new file mode 100644 index 00000000000..6b2f410fc07 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/upsolver-setup.md @@ -0,0 +1,85 @@ +--- +title: "Upsolver setup" +description: "Read this guide to learn how to configure Upsolver with dbt." +id: "upsolver-setup" +meta: + maintained_by: Upsolver Team + authors: Upsolver Team + github_repo: 'Upsolver/dbt-upsolver' + pypi_package: 'dbt-upsolver' + min_core_version: 'v1.5.0' + cloud_support: Not Supported + min_supported_version: 'n/a' + slack_channel_name: 'Upsolver Comunity' + slack_channel_link: 'https://join.slack.com/t/upsolvercommunity/shared_invite/zt-1zo1dbyys-hj28WfaZvMh4Z4Id3OkkhA' + platform_name: 'Upsolver' + config_page: '/reference/resource-configs/upsolver-configs' +pagination_next: null +--- + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
• Maintained by: {frontMatter.meta.maintained_by}
• Authors: {frontMatter.meta.authors}
• GitHub repo: {frontMatter.meta.github_repo}
• PyPI package: {frontMatter.meta.pypi_package}
• Slack channel: {frontMatter.meta.slack_channel_name}
• Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
• dbt Cloud support: {frontMatter.meta.cloud_support}
• Minimum data platform version: {frontMatter.meta.min_supported_version}
    +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + +## Authentication Methods + +### User / Token authentication + +Upsolver can be configured using basic user/token authentication as shown below. + + + +```yaml +my-upsolver-db: + target: dev + outputs: + dev: + type: upsolver + api_url: https://mt-api-prod.upsolver.com + + user: [username] + token: [token] + + database: [database name] + schema: [schema name] + threads: [1 or more] + + ``` + + + +## Configurations + +The configs for Upsolver targets are shown below. + +### All configurations + +| Config | Required? | Description | +| ------ | --------- | ----------- | +| token | Yes | The token to connect Upsolver [Upsolver's documentation](https://docs.upsolver.com/sqlake/api-integration) | +| user | Yes | The user to log in as | +| database | Yes | The database that dbt should create models in | +| schema | Yes | The schema to build models into by default | +| api_url | Yes | The API url to connect. Common value ```https://mt-api-prod.upsolver.com``` | diff --git a/website/docs/docs/core/dbt-core-environments.md b/website/docs/docs/core/dbt-core-environments.md index 5daf17bddf9..c7f340557fd 100644 --- a/website/docs/docs/core/dbt-core-environments.md +++ b/website/docs/docs/core/dbt-core-environments.md @@ -1,6 +1,7 @@ --- title: "dbt Core environments" id: "dbt-core-environments" +pagination_next: "docs/running-a-dbt-project/run-your-dbt-projects" --- dbt makes it easy to maintain separate production and development environments through the use of [targets](/reference/dbt-jinja-functions/target.md) within a [profile](/docs/core/connect-data-platform/profiles.yml). A typical profile, when using dbt locally (for example, running from your command line), will have a target named `dev` and have this set as the default. This means that while making changes, your objects will be built in your _development_ target without affecting production queries made by your end users. Once you are confident in your changes, you can deploy the code to _production_, by running your dbt project with a _prod_ target. diff --git a/website/docs/docs/core/homebrew-install.md b/website/docs/docs/core/homebrew-install.md index ab80cc1148f..2e2676c4a95 100644 --- a/website/docs/docs/core/homebrew-install.md +++ b/website/docs/docs/core/homebrew-install.md @@ -3,6 +3,13 @@ title: "Install with Homebrew" description: "You can use Homebrew to install dbt Core and adapter plugins from the command line." --- +:::caution + +Starting with v1.6, dbt Labs will no longer maintain Homebrew formulae as a supported installation method for dbt-core and adapters. For more on our rationale, consult this discussion: +- [Installing dbt Core: saying goodbye to brew and hello to "bundles"](https://github.com/dbt-labs/dbt-core/discussions/8277) + +::: + dbt Labs maintains Homebrew formulae for the four oldest and most popular adapter plugins: Postgres, Redshift, Snowflake, and BigQuery. We recommend you use Homebrew if you meet these conditions: diff --git a/website/docs/docs/core/installation-overview.md b/website/docs/docs/core/installation-overview.md index f1fdb800fdf..cb1df26b0f8 100644 --- a/website/docs/docs/core/installation-overview.md +++ b/website/docs/docs/core/installation-overview.md @@ -2,6 +2,8 @@ title: "About installing dbt" id: "installation" description: "You can install dbt Core using a few different tested methods." 
+pagination_next: "docs/core/homebrew-install" +pagination_prev: null --- You can install dbt Core on the command line by using one of these methods: @@ -11,9 +13,17 @@ You can install dbt Core on the command line by using one of these methods: - [Use a Docker image to install dbt](/docs/core/docker-install) - [Install dbt from source](/docs/core/source-install) +:::tip Pro tip: Using the --help flag + +Most command-line tools, including dbt, have a `--help` flag that you can use to show available commands and arguments. For example, you can use the `--help` flag with dbt in two ways:

    +— `dbt --help`: Lists the commands available for dbt
    +— `dbt run --help`: Lists the flags available for the `run` command + +::: + ## Upgrading dbt Core -dbt provides a number of resources for understanding [general best practices](/blog/upgrade-dbt-without-fear) while upgrading your dbt project as well as detailed [migration guides](/guides/migration/versions/upgrading-to-v1.4) highlighting the changes required for each minor and major release, and [core versions](/docs/dbt-versions/core) +dbt provides a number of resources for understanding [general best practices](/blog/upgrade-dbt-without-fear) while upgrading your dbt project as well as detailed [migration guides](/docs/dbt-versions/core-upgrade/upgrading-to-v1.4) highlighting the changes required for each minor and major release, and [core versions](/docs/dbt-versions/core) - [Upgrade Homebrew](/docs/core/homebrew-install#upgrading-dbt-and-your-adapter) - [Upgrade `pip`](/docs/core/pip-install#change-dbt-core-versions) diff --git a/website/docs/docs/core/pip-install.md b/website/docs/docs/core/pip-install.md index 26a15d8ad37..44fac00e493 100644 --- a/website/docs/docs/core/pip-install.md +++ b/website/docs/docs/core/pip-install.md @@ -5,14 +5,37 @@ description: "You can use pip to install dbt Core and adapter plugins from the c You need to use `pip` to install dbt Core on Windows or Linux operating systems. You can use `pip` or [Homebrew](/docs/core/homebrew-install) for installing dbt Core on a MacOS. -You can install dbt Core and plugins using `pip` because they are Python modules distributed on [PyPI](https://pypi.org/project/dbt/). We recommend using virtual environments when installing with `pip`. - +You can install dbt Core and plugins using `pip` because they are Python modules distributed on [PyPI](https://pypi.org/project/dbt-core/). - +### Using virtual environments +We recommend using virtual environments (venv) to namespace pip modules. + +1. Create a new venv: + +```shell +python3 -m venv dbt-env # create the environment +``` + +2. Activate that same virtual environment each time you create a shell window or session: + +```shell +source dbt-env/bin/activate # activate the environment for Mac and Linux OR +dbt-env\Scripts\activate # activate the environment for Windows +``` + +#### Create an alias +To activate your dbt environment with every new shell window or session, you can create an alias for the source command in your $HOME/.bashrc, $HOME/.zshrc, or whichever config file your shell draws from. + +For example, add the following to your rc file, replacing with the path to your virtual environment configuration. + +```shell +alias env_dbt='source /bin/activate' +``` +### Installing the adapter Once you know [which adapter](/docs/supported-data-platforms) you're using, you can install it as `dbt-`. For example, if using Postgres: ```shell diff --git a/website/docs/docs/core/source-install.md b/website/docs/docs/core/source-install.md index be9918223fe..42086159c03 100644 --- a/website/docs/docs/core/source-install.md +++ b/website/docs/docs/core/source-install.md @@ -1,6 +1,7 @@ --- title: "Install from source" description: "You can install dbt Core from its GitHub code source." +pagination_next: null --- dbt Core and almost all of its adapter plugins are open source software. As such, the codebases are freely available to download and build from source. You might install from source if you want the latest code or want to install dbt from a specific commit. This might be helpful when you are contributing changes, or if you want to debug a past change. 
diff --git a/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md b/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md index 8a5712f40df..168ec0c80f4 100644 --- a/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md +++ b/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md @@ -1,6 +1,7 @@ --- title: "dbt Cloud Administrative API" id: "admin-cloud-api" +pagination_next: "docs/dbt-cloud-apis/discovery-api" --- The dbt Cloud Administrative API is enabled by default for [Team and Enterprise plans](https://www.getdbt.com/pricing/). It can be used to: diff --git a/website/docs/docs/dbt-cloud-apis/apis-overview.md b/website/docs/docs/dbt-cloud-apis/apis-overview.md index 9f7c22a7580..eef64992af9 100644 --- a/website/docs/docs/dbt-cloud-apis/apis-overview.md +++ b/website/docs/docs/dbt-cloud-apis/apis-overview.md @@ -2,16 +2,19 @@ title: "APIs Overview" description: "Learn how dbt accounts on the Team and Enterprise plans can query the dbt Cloud APIs." id: "overview" +pagination_next: "docs/dbt-cloud-apis/user-tokens" +pagination_prev: null --- ## Overview Accounts on the _Team_ and _Enterprise_ plans can query the dbt Cloud APIs. -dbt Cloud provides two APIs: +dbt Cloud provides the following APIs: - The [dbt Cloud Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) can be used to administrate a dbt Cloud account. - The [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api) can be used to fetch metadata related to the state and health of your dbt project. +- The [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) provides multiple API options which allow you to query your metrics defined in the dbt Semantic Layer. If you want to learn more about webhooks, refer to [Webhooks for your jobs](/docs/deploy/webhooks). diff --git a/website/docs/docs/dbt-cloud-apis/authentication.md b/website/docs/docs/dbt-cloud-apis/authentication.md new file mode 100644 index 00000000000..7deadd68f18 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/authentication.md @@ -0,0 +1,22 @@ +--- +title: "Authentication" +description: "Learn how to authenticate with user tokens and service account tokens " +pagination_next: "docs/dbt-cloud-apis/user-tokens" +pagination_prev: null +--- + +
    + + + + + +
    \ No newline at end of file diff --git a/website/docs/docs/dbt-cloud-apis/discovery-api.md b/website/docs/docs/dbt-cloud-apis/discovery-api.md index 16c9bc16ec4..747128cf7bc 100644 --- a/website/docs/docs/dbt-cloud-apis/discovery-api.md +++ b/website/docs/docs/dbt-cloud-apis/discovery-api.md @@ -1,10 +1,11 @@ --- title: "About the Discovery API" +pagination_next: "docs/dbt-cloud-apis/discovery-use-cases-and-examples" --- -Every time dbt Cloud runs a project, it generates and stores information about the project. The metadata includes details about your project’s models, sources, and other nodes along with their execution results. With the dbt Cloud Discovery API, you can query this comprehensive information to gain a better understanding of your DAG and the data it produces. +Every time dbt Cloud runs a project, it generates and stores information about the project. The metadata includes details about your project’s models, sources, and other nodes along with their execution results. With the dbt Cloud Discovery API, you can query this comprehensive information to gain a better understanding of your DAG and the data it produces. -By leveraging the metadata in dbt Cloud, you can create systems for data monitoring and alerting, lineage exploration, and automated reporting. This can help you improve data discovery, data quality, and pipeline operations within your organization. +By leveraging the metadata in dbt Cloud, you can create systems for data monitoring and alerting, lineage exploration, and automated reporting. This can help you improve data discovery, data quality, and pipeline operations within your organization. You can access the Discovery API through [ad hoc queries](/docs/dbt-cloud-apis/discovery-querying), custom applications, a wide range of [partner ecosystem integrations](https://www.getdbt.com/product/integrations/) (like BI/analytics, catalog and governance, and quality and observability), and by using dbt Cloud features like [model timing](/docs/deploy/run-visibility#model-timing) and [dashboard status tiles](/docs/deploy/dashboard-status-tiles). @@ -17,13 +18,13 @@ You can query the dbt Cloud metadata: - At the job level for results on a specific dbt Cloud job run for a given resource type, like `models` or `test`. :::tip Public Preview -The Discovery API is currently available in Public Preview for dbt Cloud accounts on a Team or Enterprise plan. It’s available to all multi-tenant and to only select single-tenant accounts (please ask your account team to confirm). Preview features are stable and can be considered for production deployments, but there might still be some planned additions and modifications to product behavior before moving to General Availability. For details, refer to [dbt Product lifecycles](/docs/dbt-versions/product-lifecycles). +The Discovery API is currently available in Public Preview for dbt Cloud accounts on a Team or Enterprise plan. It’s available to all multi-tenant and to only select single-tenant accounts (please ask your account team to confirm). Preview features are stable and can be considered for production deployments, but there might still be some planned additions and modifications to product behavior before moving to General Availability. For details, refer to [dbt Product lifecycles](/docs/dbt-versions/product-lifecycles). ::: ## What you can use the Discovery API for -Click the tabs below to learn more about the API's use cases, the analysis you can do, and the results you can achieve by integrating with it. 
+Click the tabs below to learn more about the API's use cases, the analysis you can do, and the results you can achieve by integrating with it. To use the API directly or integrate your tool with it, refer to [Uses case and examples](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) for detailed information. @@ -33,7 +34,7 @@ To use the API directly or integrate your tool with it, refer to [Uses case and Use the API to look at historical information like model build time to determine the health of your dbt projects. Finding inefficiencies in orchestration configurations can help decrease infrastructure costs and improve timeliness. To learn more about how to do this, refer to [Performance](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#performance). -You can use, for example, the [model timing](/docs/deploy/run-visibility#model-timing) tab to help identify and optimize bottlenecks in model builds: +You can use, for example, the [model timing](/docs/deploy/run-visibility#model-timing) tab to help identify and optimize bottlenecks in model builds: @@ -53,7 +54,7 @@ Use the API to find and understand dbt assets in integrated tools using informat Data producers must manage and organize data for stakeholders, while data consumers need to quickly and confidently analyze data on a large scale to make informed decisions that improve business outcomes and reduce organizational overhead. The API is useful for discovery data experiences in catalogs, analytics, apps, and machine learning (ML) tools. It can help you understand the origin and meaning of datasets for your analysis. - + @@ -75,7 +76,7 @@ Use the API to review dataset changes and uses by examining exposures, lineage, ## Types of project state -There are two types of [project state](/docs/dbt-cloud-apis/project-state) at the environment level that you can query the results of: +There are two types of [project state](/docs/dbt-cloud-apis/project-state) at the environment level that you can query the results of: - **Definition** — The logical state of a dbt project’s [resources](/docs/build/projects) that update when the project is changed. - **Applied** — The output of successful dbt DAG execution that creates or describes the state of the database (for example: `dbt run`, `dbt test`, source freshness, and so on) @@ -86,5 +87,4 @@ These states allow you to easily examine the difference between a model’s defi - [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) - [Query the Discovery API](/docs/dbt-cloud-apis/discovery-querying) -- [Schema](/docs/dbt-cloud-apis/discovery-schema-model) - +- [Schema](/docs/dbt-cloud-apis/discovery-schema-job) diff --git a/website/docs/docs/dbt-cloud-apis/discovery-querying.md b/website/docs/docs/dbt-cloud-apis/discovery-querying.md index 77fed109c68..35c092adb4b 100644 --- a/website/docs/docs/dbt-cloud-apis/discovery-querying.md +++ b/website/docs/docs/dbt-cloud-apis/discovery-querying.md @@ -1,14 +1,15 @@ --- title: "Query the Discovery API" id: "discovery-querying" -sidebar_label: "Query the Discovery API" +sidebar_label: "Query the Discovery API" +pagination_next: "docs/dbt-cloud-apis/discovery-schema-environment" --- -The Discovery API supports ad-hoc queries and integrations.. If you are new to the API, read the [Discovery API overview](/docs/dbt-cloud-apis/discovery-api) for an introduction. +The Discovery API supports ad-hoc queries and integrations. 
If you are new to the API, refer to [About the Discovery API](/docs/dbt-cloud-apis/discovery-api) for an introduction. -Use the Discovery API to evaluate data pipeline health and project state across runs or at a moment in time. dbt Labs provide a [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql) for this API, enabling you to run queries and browse the schema. +Use the Discovery API to evaluate data pipeline health and project state across runs or at a moment in time. dbt Labs provide a [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql) for this API, enabling you to run queries and browse the schema. -Since GraphQL describes the data in the API, the schema displayed in the GraphQL explorer accurately represents the graph and fields available to query. +Since GraphQL describes the data in the API, the schema displayed in the GraphQL explorer accurately represents the graph and fields available to query. @@ -16,17 +17,17 @@ Since GraphQL describes the data in the API, the schema displayed in the GraphQL Currently, authorization of requests takes place [using a service token](/docs/dbt-cloud-apis/service-tokens). dbt Cloud admin users can generate a Metadata Only service token that is authorized to execute a specific query against the Discovery API. -Once you've created a token, you can use it in the Authorization header of requests to the dbt Cloud Discovery API. Be sure to include the Token prefix in the Authorization header, or the request will fail with a `401 Unauthorized` error. Note that `Bearer` can be used instead of `Token` in the Authorization header. Both syntaxes are equivalent. +Once you've created a token, you can use it in the Authorization header of requests to the dbt Cloud Discovery API. Be sure to include the Token prefix in the Authorization header, or the request will fail with a `401 Unauthorized` error. Note that `Bearer` can be used instead of `Token` in the Authorization header. Both syntaxes are equivalent. -## Access the Discovery API +## Access the Discovery API 1. Create a [service account token](/docs/dbt-cloud-apis/service-tokens) to authorize requests. dbt Cloud Admin users can generate a _Metadata Only_ service token, which can be used to execute a specific query against the Discovery API to authorize requests. -2. Find your API URL using the endpoint `https://metadata.{YOUR_ACCESS_URL}/graphql`. +2. Find your API URL using the endpoint `https://metadata.{YOUR_ACCESS_URL}/graphql`. * Replace `{YOUR_ACCESS_URL}` with the appropriate [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. For example, if your multi-tenant region is North America, your endpoint is `https://metadata.cloud.getdbt.com/graphql`. If your multi-tenant region is EMEA, your endpoint is `https://metadata.emea.dbt.com/graphql`. -3. For specific query points, refer to the [schema documentation](/docs/dbt-cloud-apis/discovery-schema-model). +3. For specific query points, refer to the [schema documentation](/docs/dbt-cloud-apis/discovery-schema-job). ## Run queries using HTTP requests @@ -36,7 +37,7 @@ You can run queries by sending a `POST` request to the `https://metadata.YOUR_AC * `YOUR_TOKEN` in the Authorization header with your actual API token. Be sure to include the Token prefix. * `QUERY_BODY` with a GraphQL query, for example `{ "query": "" }` * `VARIABLES` with a dictionary of your GraphQL query variables, such as a job ID or a filter. -* `ENDPOINT` with the endpoint you're querying, such as environment. 
+* `ENDPOINT` with the endpoint you're querying, such as environment. ```shell curl 'https://metadata.YOUR_ACCESS_URL/graphql' \ @@ -48,10 +49,13 @@ You can run queries by sending a `POST` request to the `https://metadata.YOUR_AC Python example: -```py -response = requests.post('YOUR_ACCESS_URL', -headers={"authorization": "Bearer "+YOUR_TOKEN, "content-type": "application/json"}, -json={"query": QUERY_BODY, "variables": VARIABLES}) +```python +response = requests.post( + 'YOUR_ACCESS_URL', + headers={"authorization": "Bearer "+YOUR_TOKEN, "content-type": "application/json"}, + json={"query": QUERY_BODY, "variables": VARIABLES} +) + metadata = response.json()['data'][ENDPOINT] ``` @@ -63,75 +67,82 @@ There are several illustrative example queries on this page. For more examples, ## Reasonable use Discovery (GraphQL) API usage is subject to request rate and response size limits to maintain the performance and stability of the metadata platform and prevent abuse. -- The current request rate limit is 200 requests for a given IP address within a minute. If you exceed this limit, you will receive an HTTP 429 response status. -- Environment-level endpoints will be subject to response size limits in the future. The depth of the graph should not exceed three levels. A user can paginate up to 500 items per query. -- Job-level endpoints are subject to query complexity limits. Nested nodes (like parents), code (like rawCode), and catalog columns are considered as most complex. Overly complex queries should be broken up into separate queries with only necessary fields included. dbt Labs recommends using the environment endpoint instead for most use cases to get the latest descriptive and result metadata for a dbt Cloud project. + +Job-level endpoints are subject to query complexity limits. Nested nodes (like parents), code (like rawCode), and catalog columns are considered as most complex. Overly complex queries should be broken up into separate queries with only necessary fields included. dbt Labs recommends using the environment endpoint instead for most use cases to get the latest descriptive and result metadata for a dbt Cloud project. ## Retention limits You can use the Discovery API to query data from the previous three months. For example, if today was April 1st, you could query data back to January 1st. ## Run queries with the GraphQL explorer -You can run ad-hoc queries directly in the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and use the document explorer on the left-hand side, where you can see all possible nodes and fields. +You can run ad-hoc queries directly in the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and use the document explorer on the left-hand side to see all possible nodes and fields. + +Refer to the [Apollo explorer documentation](https://www.apollographql.com/docs/graphos/explorer/explorer) for setup and authorization info. -Refer to the [Apollo explorer documentation](https://www.apollographql.com/docs/graphos/explorer/explorer) for setup and authorization info. +1. Access the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and select fields you want to query. -1. Access the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and select fields you'd like query. +2. Select **Variables** at the bottom of the explorer and replace any `null` fields with your unique values. -2. Go to **Variables** at the bottom of the explorer and replace any `null` fields with your unique values. +3. 
[Authenticate](https://www.apollographql.com/docs/graphos/explorer/connecting-authenticating#authentication) using Bearer auth with `YOUR_TOKEN`. Select **Headers** at the bottom of the explorer and select **+New header**. -3. [Authenticate](https://www.apollographql.com/docs/graphos/explorer/connecting-authenticating#authentication) via Bearer auth with `YOUR_TOKEN`. Go to **Headers** at the bottom of the explorer and select **+New header**. +4. Select **Authorization** in the **header key** dropdown list and enter your Bearer auth token in the **value** field. Remember to include the Token prefix. Your header key should be in this format: `{"Authorization": "Bearer <YOUR_TOKEN>"}`. + + + -4. Select **Authorization** in the **header key** drop-down list and enter your Bearer auth token in the **value** field. Remember to include the Token prefix. Your header key should look like this `{"Authorization": "Bearer }`.
    -5. Run your query by pressing the blue query button in the top-right of the Operation editor (to the right of the query). You should see a successful query response on the right side of the explorer. +1. Run your query by clicking the blue query button in the top right of the **Operation** editor (to the right of the query). You should see a successful query response on the right side of the explorer. + + + ### Fragments -Use the [`..on`](https://www.apollographql.com/docs/react/data/fragments/) notation to query across lineage and retrieve results from specific node types. +Use the [`... on`](https://www.apollographql.com/docs/react/data/fragments/) notation to query across lineage and retrieve results from specific node types. ```graphql - -environment(id: $environmentId) { - applied { - models(first: $first,filter:{uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - name - ancestors(types:[Model, Source, Seed, Snapshot]) { - ... on ModelAppliedStateNode { - name - resourceType - materializedType - executionInfo { - executeCompletedAt +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" }) { + edges { + node { + name + ancestors(types: [Model, Source, Seed, Snapshot]) { + ... on ModelAppliedStateNestedNode { + name + resourceType + materializedType + executionInfo { + executeCompletedAt + } } - } - ... on SourceAppliedStateNode { - sourceName - name - resourceType - freshness { - maxLoadedAt + ... on SourceAppliedStateNestedNode { + sourceName + name + resourceType + freshness { + maxLoadedAt + } } - } - ... on SnapshotAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt + ... on SnapshotAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } } - } - ... on SeedAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt + ... on SeedAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } } } } @@ -140,56 +151,59 @@ environment(id: $environmentId) { } } } - ``` ### Pagination -Querying large datasets can impact performance on multiple functions in the API pipeline. Pagination eases the burden by returning smaller data sets one page at a time. This is useful for returning a particular portion of the dataset or the entire dataset piece-by-piece to enhance performance. dbt Cloud utilizes cursor-based pagination, which makes it easy to return pages of constantly changing data. +Querying large datasets can impact performance on multiple functions in the API pipeline. Pagination eases the burden by returning smaller data sets one page at a time. This is useful for returning a particular portion of the dataset or the entire dataset piece-by-piece to enhance performance. dbt Cloud utilizes cursor-based pagination, which makes it easy to return pages of constantly changing data. -Use the `PageInfo` object to return information about the page. The following fields are available: +Use the `PageInfo` object to return information about the page. The available fields are: -- `startCursor` string type - corresponds to the first `node` in the `edge`. -- `endCursor` string type - corresponds to the last `node` in the `edge`. -- `hasNextPage` boolean type - whether there are more `nodes` after the returned results. -- `hasPreviousPage` boolean type - whether `nodes` exist before the returned results. 
+- `startCursor` string type — Corresponds to the first `node` in the `edge`. +- `endCursor` string type — Corresponds to the last `node` in the `edge`. +- `hasNextPage` boolean type — Whether or not there are more `nodes` after the returned results. There are connection variables available when making the query: -- `first` integer type - will return the first 'n' `nodes` for each page, up to 500. -- `after` string type sets the cursor to retrieve `nodes` after. It's best practice to set the `after` variable with the object ID defined in the `endcursor` of the previous page. +- `first` integer type — Returns the first n `nodes` for each page, up to 500. +- `after` string type — Sets the cursor to retrieve `nodes` after. It's best practice to set the `after` variable with the object ID defined in the `endCursor` of the previous page. + +Below is an example that returns the `first` 500 models `after` the specified Object ID in the variables. The `PageInfo` object returns where the object ID where the cursor starts, where it ends, and whether there is a next page. -The following example shows that we're returning the `first` 500 models `after` the specified Object ID in the variables. The `PageInfo` object will return where the object ID where the cursor starts, where it ends, and whether there is a next page. + + - + -Here is a code example of the `PageInfo` object: +Below is a code example of the `PageInfo` object: ```graphql pageInfo { - startCursor - endCursor - hasNextPage - } - totalCount # Total number of pages - + startCursor + endCursor + hasNextPage +} +totalCount # Total number of records across all pages ``` ### Filters -Filtering helps to narrow down the results of an API query. Want to query and return only models and tests that are failing? Or find models that are taking too long to run? You can fetch execution details such as [`executionTime`](/docs/dbt-cloud-apis/discovery-schema-models#fields), [`runElapsedTime`](/docs/dbt-cloud-apis/discovery-schema-models#fields), or [`status`](/docs/dbt-cloud-apis/discovery-schema-models#fields). This helps data teams monitor the performance of their models, identify bottlenecks, and optimize the overall data pipeline. +Filtering helps to narrow down the results of an API query. If you want to query and return only models and tests that are failing or find models that are taking too long to run, you can fetch execution details such as [`executionTime`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields), [`runElapsedTime`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields), or [`status`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields). This helps data teams monitor the performance of their models, identify bottlenecks, and optimize the overall data pipeline. -In the following example, we can see that we're filtering results to models that have succeeded on their `lastRunStatus`: +Below is an example that filters for results of models that have succeeded on their `lastRunStatus`: - + -Here is a code example that filters for models that have an error on their last run and tests that have failed: +Below is an example that filters for models that have an error on their last run and tests that have failed: -```graphql + + -environment(id: $environmentId) { +```graphql +query ModelsAndTests($environmentId: BigInt!, $first: Int!) 
{ + environment(id: $environmentId) { applied { - models(first: $first, filter: {lastRunStatus:error}) { + models(first: $first, filter: { lastRunStatus: error }) { edges { node { name @@ -199,7 +213,7 @@ environment(id: $environmentId) { } } } - tests(first: $first, filter: {status:"fail"}) { + tests(first: $first, filter: { status: "fail" }) { edges { node { name @@ -208,12 +222,13 @@ environment(id: $environmentId) { } } } - } + } + } + } } - ``` ## Related content - [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) -- [Schema](/docs/dbt-cloud-apis/discovery-schema-model) +- [Schema](/docs/dbt-cloud-apis/discovery-schema-job) diff --git a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md index 030688d9aeb..8efb1ec0d37 100644 --- a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md +++ b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md @@ -3,9 +3,9 @@ title: "Use cases and examples for the Discovery API" sidebar_label: "Uses and examples" --- -With the Discovery API, you can query the metadata in dbt Cloud to learn more about your dbt deployments and the data it generates to analyze them and make improvements. +With the Discovery API, you can query the metadata in dbt Cloud to learn more about your dbt deployments and the data it generates to analyze them and make improvements. -You can use the API in a variety of ways to get answers to your business questions. Below describes some of the uses of the API and is meant to give you an idea of the questions this API can help you answer. +You can use the API in a variety of ways to get answers to your business questions. Below describes some of the uses of the API and is meant to give you an idea of the questions this API can help you answer. | Use Case | Outcome | Example Questions | | --- | --- | --- | @@ -17,13 +17,13 @@ You can use the API in a variety of ways to get answers to your business questio ## Performance -You can use the Discovery API to identify inefficiencies in pipeline execution to reduce infrastructure costs and improve timeliness. Below are example questions and queries you can run. +You can use the Discovery API to identify inefficiencies in pipeline execution to reduce infrastructure costs and improve timeliness. Below are example questions and queries you can run. For performance use cases, people typically query the historical or latest applied state across any part of the DAG (for example, models) using the `environment`, `modelByEnvironment`, or job-level endpoints. ### How long did each model take to run? -It’s helpful to understand how long it takes to build models (tables) and tests to execute during a dbt run. Longer model build times result in higher infrastructure costs and fresh data arriving later to stakeholders. Analyses like these can be in observability tools or ad-hoc queries, like in a notebook. +It’s helpful to understand how long it takes to build models (tables) and tests to execute during a dbt run. Longer model build times result in higher infrastructure costs and fresh data arriving later to stakeholders. Analyses like these can be in observability tools or ad-hoc queries, like in a notebook. @@ -35,33 +35,42 @@ Data teams can monitor the performance of their models, identify bottlenecks, an 1. Use latest state environment-level API to get a list of all executed models and their execution time. 
Then, sort the models by `executionTime` in descending order. ```graphql -query Query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first) { - edges { - node { - name - uniqueId - materializedType - executionInfo { - lastSuccessRunId - executionTime - executeStartedAt - } - } - } +query AppliedModels($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + name + uniqueId + materializedType + executionInfo { + lastSuccessRunId + executionTime + executeStartedAt } + } } + } } + } } ``` -2. Get the most recent 20 run results for the longest running model. Review the results of the model across runs, or you can go to the job/run or commit itself to investigate further. +2. Get the most recent 20 run results for the longest running model. Review the results of the model across runs or you can go to the job/run or commit itself to investigate further. ```graphql -query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int!) { - modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId, lastRunCount: $lastRunCount) { +query ModelHistoricalRuns( + $environmentId: BigInt! + $uniqueId: String + $lastRunCount: Int +) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns( + uniqueId: $uniqueId + lastRunCount: $lastRunCount + ) { name runId runElapsedTime @@ -70,12 +79,15 @@ query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int!) { executeStartedAt executeCompletedAt status + } } + } } ``` 3. Use the query results to plot a graph of the longest running model’s historical run time and execution time trends. + ```python # Import libraries import os @@ -88,11 +100,11 @@ auth_token = *[SERVICE_TOKEN_HERE]* # Query the API def query_discovery_api(auth_token, gql_query, variables): - response = requests.post('https://metadata.cloud.getdbt.com/graphql', + response = requests.post('https://metadata.cloud.getdbt.com/graphql', headers={"authorization": "Bearer "+auth_token, "content-type": "application/json"}, json={"query": gql_query, "variables": variables}) data = response.json()['data'] - + return data # Get the latest run metadata for all models @@ -120,7 +132,7 @@ variables_query_two = { } # Get the historical run metadata for the longest running model -model_historical_metadata = query_discovery_api(auth_token, query_two, variables_query_two)['modelByEnvironment'] +model_historical_metadata = query_discovery_api(auth_token, query_two, variables_query_two)['environment']['applied']['modelHistoricalRuns'] # Convert to dataframe model_df = pd.DataFrame(model_historical_metadata) @@ -143,7 +155,8 @@ plt.plot(model_df['executeStartedAt'], model_df['executionTime']) plt.title(model_df['name'].iloc[0]+" Execution Time") plt.show() ``` -Plotting examples: + +Plotting examples: @@ -152,70 +165,91 @@ Plotting examples: -### What’s the latest state of each model? +### What’s the latest state of each model? The Discovery API provides information about the applied state of models and how they arrived in that state. You can retrieve the status information from the most recent run and most recent successful run (execution) from the `environment` endpoint and dive into historical runs using job-based and `modelByEnvironment` endpoints.
    Example query -The API returns full identifier information (`database.schema.alias`) and the `executionInfo` for both the most recent run and most recent successful run from the database: - - - ```graphql - query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first) { - edges { - node { - uniqueId - compiledCode - database - schema - alias - materializedType - executionInfo { - executeCompletedAt - lastJobDefinitionId - lastRunGeneratedAt - lastRunId - lastRunStatus - lastRunError - lastSuccessJobDefinitionId - runGeneratedAt - lastSuccessRunId - } - } - } - } - } - } - } - ``` +The API returns full identifier information (`database.schema.alias`) and the `executionInfo` for both the most recent run and most recent successful run from the database: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + compiledCode + database + schema + alias + materializedType + executionInfo { + executeCompletedAt + lastJobDefinitionId + lastRunGeneratedAt + lastRunId + lastRunStatus + lastRunError + lastSuccessJobDefinitionId + runGeneratedAt + lastSuccessRunId + } + } + } + } + } + } +} +```
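
If you want to use these results outside the GraphQL explorer, a small script can flatten the response into a per-model status table. Below is a minimal sketch that reuses the `requests` pattern shown earlier on this page; `LATEST_STATE_QUERY` is assumed to hold the query above, and the URL, token, and variable values are placeholders to replace with your own.

```python
import requests

API_URL = "https://metadata.cloud.getdbt.com/graphql"  # use your region's metadata URL
TOKEN = "YOUR_TOKEN"
LATEST_STATE_QUERY = "..."  # the latest-state query shown above

response = requests.post(
    API_URL,
    headers={"authorization": "Bearer " + TOKEN, "content-type": "application/json"},
    json={"query": LATEST_STATE_QUERY, "variables": {"environmentId": 123, "first": 500}},
)
edges = response.json()["data"]["environment"]["applied"]["models"]["edges"]

# Keep just the identifier and the latest run outcome for each model.
statuses = [
    {
        "uniqueId": edge["node"]["uniqueId"],
        "lastRunStatus": edge["node"]["executionInfo"]["lastRunStatus"],
        "executeCompletedAt": edge["node"]["executionInfo"]["executeCompletedAt"],
    }
    for edge in edges
]
for row in statuses:
    print(row)
```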
    ### What happened with my job run? -You can query the metadata at the job level to review results for specific runs. This is helpful for historical analysis of deployment performance or optimizing particular jobs. +You can query the metadata at the job level to review results for specific runs. This is helpful for historical analysis of deployment performance or optimizing particular jobs. + +import DiscoveryApiJobDeprecationNotice from '/snippets/_discovery_api_job_deprecation_notice.md'; + +
    Example query +Deprecated example: ```graphql -query($jobId: Int!, $runId: Int!){ - models(jobId: $jobId, runId: $runId) { - name - status - tests { - name - status - } - } +query ($jobId: Int!, $runId: Int!) { + models(jobId: $jobId, runId: $runId) { + name + status + tests { + name + status + } + } +} +``` + +New example: + +```graphql +query ($jobId: BigInt!, $runId: BigInt!) { + job(id: $jobId, runId: $runId) { + models { + name + status + tests { + name + status + } + } + } } ``` - +
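
If you call the job-scoped query shown in the new example, a short post-processing step can summarize which tests failed for each model in that run. Below is a minimal sketch that assumes the JSON body of that response has already been loaded into a `result` dictionary.

```python
# result = response.json() from the job-scoped query above
models = result["data"]["job"]["models"]

for model in models:
    failed_tests = [t["name"] for t in model["tests"] if t["status"] == "fail"]
    print(f'{model["name"]}: status={model["status"]}, failed tests={failed_tests}')
```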
    ### What’s changed since the last run? @@ -228,41 +262,47 @@ With the API, you can compare the `rawCode` between the definition and applied s ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first, filter: {uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - rawCode - ancestors(types: [Source]){ - ...on SourceAppliedStateNode { - freshness { - maxLoadedAt - } - } - } - executionInfo { - runGeneratedAt - executeCompletedAt - } - materializedType - } - } - } - } - definition { - models(first: $first, filter: {uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - rawCode - runGeneratedAt - materializedType - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + rawCode + ancestors(types: [Source]) { + ... on SourceAppliedStateNestedNode { + freshness { + maxLoadedAt + } + } + } + executionInfo { + runGeneratedAt + executeCompletedAt + } + materializedType + } + } + } + } + definition { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + rawCode + runGeneratedAt + materializedType + } + } + } + } + } } ``` @@ -270,45 +310,46 @@ query($environmentId: Int!, $first: Int!){ ## Quality -You can use the Discovery API to monitor data source freshness and test results to diagnose and resolve issues and drive trust in data. When used with [webhooks](/docs/deploy/webhooks), can also help with detecting, investigating, and alerting issues. Below lists example questions the API can help you answer. Below are example questions and queries you can run. +You can use the Discovery API to monitor data source freshness and test results to diagnose and resolve issues and drive trust in data. When used with [webhooks](/docs/deploy/webhooks), can also help with detecting, investigating, and alerting issues. Below lists example questions the API can help you answer. Below are example questions and queries you can run. -For quality use cases, people typically query the historical or latest applied state, often in the upstream part of the DAG (for example, sources), using the `environment` or `modelByEnvironment` endpoints. +For quality use cases, people typically query the historical or latest applied state, often in the upstream part of the DAG (for example, sources), using the `environment` or `environment { applied { modelHistoricalRuns } }` endpoints. ### Which models and tests failed to run? + By filtering on the latest status, you can get lists of models that failed to build and tests that failed during their most recent execution. This is helpful when diagnosing issues with the deployment that result in delayed or incorrect data.
    Example query with code -1. Get the latest run results across all jobs in the environment and return only the models and tests that errored/failed. +1. Get the latest run results across all jobs in the environment and return only the models and tests that errored/failed. ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first, filter: {lastRunStatus:error}) { - edges { - node { - name - executionInfo { - lastRunId - } - } - } - } - tests(first: $first, filter: {status:"fail"}) { - edges { - node { - name - executionInfo { - lastRunId - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { lastRunStatus: error }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + tests(first: $first, filter: { status: "fail" }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + } + } } ``` @@ -316,14 +357,18 @@ query($environmentId: Int!, $first: Int!){ ```graphql -query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int) { - modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId, lastRunCount: $lastRunCount) { - name - executeStartedAt - status - tests { - name - status +query ($environmentId: BigInt!, $uniqueId: String!, $lastRunCount: Int) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId, lastRunCount: $lastRunCount) { + name + executeStartedAt + status + tests { + name + status + } + } } } } @@ -337,63 +382,67 @@ query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int) { ### When was the data my model uses last refreshed? -You can get the metadata on the latest execution for a particular model or across all models in your project. For instance, investigate when each model or snapshot that's feeding into a given model was last executed or the source or seed was last loaded to gauge the _freshness_ of the data. +You can get the metadata on the latest execution for a particular model or across all models in your project. For instance, investigate when each model or snapshot that's feeding into a given model was last executed or the source or seed was last loaded to gauge the _freshness_ of the data.
    Example query with code ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first,filter:{uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - name - ancestors(types:[Model, Source, Seed, Snapshot]) { - ... on ModelAppliedStateNode { - name - resourceType - materializedType - executionInfo { - executeCompletedAt - } - } - ... on SourceAppliedStateNode { - sourceName - name - resourceType - freshness { - maxLoadedAt - } - } - ... on SnapshotAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt - } - } - ... on SeedAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + name + ancestors(types: [Model, Source, Seed, Snapshot]) { + ... on ModelAppliedStateNestedNode { + name + resourceType + materializedType + executionInfo { + executeCompletedAt + } + } + ... on SourceAppliedStateNestedNode { + sourceName + name + resourceType + freshness { + maxLoadedAt + } + } + ... on SnapshotAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + ... on SeedAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + } + } + } + } + } + } } ``` + ```python # Extract graph nodes from response -def extract_nodes(data): +def extract_nodes(data): models = [] sources = [] groups = [] @@ -422,9 +471,9 @@ def create_freshness_graph(models_df, sources_df): if model["executionInfo"]["executeCompletedAt"] is not None: model_freshness = current_time - pd.Timestamp(model["executionInfo"]["executeCompletedAt"]) for ancestor in model["ancestors"]: - if ancestor["resourceType"] == "SourceAppliedStateNode": + if ancestor["resourceType"] == "SourceAppliedStateNestedNode": ancestor_freshness = current_time - pd.Timestamp(ancestor["freshness"]['maxLoadedAt']) - elif ancestor["resourceType"] == "ModelAppliedStateNode": + elif ancestor["resourceType"] == "ModelAppliedStateNestedNode": ancestor_freshness = current_time - pd.Timestamp(ancestor["executionInfo"]["executeCompletedAt"]) if ancestor_freshness > max_freshness: @@ -437,11 +486,11 @@ def create_freshness_graph(models_df, sources_df): for _, model in models_df.iterrows(): for parent in model["parents"]: G.add_edge(parent["uniqueId"], model["uniqueId"]) - + return G ``` -Graph example: +Graph example: @@ -450,7 +499,7 @@ Graph example: ### Are my data sources fresh? -Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshness) allows you to ensure that sources loaded and used in your dbt project are compliant with expectations. The API provides the latest metadata about source loading and information about the freshness check criteria. +Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshness) allows you to ensure that sources loaded and used in your dbt project are compliant with expectations. The API provides the latest metadata about source loading and information about the freshness check criteria. 
@@ -458,47 +507,49 @@ Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshne Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - sources(first: $first, filters:{freshnessChecked:true, database:"production"}) { - edges { - node { - sourceName - name - identifier - loader - freshness { - freshnessJobDefinitionId - freshnessRunId - freshnessRunGeneratedAt - freshnessStatus - freshnessChecked - maxLoadedAt - maxLoadedAtTimeAgoInS - snapshottedAt - criteria { - errorAfter { - count - period - } - warnAfter { - count - period - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + sources( + first: $first + filter: { freshnessChecked: true, database: "production" } + ) { + edges { + node { + sourceName + name + identifier + loader + freshness { + freshnessJobDefinitionId + freshnessRunId + freshnessRunGeneratedAt + freshnessStatus + freshnessChecked + maxLoadedAt + maxLoadedAtTimeAgoInS + snapshottedAt + criteria { + errorAfter { + count + period + } + warnAfter { + count + period + } + } + } + } + } + } + } + } } ```
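
Beyond reading `freshnessStatus` directly, you can compare `maxLoadedAtTimeAgoInS` against the configured criteria yourself, for example to build a custom alert. Below is a minimal sketch that assumes the JSON body of the query above is loaded into a `result` dictionary and that `period` is one of `minute`, `hour`, or `day`; sources without an `errorAfter` threshold are skipped.

```python
SECONDS_PER_PERIOD = {"minute": 60, "hour": 3600, "day": 86400}

edges = result["data"]["environment"]["applied"]["sources"]["edges"]
for edge in edges:
    node = edge["node"]
    freshness = node["freshness"]
    error_after = freshness["criteria"]["errorAfter"]
    if not error_after or error_after["count"] is None:
        continue  # no error threshold configured for this source
    threshold_s = error_after["count"] * SECONDS_PER_PERIOD[error_after["period"]]
    if freshness["maxLoadedAtTimeAgoInS"] > threshold_s:
        hours_ago = freshness["maxLoadedAtTimeAgoInS"] / 3600
        print(f'{node["sourceName"]}.{node["name"]} is stale (loaded {hours_ago:.1f} hours ago)')
```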
    - ### What’s the test coverage and status? [Tests](https://docs.getdbt.com/docs/build/tests) are an important way to ensure that your stakeholders are reviewing high-quality data. You can execute tests during a dbt Cloud run. The Discovery API provides complete test results for a given environment or job, which it represents as the `children` of a given node that’s been tested (for example, a `model`). @@ -506,32 +557,32 @@ query($environmentId: Int!, $first: Int!){
    Example query -For the following example, the `parents` are the nodes (code) that's being tested and `executionInfo` describes the latest test results: +For the following example, the `parents` are the nodes (code) that's being tested and `executionInfo` describes the latest test results: ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - tests(first: $first) { - edges { - node { - name - columnName - parents { - name - resourceType - } - executionInfo { - lastRunStatus - lastRunError - executeCompletedAt - executionTime - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + tests(first: $first) { + edges { + node { + name + columnName + parents { + name + resourceType + } + executionInfo { + lastRunStatus + lastRunError + executeCompletedAt + executionTime + } + } + } + } + } + } } ``` @@ -541,44 +592,41 @@ query($environmentId: Int!, $first: Int!){ ### How is this model contracted and versioned? -To enforce the shape of a model's definition, you can define contracts on models and their columns. You can also specify model versions to keep track of discrete stages in its evolution and use the appropriate one. +To enforce the shape of a model's definition, you can define contracts on models and their columns. You can also specify model versions to keep track of discrete stages in its evolution and use the appropriate one. + +
    Example query ```graphql -query{ - environment(id:123) { - definition { - models(first:100, filter:{access:public}) { - edges { - nodes { - name - latest_version - contract_enforced - constraints{ - name - type - expression - columns - } - catalog { - columns { - name - type - constraints { - name - type - expression - } - } - } - } - } - } - } - } +query { + environment(id: 123) { + applied { + models(first: 100, filter: { access: public }) { + edges { + node { + name + latestVersion + contractEnforced + constraints { + name + type + expression + columns + } + catalog { + columns { + name + type + } + } + } + } + } + } + } } ``` @@ -594,42 +642,50 @@ For discovery use cases, people typically query the latest applied or definition ### What does this dataset and its columns mean? -Query the Discovery API to map a table/view in the data platform to the model in the dbt project; then, retrieve metadata about its meaning, including descriptive metadata from its YAML file and catalog information from its YAML file and the schema. - +Query the Discovery API to map a table/view in the data platform to the model in the dbt project; then, retrieve metadata about its meaning, including descriptive metadata from its YAML file and catalog information from its YAML file and the schema.
    Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first, filter: {database:"analytics", schema:"prod", identifier:"customers"}) { - edges { - node { - name - description - tags - meta - catalog { - columns { - name - description - type - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { + database: "analytics" + schema: "prod" + identifier: "customers" + } + ) { + edges { + node { + name + description + tags + meta + catalog { + columns { + name + description + type + } + } + } + } + } + } + } } ```
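
To feed a data catalog or similar tool, you can flatten the catalog portion of this response into one row per column. Below is a minimal sketch that assumes the JSON body of the query above is loaded into a `result` dictionary; models without generated catalog information are skipped.

```python
rows = []
for edge in result["data"]["environment"]["applied"]["models"]["edges"]:
    node = edge["node"]
    for column in (node["catalog"] or {}).get("columns", []):
        rows.append({
            "model": node["name"],
            "model_description": node["description"],
            "column": column["name"],
            "type": column["type"],
            "column_description": column["description"],
        })

for row in rows:
    print(row)
```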
    + + -### Which metrics are available? +### Which metrics are available? -Metric definitions are coming soon to the Discovery API with dbt v1.6. You’ll be able to query metrics using the dbt Semantic Layer, use them for documentation purposes (like for a data catalog), and calculate aggregations (like in a BI tool that doesn’t query the SL). +You can define and query metrics using the [dbt Semantic Layer](/docs/build/about-metricflow), use them for documentation purposes (like for a data catalog), and calculate aggregations (like in a BI tool that doesn’t query the SL). To learn more, refer to [Get started with MetricFlow](/docs/build/sl-getting-started).
    Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - definition { - metrics(first: $first) { - edges { - node { - name - description - type - formula - filter - tags - parents { - name - resourceType - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + metrics(first: $first) { + edges { + node { + name + description + type + formula + filter + tags + parents { + name + resourceType + } + } + } + } + } + } } ``` @@ -912,7 +952,7 @@ query($environmentId: Int!, $first: Int!){ -## Governance +## Governance You can use the Discovery API to audit data development and facilitate collaboration within and between teams. @@ -923,95 +963,98 @@ For governance use cases, people tend to query the latest definition state, ofte You can define and surface the groups each model is associated with. Groups contain information like owner. This can help you identify which team owns certain models and who to contact about them.
    -Example query +Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - model(first: $first, filter:{uniqueIds:["MODEL.PROJECT.NAME"]}) { - edges { - node { - name - description - resourceType - access - group - } - } - } - } - definition { - groups(first: $first) { - edges { - node { - name - resourceType - models { - name - } - owner_name - owner_email - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { uniqueIds: ["MODEL.PROJECT.NAME"] }) { + edges { + node { + name + description + resourceType + access + group + } + } + } + } + definition { + groups(first: $first) { + edges { + node { + name + resourceType + models { + name + } + ownerName + ownerEmail + } + } + } + } + } } ```
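
To make the ownership information easier to act on, you can invert the groups portion of this response into a model-to-owner lookup. Below is a minimal sketch that assumes the JSON body of the query above is loaded into a `result` dictionary; `MODEL_NAME` is a placeholder.

```python
owners = {}
for edge in result["data"]["environment"]["definition"]["groups"]["edges"]:
    group = edge["node"]
    for model in group["models"]:
        owners[model["name"]] = {
            "group": group["name"],
            "owner_name": group["ownerName"],
            "owner_email": group["ownerEmail"],
        }

print(owners.get("MODEL_NAME"))  # who to contact about this model
```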
### Who can use this model? -You can enable users the ability to specify the level of access for a given model. In the future, public models will function like APIs to unify project lineage and enable reuse of models using cross-project refs. +You can let people specify the level of access for a given model. In the future, public models will function like APIs to unify project lineage and enable reuse of models using cross-project refs.
    -Example query +Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - definition { - models(first: $first) { - edges { - node { - name - access - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + models(first: $first) { + edges { + node { + name + access + } + } + } + } + } } +``` --- -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - definition { - models(first: $first, filters:{access:public}) { - edges { - node { - name - } - } - } - } - } + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + models(first: $first, filter: { access: public }) { + edges { + node { + name + } + } + } + } + } } ```
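
If you want a quick view of how access is distributed across the project, you can count models by access level from the first query's response. Below is a minimal sketch that assumes the JSON body of that response is loaded into a `result` dictionary.

```python
from collections import Counter

edges = result["data"]["environment"]["definition"]["models"]["edges"]
access_counts = Counter(edge["node"]["access"] for edge in edges)
print(access_counts)  # for example: Counter({'protected': 40, 'public': 8, 'private': 2})
```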
    -## Development +## Development You can use the Discovery API to understand dataset changes and usage and gauge impacts to inform project definition. Below are example questions and queries you can run. For development use cases, people typically query the historical or latest definition or applied state across any part of the DAG using the `environment` endpoint. ### How is this model or metric used in downstream tools? -[Exposures](/docs/build/exposures) provide a method to define how a model or metric is actually used in dashboards and other analytics tools and use cases. You can query an exposure’s definition to see how project nodes are used and query its upstream lineage results to understand the state of the data used in it, which powers use cases like a freshness and quality status tile. +[Exposures](/docs/build/exposures) provide a method to define how a model or metric is actually used in dashboards and other analytics tools and use cases. You can query an exposure’s definition to see how project nodes are used and query its upstream lineage results to understand the state of the data used in it, which powers use cases like a freshness and quality status tile. @@ -1019,47 +1062,41 @@ For development use cases, people typically query the historical or latest defin
    Example query -This example reviews an exposure and the models used in it, including when they were last executed and their test results: +Below is an example that reviews an exposure and the models used in it including when they were last executed. ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - exposures(first: $first) { - edges { - node { - name - description - owner_name - url - parents { - name - resourceType - ... on ModelAppliedStateNode { - executionInfo { - executeCompletedAt - lastRunStatus - } - tests { - executionInfo { - executeCompletedAt - lastRunStatus - } - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + exposures(first: $first) { + edges { + node { + name + description + ownerName + url + parents { + name + resourceType + ... on ModelAppliedStateNestedNode { + executionInfo { + executeCompletedAt + lastRunStatus + } + } + } + } + } + } + } + } } ```
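
A response like the one above is enough to drive a simple status tile per exposure. Below is a minimal sketch that assumes the JSON body of the exposure query is loaded into a `result` dictionary; it flags an exposure when any upstream model errored on its last run. Non-model parents are skipped because the query only selects `executionInfo` for model nodes.

```python
for edge in result["data"]["environment"]["applied"]["exposures"]["edges"]:
    exposure = edge["node"]
    model_parents = [p for p in exposure["parents"] if "executionInfo" in p]
    has_errors = any(
        p["executionInfo"]["lastRunStatus"] == "error" for p in model_parents
    )
    tile = "needs attention" if has_errors else "ok"
    print(f'{exposure["name"]} (owner: {exposure["ownerName"]}): {tile}')
```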
    -### How has this model changed over time? -The Discovery API provides historical information about any resource in your project. For instance, you can view how a model has evolved over time (across recent runs) given changes to its shape and contents. +### How has this model changed over time? +The Discovery API provides historical information about any resource in your project. For instance, you can view how a model has evolved over time (across recent runs) given changes to its shape and contents.
    Example query @@ -1067,54 +1104,69 @@ The Discovery API provides historical information about any resource in your pro Review the differences in `compiledCode` or `columns` between runs or plot the “Approximate Size” and “Row Count” `stats` over time: ```graphql -query(environmentId: Int!, uniqueId: String!, lastRunCount: Int!, withCatalog: Boolean!){ - modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId, lastRunCount: $lastRunCount, withCatalog: $withCatalog) { - name - compiledCode - columns { - name - } - stats { - label - value - } - } +query ( + $environmentId: BigInt! + $uniqueId: String! + $lastRunCount: Int! + $withCatalog: Boolean! +) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns( + uniqueId: $uniqueId + lastRunCount: $lastRunCount + withCatalog: $withCatalog + ) { + name + compiledCode + columns { + name + } + stats { + label + value + } + } + } + } } ```
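
To plot a stat such as "Row Count" over recent runs, you can pull the value out of each run's `stats` list. Below is a minimal sketch that assumes the JSON body of the query above is loaded into a `result` dictionary, that `runGeneratedAt` is also selected so each run can be placed on the x-axis, and that your warehouse adapter reports a stat labeled `Row Count` in the catalog.

```python
import matplotlib.pyplot as plt

runs = result["data"]["environment"]["applied"]["modelHistoricalRuns"]

points = []
for run in runs:
    row_count = next((s["value"] for s in run["stats"] if s["label"] == "Row Count"), None)
    if row_count is not None:
        points.append((run["runGeneratedAt"], float(row_count)))

points.sort()  # order runs chronologically
plt.plot([generated_at for generated_at, _ in points], [count for _, count in points])
plt.title(f'{runs[0]["name"]} row count across recent runs')
plt.xticks(rotation=45)
plt.show()
```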
### Which nodes depend on this data source? + dbt lineage begins with data sources. For a given source, you can look at which nodes are its children, then iterate downstream to get the full list of dependencies. +Currently, querying beyond one generation (a direct parent-to-child relationship) is not supported. To see the grandchildren of a node, you need to make two queries: one to get the node and its children, and another to get those children and their children. The sketch after the example query below shows one way to walk the lineage level by level.
    Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - sources(first: $first, filter:{uniqueIds:["SOURCE_NAME.TABLE_NAME"]}) { - edges { - node { - loader - children { - uniqueId - resourceType - ... on ModelAppliedStateNode { - database - schema - alias - children { - uniqueId - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + sources( + first: $first + filter: { uniqueIds: ["SOURCE_NAME.TABLE_NAME"] } + ) { + edges { + node { + loader + children { + uniqueId + resourceType + ... on ModelAppliedStateNestedNode { + database + schema + alias + } + } + } + } + } + } + } } ```
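
Because each query returns only one generation of children, walking the full downstream lineage means repeating a query like the one above for each new set of nodes. Below is a minimal sketch of that loop; `get_children` is a placeholder you would implement with a Discovery API call (such as the query above) that returns the child `uniqueId` values for a given node.

```python
from collections import deque

def get_children(unique_id):
    """Placeholder: query the Discovery API for the direct children of this node
    and return their uniqueId values."""
    raise NotImplementedError

def downstream_nodes(source_unique_id):
    # Breadth-first walk: one generation (and therefore one API call) at a time.
    seen = set()
    queue = deque([source_unique_id])
    while queue:
        node = queue.popleft()
        for child in get_children(node):
            if child not in seen:
                seen.add(child)
                queue.append(child)
    return seen

# downstream = downstream_nodes("SOURCE_NAME.TABLE_NAME")
```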
    diff --git a/website/docs/docs/dbt-cloud-apis/project-state.md b/website/docs/docs/dbt-cloud-apis/project-state.md index a5ee71ebb1b..62136b35463 100644 --- a/website/docs/docs/dbt-cloud-apis/project-state.md +++ b/website/docs/docs/dbt-cloud-apis/project-state.md @@ -66,7 +66,7 @@ Most Discovery API use cases will favor the _applied state_ since it pertains to | Seed | Yes | Yes | Yes | Downstream | Applied & definition | | Snapshot | Yes | Yes | Yes | Upstream & downstream | Applied & definition | | Test | Yes | Yes | No | Upstream | Applied & definition | -| Exposure | No | No | No | Upstream | Applied & definition | +| Exposure | No | No | No | Upstream | Definition | | Metric | No | No | No | Upstream & downstream | Definition | | Semantic model | No | No | No | Upstream & downstream | Definition | | Group | No | No | No | Downstream | Definition | diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx new file mode 100644 index 00000000000..d1463f9e9b7 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx @@ -0,0 +1,50 @@ +--- +title: "Model Historical Runs object schema" +sidebar_label: "Model historical runs" +id: "discovery-schema-environment-applied-modelHistoricalRuns" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The model historical runs object allows you to query information about a model's run history. + +The [Example query](#example-query) illustrates a few fields you can query with the `modelHistoricalRuns` object. Refer to [Fields](#fields) to view the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `modelHistoricalRuns`, you can use the following arguments: + + + +### Example query + +You can use the `environmentId` and the model's `uniqueId` to return the model and its execution time for the last 20 times it was run, regardless of which job ran it. + +```graphql +query { + environment(id: 834) { + applied { + modelHistoricalRuns( + uniqueId: "model.marketing.customers" + lastRunCount: 20 + ) { + runId # Get historical results for a particular model + runGeneratedAt + executionTime # View build time across runs + status + tests { + name + status + executeCompletedAt + } # View test results across runs + } + } + } +} +``` + +### Fields + +When querying for `modelHistoricalRuns`, you can use the following fields: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx index 41fd5555c3f..a82bba6576d 100644 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx @@ -4,28 +4,34 @@ sidebar_label: "Environment" id: "discovery-schema-environment" --- -import { ArgsTable, SchemaTable } from "./schema"; +import { QueryArgsTable, SchemaTable } from "./schema"; -This environment object allows you to query information about a particular model based on `environmentId`. +The environment object allows you to query information about a particular model based on `environmentId`. -The [example query](#example-query) illustrates a few fields you can query in this `environment` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. 
+The [Example queries](#example-queries) illustrate a few fields you can query with this `environment` object. Refer to [Fields](#fields) to view the entire schema, which provides all possible fields you can query. ### Arguments When querying for `environment`, you can use the following arguments. - + +:::caution -### Example Query +dbt Labs is making changes to the Discovery API. These changes will take effect on August 15, 2023. -You can use your production environment's `id`: +The data type `Int` for `id` is being deprecated and will be replaced with `BigInt`. When the time comes, you will need to update your API call accordingly to avoid errors. +::: + +### Example queries + +You can use your production environment's `id`: ```graphql query Example { - environment(id: 834){ # Get the latest state of the production environment + environment(id: 834){ # Get the latest state of the production environment applied { # The state of an executed node as it exists as an object in the database models(first: 100){ # Pagination to ensure manageable response for large projects edges { node { @@ -34,8 +40,8 @@ query Example { executionInfo {executeCompletedAt, executionTime}, # Metadata from when the model was built tests {name, executionInfo{lastRunStatus, lastRunError}}, # Latest test results catalog {columns {name, description, type}, stats {label, value}}, # Catalog info - ancestors(types:[Source]) {name, ...on SourceAppliedStateNode {freshness{maxLoadedAt, freshnessStatus}}}, # Source freshness } - children {name, resourceType}}} # Immediate dependencies in lineage + ancestors(types:[Source]) {name, ...on SourceAppliedStateNode {freshness{maxLoadedAt, freshnessStatus}}}, # Source freshness } + children {name, resourceType}}} # Immediate dependencies in lineage totalCount } # Number of models in the project } definition { # The logical state of a given project node given its most recent manifest generated @@ -48,12 +54,50 @@ query Example { } ``` +With the deprecation of the data type `Int` for `id`, below is an example of replacing it with `BigInt`: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} + +``` + +With the deprecation of `modelByEnvironment`, below is an example of replacing it with `environment`: + +```graphql +query ($environmentId: BigInt!, $uniqueId: String) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } + } + } +} +``` + ### Fields When querying an `environment`, you can use the following fields. -When querying the `applied` field of `environment`, you can use the following fields. +When querying the `applied` field of `environment`, you can use the following fields. When querying the `definition` field of `environment`, you can use the following fields. diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposure.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-exposure.mdx deleted file mode 100644 index d74f12223c5..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposure.mdx +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: "Exposure object schema" -sidebar_label: "Exposure" -id: "discovery-schema-exposure" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The exposure object allows you to query information about a particular exposure. 
You can learn more about exposures [here](/docs/build/exposures). - -### Arguments - -When querying for an `exposure`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this exposure object. - -### Example Queries -#### Exposure information - -The example query below queries information about an exposure, including the owner's name and email, the url, and information about parent sources and parent models. - -```graphql -{ - exposure(jobId: 123, name: "my_awesome_exposure") { - runId - projectId - name - uniqueId - resourceType - ownerName - url - ownerEmail - parentsSources { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } - parentsModels { - uniqueId - } - } -} -``` - -### Fields -When querying for an `exposure`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposures.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-exposures.mdx deleted file mode 100644 index 5e3dcdd45a9..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposures.mdx +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: "Exposures object schema" -sidebar_label: "Exposures" -id: "discovery-schema-exposures" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The exposures object allows you to query information about all exposures in a given job. You can learn more about exposures [here](/docs/build/exposures). - -### Arguments - -When querying for `exposures`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this exposures object. - -### Example Queries -#### Exposures information - -The example query below queries information about all exposures in a given job, including, for each exposure, the owner's name and email, the url, and information about parent sources and parent models. - -```graphql -{ - exposures(jobId: 123) { - runId - projectId - name - uniqueId - resourceType - ownerName - url - ownerEmail - parentsSources { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } - parentsModels { - uniqueId - } - } -} -``` - -### Fields -When querying for `exposures`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx new file mode 100644 index 00000000000..58855659d05 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx @@ -0,0 +1,64 @@ +--- +title: "Exposure object schema" +sidebar_label: "Exposure" +id: "discovery-schema-job-exposure" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The exposure object allows you to query information about a particular exposure. To learn more, refer to [Add Exposures to your DAG](/docs/build/exposures). + +### Arguments + +When querying for an `exposure`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the exposure object. 
+ +### Example query + +The example below queries information about an exposure including the owner's name and email, the URL, and information about parent sources and parent models. + +```graphql +{ + job(id: 123) { + exposure(name: "my_awesome_exposure") { + runId + projectId + name + uniqueId + resourceType + ownerName + url + ownerEmail + parentsSources { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + parentsModels { + uniqueId + } + } + } +} +``` + +### Fields +When querying for an `exposure`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx new file mode 100644 index 00000000000..b4fe027e324 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx @@ -0,0 +1,65 @@ +--- +title: "Exposures object schema" +sidebar_label: "Exposures" +id: "discovery-schema-job-exposures" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The exposures object allows you to query information about all exposures in a given job. To learn more, refer to [Add Exposures to your DAG](/docs/build/exposures). + + +### Arguments + +When querying for `exposures`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the exposures object. + +### Example query + +The example below queries information about all exposures in a given job including the owner's name and email, the URL, and information about parent sources and parent models for each exposure. + +```graphql +{ + job(id: 123) { + exposures(jobId: 123) { + runId + projectId + name + uniqueId + resourceType + ownerName + url + ownerEmail + parentsSources { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + parentsModels { + uniqueId + } + } + } +} +``` + +### Fields +When querying for `exposures`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx new file mode 100644 index 00000000000..3a8a52a19cb --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx @@ -0,0 +1,58 @@ +--- +title: "Metric object schema" +sidebar_label: "Metric" +id: "discovery-schema-job-metric" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The metric object allows you to query information about [metrics](/docs/build/metrics). + +### Arguments + +When querying for a `metric`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the metric object. + +### Example query + +The example query below outputs information about a metric. You can also add any field from the model endpoint (the example simply selects name). This includes schema, database, uniqueId, columns, and more. For details, refer to [Model object schema](/docs/dbt-cloud-apis/discovery-schema-job-model). 
+ + +```graphql +{ + job(id: 123) { + metric(uniqueId: "metric.jaffle_shop.new_customers") { + uniqueId + name + packageName + tags + label + runId + description + type + sql + timestamp + timeGrains + dimensions + meta + resourceType + filters { + field + operator + value + } + model { + name + } + } + } +} +``` + +### Fields +When querying for a `metric`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx new file mode 100644 index 00000000000..174dd5b676a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx @@ -0,0 +1,60 @@ +--- +title: "Metrics object schema" +sidebar_label: "Metrics" +id: "discovery-schema-job-metrics" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The metrics object allows you to query information about [metrics](/docs/build/metrics). + + +### Arguments + +When querying for `metrics`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the metrics object. + +### Example query + +The example query returns information about all metrics for the given job. + +```graphql +{ + job(id: 123) { + metrics { + uniqueId + name + packageName + tags + label + runId + description + type + sql + timestamp + timeGrains + dimensions + meta + resourceType + filters { + field + operator + value + } + model { + name + } + } + } +} +``` + +### Fields +The metrics object can access the _same fields_ as the [metric node](/docs/dbt-cloud-apis/discovery-schema-job-metric). The difference is that the metrics object can output a list so instead of querying for fields for one specific metric, you can query for those parameters for all metrics in a run. + +When querying for `metrics`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx new file mode 100644 index 00000000000..abd1ca1b1d6 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx @@ -0,0 +1,91 @@ +--- +title: "Model object schema" +sidebar_label: "Model" +id: "discovery-schema-job-model" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The model object allows you to query information about a particular model in a given job. + +### Arguments + +When querying for a `model`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the model object. + +### Example query for finding parent models and sources + +The example query below uses the `parentsModels` and `parentsSources` fields to fetch information about a model’s parent models and parent sources. The jobID and uniqueID fields are placeholders that you will need to replace with your own values. + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + parentsModels { + runId + uniqueId + executionTime + } + parentsSources { + runId + uniqueId + state + } + } + } +} + +``` + +### Example query for model timing + +The example query below could be useful if you want to understand information around execution timing on a given model (start, end, completion). 
+ +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + runId + projectId + name + uniqueId + resourceType + executeStartedAt + executeCompletedAt + executionTime + } + } +} +``` + +### Example query for column-level information + +You can use the following example query to understand more about the columns of a given model. This query will only work if the job has generated documentation; that is, it will work with the command `dbt docs generate`. + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + columns { + name + index + type + comment + description + tags + meta + } + } + } +} +``` + + +### Fields + +When querying for a `model`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx new file mode 100644 index 00000000000..ee512f3cd97 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx @@ -0,0 +1,59 @@ +--- +title: "Models object schema" +sidebar_label: "Models" +id: "discovery-schema-job-models" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + + +The models object allows you to query information about all models in a given job. + +### Arguments + +When querying for `models`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the models object. + +### Example queries +The database, schema, and identifier arguments are all optional. This means that with this endpoint you can: + +- Find a specific model by providing `..` +- Find all of the models in a database and/or schema by providing `` and/or `` + +#### Find models by their database, schema, and identifier +The example query below finds a model by its unique database, schema, and identifier. + +```graphql +{ + job(id: 123) { + models(database:"analytics", schema: "analytics", identifier:"dim_customers") { + uniqueId + } + } +} +``` + +#### Find models by their schema +The example query below finds all models in this schema and their respective execution times. + +```graphql +{ + job(id: 123) { + models(schema: "analytics") { + uniqueId + executionTime + } + } +} +``` + + +### Fields +The models object can access the _same fields_ as the [Model node](/docs/dbt-cloud-apis/discovery-schema-job-model). The difference is that the models object can output a list so instead of querying for fields for one specific model, you can query for those parameters for all models within a jobID, database, and so on. + +When querying for `models`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx new file mode 100644 index 00000000000..924e3e87e91 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx @@ -0,0 +1,42 @@ +--- +title: "Seed object schema" +sidebar_label: "Seed" +id: "discovery-schema-job-seed" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The seed object allows you to query information about a particular seed in a given job. + +### Arguments + +When querying for a `seed`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the seed object. + +### Example query + +The example query below pulls relevant information about a given seed. For instance, you can view the load time. 
+ +```graphql +{ + job(id: 123) { + seed(uniqueId: "seed.jaffle_shop.raw_customers") { + database + schema + uniqueId + name + status + error + } + } +} +``` + +### Fields + +When querying for a `seed`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx new file mode 100644 index 00000000000..6ed45216e5f --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx @@ -0,0 +1,40 @@ +--- +title: "Seeds object schema" +sidebar_label: "Seeds" +id: "discovery-schema-job-seeds" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The seeds object allows you to query information about all seeds in a given job. + +### Arguments + +When querying for `seeds`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the seeds object. + +### Example query + +The example query below pulls relevant information about all seeds in a given job. For instance, you can view load times. + +```graphql +{ + job(id: 123) { + seeds { + uniqueId + name + executionTime + status + } + } +} +``` + +### Fields + +When querying for `seeds`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx new file mode 100644 index 00000000000..a57163e0554 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx @@ -0,0 +1,49 @@ +--- +title: "Snapshots object schema" +sidebar_label: "Snapshots" +id: "discovery-schema-job-snapshots" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The snapshots object allows you to query information about all snapshots in a given job. + +### Arguments + +When querying for `snapshots`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the snapshots object. + +### Example query + +The database, schema, and identifier arguments are optional. This means that with this endpoint you can: + +- Find a specific snapshot by providing `..` +- Find all of the snapshots in a database and/or schema by providing `` and/or `` + +#### Find snapshots information for a job + +The example query returns information about all snapshots in this job. + +```graphql +{ + job(id: 123) { + snapshots { + uniqueId + name + executionTime + environmentId + executeStartedAt + executeCompletedAt + } + } +} +``` + +### Fields + +When querying for `snapshots`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx new file mode 100644 index 00000000000..972e929f4cd --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx @@ -0,0 +1,52 @@ +--- +title: "Source object schema" +sidebar_label: "Source" +id: "discovery-schema-job-source" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The source object allows you to query information about a particular source in a given job. + +### Arguments + +When querying for a `source`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the source object. + +### Example query + +The query below pulls relevant information about a given source. 
For instance, you can view the load time and the state (pass, fail, error) of that source. + +```graphql +{ + job(id: 123) { + source(uniqueId: "source.jaffle_shop.snowplow.event") { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + } +} +``` + +### Fields + +When querying for a `source`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx new file mode 100644 index 00000000000..97f717d269a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx @@ -0,0 +1,65 @@ +--- +title: "Sources object schema" +sidebar_label: "Sources" +id: "discovery-schema-job-sources" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The sources object allows you to query information about all sources in a given job. + +### Arguments + +When querying for `sources`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the sources object. + +### Example queries + +The database, schema, and identifier arguments are optional. This means that with this endpoint you can: + +- Find a specific source by providing `..` +- Find all of the sources in a database and/or schema by providing `` and/or `` + +#### Finding sources by their database, schema, and identifier + +The example query below finds a source by its unique database, schema, and identifier. + +```graphql +{ + job(id: 123) { + sources( + database: "analytics" + schema: "analytics" + identifier: "dim_customers" + ) { + uniqueId + } + } +} +``` + +#### Finding sources by their schema + +The example query below finds all sources in this schema and their respective states (pass, error, fail). + +```graphql +{ + job(id: 123) { + sources(schema: "analytics") { + uniqueId + state + } + } +} +``` + +### Fields + +The sources object can access the _same fields_ as the [source node](/docs/dbt-cloud-apis/discovery-schema-job-source). The difference is that the sources object can output a list so instead of querying for fields for one specific source, you can query for those parameters for all sources within a jobID, database, and so on. + +When querying for `sources`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx new file mode 100644 index 00000000000..c52aa49ab93 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx @@ -0,0 +1,43 @@ +--- +title: "Test object schema" +sidebar_label: "Test" +id: "discovery-schema-job-test" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The test object allows you to query information about a particular test. + +### Arguments + +When querying for a `test`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the test object. + +### Example query + +The example query below outputs information about a test including the state of the test result. In order of severity, the result can be one of these: "error", "fail", "warn", or "pass". 
+ +```graphql +{ + job(id: 123) { + test(uniqueId: "test.internal_analytics.not_null_metrics_id") { + runId + accountId + projectId + uniqueId + name + columnName + state + } + } +} +``` + +### Fields + +When querying for a `test`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx new file mode 100644 index 00000000000..efcef674c55 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx @@ -0,0 +1,43 @@ +--- +title: "Tests object schema" +sidebar_label: "Tests" +id: "discovery-schema-job-tests" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The tests object allows you to query information about all tests in a given job. + +### Arguments + +When querying for `tests`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the tests object. + +### Example query + +The example query below finds all tests in this job and includes information about those tests. + +```graphql +{ + job(id: 123) { + tests { + runId + accountId + projectId + uniqueId + name + columnName + state + } + } +} +``` + +### Fields + +When querying for `tests`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx new file mode 100644 index 00000000000..8b02c5601ad --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx @@ -0,0 +1,64 @@ +--- +title: "Job object schema" +sidebar_label: "Job" +id: "discovery-schema-job" +pagination_next: "docs/dbt-cloud-apis/discovery-schema-job-model" +pagination_prev: null +--- + +import { QueryArgsTable, SchemaTable } from "./schema"; + +The job object allows you to query information about a particular model based on `jobId` and, optionally, a `runId`. + +If you don't provide a `runId`, the API returns information on the latest runId of a job. + +The [example query](#example-query) illustrates a few fields you can query in this `job` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `job`, you can use the following arguments. + + + + +### Example Query + +You can use your production job's `id`. + +```graphql +query JobQueryExample { + # Provide runId for looking at specific run, otherwise it defaults to latest run + job(id: 940) { + # Get all models from this job's latest run + models(schema: "analytics") { + uniqueId + executionTime + } + + # Or query a single node + source(uniqueId: "source.jaffle_shop.snowplow.event") { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + } +} +``` + +### Fields +When querying an `job`, you can use the following fields. 
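For context on how these job queries are used in practice, the following minimal Python sketch posts the models example from above to the Discovery API endpoint (`https://metadata.cloud.getdbt.com/graphql`) using a service token with Metadata Only permissions. The job ID, schema name, and token are illustrative placeholders; replace them with values from your own account.

```python
# Minimal sketch: send a job query to the Discovery API over HTTP.
# Assumes the `requests` library and a service token with "Metadata Only" permissions.
import requests

url = "https://metadata.cloud.getdbt.com/graphql"
headers = {
    "Authorization": "Bearer <SERVICE_TOKEN>",  # placeholder service token
    "Content-Type": "application/json",
}

query = """
{
  job(id: 123) {
    models(schema: "analytics") {
      uniqueId
      executionTime
    }
  }
}
"""

response = requests.post(url, json={"query": query}, headers=headers)
response.raise_for_status()
print(response.json()["data"]["job"]["models"])
```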
+ + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-metric.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-metric.mdx deleted file mode 100644 index 2280c6f7802..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-metric.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Metric object schema" -sidebar_label: "Metric" -id: "discovery-schema-metric" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The metric object allows you to query information about [metrics](/docs/build/metrics). - -### Arguments - -When querying for a `metric`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this metric object. - -### Example Queries -#### Metric information - -The example query below outputs information about a metric. Note that you can also add any field from the Model endpoint -- here we are simply selecting name. This includes schema, database, uniqueId, columns and more -- find documentation [here](/docs/dbt-cloud-apis/discovery-schema-model). - - -```graphql -{ - metric(jobId: 123, uniqueId: "metric.jaffle_shop.new_customers") { - uniqueId - name - packageName - tags - label - runId - description - type - sql - timestamp - timeGrains - dimensions - meta - resourceType - filters { - field - operator - value - } - model { - name - } - } -} - -``` - -### Fields -When querying for a `metric`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-metrics.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-metrics.mdx deleted file mode 100644 index 5242eb717dc..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-metrics.mdx +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: "Metrics object schema" -sidebar_label: "Metrics" -id: "discovery-schema-metrics" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The metrics object allows you to query information about [metrics](/docs/build/metrics). - -### Arguments - -When querying for `metrics`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this metrics object. - -### Example Queries -#### Metrics information - -The example query returns information about all metrics in this job. - -```graphql -{ - metrics(jobId: 123) { - uniqueId - name - packageName - tags - label - runId - description - type - sql - timestamp - timeGrains - dimensions - meta - resourceType - filters { - field - operator - value - } - model { - name - } - } -} - -``` - -### Fields -metrics has access to the *same fields* as the [metric node](/docs/dbt-cloud-apis/discovery-schema-metric). The difference is that metrics can output a list, so instead of querying for fields for one specific metric, you can query for those parameters for all metrics in a run. 
- -When querying for `metrics`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-model.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-model.mdx deleted file mode 100644 index 3fb43edaded..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-model.mdx +++ /dev/null @@ -1,84 +0,0 @@ ---- -title: "Model object schema" -sidebar_label: "Model" -id: "discovery-schema-model" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The model object allows you to query information about a particular model in a given job. - -### Arguments - -When querying for a `model`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this model object. - -### Example Queries -#### Finding parent models and sources - -The example query below uses the `parentsModels` and `parentsSources` fields to fetch information about a model’s parent models and parent sources. Note that we put a placeholder jobID and uniqueID, which you will have to replace. - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - parentsModels { - runId - uniqueId - executionTime - } - parentsSources { - runId - uniqueId - state - } - } -} -``` - -#### Model Timing - -The example query below could be useful if we wanted to understand information around execution timing on a given model (start, end, completion). - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - runId - projectId - name - uniqueId - resourceType - executeStartedAt - executeCompletedAt - executionTime - } -} -``` - -#### Column-level information - -You can use the following example query to understand more about the columns of a given model. Note that this will only work if the job has generated documentation. For example it will work with the command `dbt docs generate`. - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - columns{ - name - index - type - comment - description - tags - meta - } - } -} -``` - - -### Fields -When querying for a `model`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-modelByEnv.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-modelByEnv.mdx deleted file mode 100644 index 078d2512256..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-modelByEnv.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Model by environment object schema" -sidebar_label: "Model by environment" -id: "discovery-schema-modelByEnv" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - - -This model by environment object allows you to query information about a particular model based on `environmentId`. - -The [example query](#example-query) illustrates a few fields you can query in this `modelByEnvironment` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. - -### Arguments - -When querying for `modelByEnvironment`, you can use the following arguments. - - - - -### Example Query - -You can use the `environment_id` and `model_unique_id` to return the model and its execution time for the last 20 times it was run, regardless of which job ran it. 
- - -```graphql -query{ - modelByEnvironment(environmentId: 834, uniqueId: "model.marketing.customers", lastRunCount: 20) { - runId, # Get historical results for a particular model - runGeneratedAt, - executionTime, # View build time across runs - status, - tests { name, status, executeCompletedAt } # View test results across runs - } -} -``` - -### Fields -When querying for `modelByEnvironment`, you can use the following fields. - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-models.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-models.mdx deleted file mode 100644 index a3215eee039..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-models.mdx +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Models object schema" -sidebar_label: "Models" -id: "discovery-schema-models" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - -The models object allows you to query information about all models in a given job. - -### Arguments - -When querying for `models`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this models object. - -### Example Queries -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific model by providing `..` -- Find all of the models in a database and/or schema by providing `` and/or `` - -#### Finding models by their database, schema, and identifier -The example query below finds a model by its unique database, schema, and identifier. - -```graphql -{ - models(jobId: 123, database:"analytics", schema: "analytics", identifier:"dim_customers") { - uniqueId - } -} -``` - -#### Finding models by their schema -The example query below finds all models in this schema, and their respective execution times. - -```graphql -{ - models(jobId: 123, schema: "analytics") { - uniqueId - executionTime - } -} -``` - - -### Fields -Models has access to the *same fields* as the [Model node](/docs/dbt-cloud-apis/discovery-schema-model). The difference is that Models can output a list, so instead of querying for fields for one specific model, you can query for those parameters for all models within a jobID, database, etc. - -When querying for `models`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-seed.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-seed.mdx deleted file mode 100644 index 1047545a8be..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-seed.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Seed object schema" -sidebar_label: "Seed" -id: "discovery-schema-seed" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The seed object allows you to query information about a particular seed in a given job. - -### Arguments - -When querying for a `seed`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this seed object. - -### Example Queries - -#### Seed information - -The query below pulls relevant information about a given seed. For example, we could see the load time. 
- -```graphql -{ - seed(jobId: 123, uniqueId: "seed.jaffle_shop.raw_customers") { - database - schema - uniqueId - name - status - error - } -} -``` - -### Fields - -When querying for a `seed`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-seeds.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-seeds.mdx deleted file mode 100644 index 2cee2b8aa3f..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-seeds.mdx +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: "Seeds object schema" -sidebar_label: "Seeds" -id: "discovery-schema-seeds" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The seeds object allows you to query information about a all seeds in a given job. - -### Arguments - -When querying for `seeds`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this seeds object. - -### Example Queries -#### Seeds information - -The query below pulls relevant information about all seeds in a given job. For example, we could see the load times. - -```graphql -{ - seeds(jobId: 123) { - uniqueId - name - executionTime - status - } -} -``` - -### Fields - -When querying for `seeds`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-snapshots.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-snapshots.mdx deleted file mode 100644 index b3f7071319f..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-snapshots.mdx +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: "Snapshots object schema" -sidebar_label: "Snapshots" -id: "discovery-schema-snapshots" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The snapshots object allows you to query information about all snapshots in a given job. - -### Arguments - -When querying for `snapshots`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this snapshots object. - -### Example Query -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific snapshot by providing `..` -- Find all of the snapshots in a database and/or schema by providing `` and/or `` - -#### Finding snapshots information for a job -The example query returns information about all snapshots in this job. - -```graphql -{ - snapshots(jobId: 123) { - uniqueId - name - executionTime - environmentId - executeStartedAt - executeCompletedAt - } -} - -``` - -### Fields -Snapshots has access to the *same fields* as the [Snapshot node](/docs/dbt-cloud-apis/discovery-schema-snapshots). The difference is that Snapshots can output a list, so instead of querying for fields for one specific snapshot, you can query for those parameters for all snapshots within a jobID, database, etc. 
- -When querying for `snapshots`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-source.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-source.mdx deleted file mode 100644 index 87d776282fe..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-source.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Source object schema" -sidebar_label: "Source" -id: "discovery-schema-source" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The source object allows you to query information about a particular source in a given job. - -### Arguments - -When querying for a `source`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this source object. - -### Example Queries - -#### Source information - -The query below pulls relevant information about a given source. For example, we could see the load time and the state (“pass”, “fail”, “error”) of that source. - -```graphql -{ - source(jobId: 123, uniqueId: "source.jaffle_shop.snowplow.event") { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } -} -``` - -### Fields - -When querying for a `source`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-sources.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-sources.mdx deleted file mode 100644 index a719c5caf92..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-sources.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: "Sources object schema" -sidebar_label: "Sources" -id: "discovery-schema-sources" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - -The sources object allows you to query information about all sources in a given job. - -### Arguments - -When querying for `sources`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this sources object. - -### Example Queries -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific source by providing `..` -- Find all of the sources in a database and/or schema by providing `` and/or `` - -#### Finding sources by their database, schema, and identifier -The example query below finds a source by its unique database, schema, and identifier. - -```graphql -{ - sources(jobId: 123, database:"analytics", schema: "analytics", identifier:"dim_customers") { - uniqueId - } -} -``` - -#### Finding sources by their schema -The example query below finds all sources in this schema, and their respective states (pass, error, fail). - -```graphql -{ - sources(jobId: 123, schema: "analytics") { - uniqueId - state - } -} -``` - -### Fields -Sources has access to the *same fields* as the [Source node](/docs/dbt-cloud-apis/discovery-schema-source). The difference is that Sources can output a list, so instead of querying for fields for one specific source, you can query for those parameters for all sources within a jobID, database, etc. 
- -When querying for `sources`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-test.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-test.mdx deleted file mode 100644 index 2ee915d27c7..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-test.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Test object schema" -sidebar_label: "Test" -id: "discovery-schema-test" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The test object allows you to query information about a particular test. - -### Arguments - -When querying for a `test`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this test object. - -### Example Queries -#### Test result - -The example query below outputs information about a test, including the state of the test result. This can be one of, in order of severity, "error", "fail", "warn", "pass." - -```graphql -{ - test(jobId: 123, uniqueId: "test.internal_analytics.not_null_metrics_id") { - runId - accountId - projectId - uniqueId - name - columnName - state - } -} -``` - -### Fields -When querying for a `test`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-tests.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-tests.mdx deleted file mode 100644 index 7f087c85fee..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-tests.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Tests object schema" -sidebar_label: "Tests" -id: "discovery-schema-tests" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The tests object allows you to query information about all tests in a given job. - - -### Arguments - -When querying for `tests`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this tests object. - -### Example Queries -#### Tests result - -The example query below finds all tests in this job, and includes information about those tests. 
- -```graphql -{ - tests(jobId: 123) { - runId - accountId - projectId - uniqueId - name - columnName - state - } -} -``` - -### Fields -When querying for `tests`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema.jsx b/website/docs/docs/dbt-cloud-apis/schema.jsx index 8b9bbc358f0..31568671573 100644 --- a/website/docs/docs/dbt-cloud-apis/schema.jsx +++ b/website/docs/docs/dbt-cloud-apis/schema.jsx @@ -1,9 +1,55 @@ -import React, { setState } from "react"; +import React from "react"; import { useState, useEffect } from 'react' -const queriesQuery = `{ + +const getTypeString = (typeStructure) => { + // Helper function to represent GraphQL type + if (!typeStructure) return '' + + if (typeStructure.kind === 'NON_NULL') { + return `${getTypeString(typeStructure.ofType)}!`; + } else if (typeStructure.kind === 'LIST') { + return `[${getTypeString(typeStructure.ofType)}]`; + } else if (['OBJECT', 'SCALAR', 'ENUM'].includes(typeStructure.kind)) { + return `${typeStructure.name}${getTypeString(typeStructure.ofType)}`; + } else { + return ''; + } +} + +export const ArgsTable = ({ data, name }) => { + return ( + + + + + + + + + + + {data.fields.find(d => d.name === name).args.map(function ({ name, description, type }) { + return ( + + + + + + + ) + })} + +
    FieldTypeRequired?Description
    {name}{getTypeString(type)}{type.kind === 'NON_NULL' ? `Yes` : `No`}{description || `No description provided`}
    + ) +} + +const metadataUrl = 'https://metadata.cloud.getdbt.com/graphql' +const metadataBetaUrl = 'https://metadata.cloud.getdbt.com/beta/graphql' + +const queryArgsQuery = `{ __schema { queryType { - fields { + fields(includeDeprecated: true) { name type { name @@ -18,23 +64,22 @@ const queriesQuery = `{ name description kind - ofType { name description } + ofType { kind name description } } } } } } }` -const metadataUrl = 'https://metadata.cloud.getdbt.com/graphql' -const metadataBetaUrl = 'https://metadata.cloud.getdbt.com/beta/graphql' -export const ArgsTable = ({ queryName, useBetaAPI }) => { + +export const QueryArgsTable = ({ queryName, useBetaAPI }) => { const [data, setData] = useState(null) useEffect(() => { const fetchData = () => { fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query: queriesQuery }), + body: JSON.stringify({ query: queryArgsQuery }), }) .then((result) => result.json()) .then((data) => setData(data)) @@ -45,33 +90,89 @@ export const ArgsTable = ({ queryName, useBetaAPI }) => { return

    Fetching data...

    } return ( - - - - - - - - - - - {data.data.__schema.queryType.fields.find(d => d.name === queryName).args.map(function ({ name, description, type }) { - return ( - - - {type.ofType ? - : - + + ) +} + +export const NodeArgsTable = ({ parent, name, useBetaAPI }) => { + const [data, setData] = useState(null) + useEffect(() => { + const fetchData = () => { + fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query: ` + query { + __type(name: "${parent}") { + ...FullType + } + } + + fragment FullType on __Type { + kind + fields(includeDeprecated: true) { + name + description + args { + name + description + defaultValue + type { + ...TypeRef + } } - - - - ) - })} - -
    FieldTypeRequired?Description
    {name}{type.ofType.name}{type.name}{type.kind === 'NON_NULL' ? `Yes` : `No`}{description || `No description provided`}
    + } + } + + # get several levels + fragment TypeRef on __Type { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + } + } + } + } + } + } + } + } + `}) + }) + .then((result) => result.json()) + .then((data) => setData(data)) + } + fetchData() + }, []) + if (!data) { + return

    Fetching data...

    + } + return ( + ) } + export const SchemaTable = ({ nodeName, useBetaAPI }) => { const [data, setData] = useState(null) useEffect(() => { @@ -80,27 +181,60 @@ export const SchemaTable = ({ nodeName, useBetaAPI }) => { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - query: `{ - __type(name: "${nodeName}") { - fields { + query: ` + query { + __type(name: "${nodeName}") { + ...FullType + } + } + + fragment FullType on __Type { + kind + name + description + fields(includeDeprecated: true) { name description - type { - name - description - kind - ofType { - name - description - ofType { - name - description - } - } + type { + ...TypeRef } } } - }`}), + + # get several levels + fragment TypeRef on __Type { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + } + } + } + } + } + } + } + } + `}), }) .then((result) => result.json()) .then((data) => setData(data)) @@ -124,13 +258,7 @@ export const SchemaTable = ({ nodeName, useBetaAPI }) => { return ( {name} - {type.kind === 'LIST' ? - [{type.ofType.ofType ? type.ofType.ofType.name : type.ofType.name}] : - (type.ofType ? - {type.ofType.name} : - {type.name} - ) - } + {getTypeString(type)} {description} ) @@ -138,4 +266,4 @@ export const SchemaTable = ({ nodeName, useBetaAPI }) => { ) -} \ No newline at end of file +} diff --git a/website/docs/docs/dbt-cloud-apis/service-tokens.md b/website/docs/docs/dbt-cloud-apis/service-tokens.md index 811bfaea29d..9553f48a013 100644 --- a/website/docs/docs/dbt-cloud-apis/service-tokens.md +++ b/website/docs/docs/dbt-cloud-apis/service-tokens.md @@ -9,8 +9,6 @@ If you have service tokens created on or before July 18, 2023, please read [this ::: -## About service tokens - Service account tokens enable you to securely authenticate with the dbt Cloud API by assigning each token a narrow set of permissions that more precisely manages access to the API. While similar to [User API tokens](user-tokens), service account tokens belong to an account rather than a user. You can use service account tokens for system-level integrations that do not run on behalf of any one user. Assign any permission sets available in dbt Cloud to your service account token, which can vary slightly depending on your plan: @@ -20,9 +18,9 @@ You can use service account tokens for system-level integrations that do not run You can assign as many permission sets as needed to one token. For more on permissions sets, see "[Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions)." -## Generating service account tokens +## Generate service account tokens -To make a service token in dbt Cloud, follow these steps: +You can generate service tokens if you have a Developer [license](/docs/cloud/manage-access/seats-and-users) and account admin [permissions](/docs/cloud/manage-access/about-user-access#permission-sets). To create a service token in dbt Cloud, follow these steps: 1. Open the **Account Settings** page by clicking the gear icon on the right-hand side. 2. On the left sidebar, click on **Service Tokens**. @@ -43,6 +41,9 @@ Account Admin service tokens have full `read + write` access to an account, so p **Metadata Only**
    Metadata-only service tokens authorize requests to the Discovery API. +**Semantic Layer Only**
    +Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. + **Job Admin**
    Job admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. @@ -68,6 +69,9 @@ Billing Admin service tokens have certain account-level permissions. For more o **Metadata Only**
    Metadata-only service tokens authorize requests to the Discovery API. +**Semantic Layer Only**
    +Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. + **Job Admin**
    Job Admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. For more on these permissions, see [Job Admin](/docs/cloud/manage-access/enterprise-permissions#job-admin). diff --git a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md new file mode 100644 index 00000000000..3ddbf76d152 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md @@ -0,0 +1,61 @@ +--- +title: "Semantic Layer APIs" +id: sl-api-overview +description: "Integrate and query metrics and dimensions in downstream tools using the Semantic Layer APIs" +tags: [Semantic Layer, API] +hide_table_of_contents: true +pagination_next: "docs/dbt-cloud-apis/sl-jdbc" +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +The rapid growth of different tools in the modern data stack has helped data professionals address the diverse needs of different teams. The downside of this growth is the fragmentation of business logic across teams, tools, and workloads. + +The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) allows you to define metrics in code (with [MetricFlow](/docs/build/about-metricflow)) and dynamically generate and query datasets in downstream tools based on their dbt governed assets, such as metrics and models. Integrating with the dbt Semantic Layer will help organizations that use your product make more efficient and trustworthy decisions with their data. It also helps you to avoid duplicative coding, optimize development workflow, ensure data governance, and guarantee consistency for data consumers. + +You can use the dbt Semantic Layer for a variety of tools and applications of data. Some common use cases are: + +* Business intelligence (BI), reporting, and analytics +* Data quality and monitoring +* Governance and privacy +* Data discovery and cataloging +* Machine learning and data science + + + +import Features from '/snippets/_sl-plan-info.md' + + + +
    + + + + + + + +
    + + diff --git a/website/docs/docs/dbt-cloud-apis/sl-graphql.md b/website/docs/docs/dbt-cloud-apis/sl-graphql.md new file mode 100644 index 00000000000..f73007c9a02 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-graphql.md @@ -0,0 +1,462 @@ +--- +title: "GraphQL" +id: sl-graphql +description: "Integrate and use the GraphQL API to query your metrics." +tags: [Semantic Layer, APIs] +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + + +[GraphQL](https://graphql.org/) (GQL) is an open-source query language for APIs. It offers a more efficient and flexible approach compared to traditional RESTful APIs. + +With GraphQL, users can request specific data using a single query, reducing the need for many server round trips. This improves performance and minimizes network overhead. + +GraphQL has several advantages, such as self-documenting, having a strong typing system, supporting versioning and evolution, enabling rapid development, and having a robust ecosystem. These features make GraphQL a powerful choice for APIs prioritizing flexibility, performance, and developer productivity. + +## dbt Semantic Layer GraphQL API + +The dbt Semantic Layer GraphQL API allows you to explore and query metrics and dimensions. Due to its self-documenting nature, you can explore the calls conveniently through the [schema explorer](https://semantic-layer.cloud.getdbt.com/api/graphql). + +dbt Partners can use the Semantic Layer GraphQL API to build an integration with the dbt Semantic Layer. + +## Requirements to use the GraphQL API +- A dbt Cloud project on dbt v1.6 or higher +- Metrics are defined and configured +- A dbt Cloud [service token](/docs/dbt-cloud-apis/service-tokens) with "Semantic Layer Only” and "Metadata Only" permissions +- Your dbt project is configured and connected to a data platform + + +## Using the GraphQL API + +If you're a dbt user or partner with access to dbt Cloud and the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), you can [setup](/docs/use-dbt-semantic-layer/setup-sl) and test this API with data from your own instance by configuring the Semantic Layer and obtaining the right GQL connection parameters described in this document. + +Refer to [Get started with the dbt Semantic Layer](docs/use-dbt-semantic-layer/quickstart-sl) for more info. + + +### Authentication + +Authentication uses a dbt Cloud [service account tokens](/docs/dbt-cloud-apis/service-tokens) passed through a header as follows. To explore the schema, you can enter this information in the "header" section. + +``` +{"Authorization": "Bearer "} +``` + +Each GQL request also requires a dbt Cloud `environmentId`. The API uses both the service token in the header and environmentId for authentication. + +### Metadata calls + +**Fetch data platform dialect** + +In some cases in your application, it may be useful to know the dialect or data platform that's internally used for the dbt Semantic Layer connection (such as if you are building `where` filters from a user interface rather than user-inputted SQL). + +The GraphQL API has an easy way to fetch this with the following query: + +```graphql +{ + environmentInfo(environmentId: BigInt!) { + dialect + } +} +``` + +**Fetch available metrics** + +```graphql +metrics(environmentId: BigInt!): [Metric!]! +``` + +**Fetch available dimensions for metrics** + +```graphql +dimensions( + environmentId: BigInt! + metrics: [MetricInput!]! +): [Dimension!]! 
+``` + +**Fetch available granularities given metrics** + +Note: This call for `queryableGranularities` returns only queryable granularities for metric time - the primary time dimension across all metrics selected. + +```graphql +queryableGranularities( + environmentId: BigInt! + metrics: [MetricInput!]! +): [TimeGranularity!]! +``` + +You can also get queryable granularities for all other dimensions using the `dimensions` call: + +```graphql +{ + dimensions(environmentId: BigInt!, metrics:[{name:"order_total"}]) { + name + queryableGranularities # --> ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"] + } +} +``` + +You can also optionally access it from the metrics endpoint: + +```graphql +{ + metrics(environmentId: BigInt!) { + name + dimensions { + name + queryableGranularities + } + } +} +``` + +**Fetch measures** + +```graphql +{ + measures(environmentId: BigInt!, metrics: [{name:"order_total"}]) { + name + aggTimeDimension + } +} +``` + +`aggTimeDimension` tells you the name of the dimension that maps to `metric_time` for a given measure. You can also query `measures` from the `metrics` endpoint, which allows you to see what dimensions map to `metric_time` for a given metric: + +```graphql +{ + metrics(environmentId: BigInt!) { + measures { + name + aggTimeDimension + } + } +} +``` + +**Fetch available metrics given a set of dimensions** + +```graphql +metricsForDimensions( + environmentId: BigInt! + dimensions: [GroupByInput!]! +): [Metric!]! +``` + +**Create Dimension Values query** + +```graphql + +mutation createDimensionValuesQuery( + environmentId: BigInt! + metrics: [MetricInput!] + groupBy: [GroupByInput!]! +): CreateDimensionValuesQueryResult! + +``` + +**Create Metric query** + +```graphql +createQuery( + environmentId: BigInt! + metrics: [MetricInput!]! + groupBy: [GroupByInput!] = null + limit: Int = null + where: [WhereInput!] = null + order: [OrderByInput!] = null +): CreateQueryResult +``` + +```graphql +MetricInput { + name: String! +} + +GroupByInput { + name: String! + grain: TimeGranularity = null +} + +WhereInput { + sql: String! +} + +OrderByinput { # -- pass one and only one of metric or groupBy + metric: MetricInput = null + groupBy: GroupByInput = null + descending: Boolean! = false +} +``` + +**Fetch query result** + +```graphql +query( + environmentId: BigInt! + queryId: String! +): QueryResult! +``` + +**Metric Types** + +```graphql +Metric { + name: String! + description: String + type: MetricType! + typeParams: MetricTypeParams! + filter: WhereFilter + dimensions: [Dimension!]! + queryableGranularities: [TimeGranularity!]! +} +``` + +``` +MetricType = [SIMPLE, RATIO, CUMULATIVE, DERIVED] +``` + +**Metric Type parameters** + +```graphql +MetricTypeParams { + measure: MetricInputMeasure + inputMeasures: [MetricInputMeasure!]! + numerator: MetricInput + denominator: MetricInput + expr: String + window: MetricTimeWindow + grainToDate: TimeGranularity + metrics: [MetricInput!] +} +``` + + +**Dimension Types** + +```graphql +Dimension { + name: String! + description: String + type: DimensionType! + typeParams: DimensionTypeParams + isPartition: Boolean! + expr: String + queryableGranularities: [TimeGranularity!]! +} +``` + +``` +DimensionType = [CATEGORICAL, TIME] +``` + +### Create Query examples + +The following section provides query examples for the GraphQL API, such as how to query metrics, dimensions, where filters, and more. + +**Query two metrics grouped by time** + +```graphql +mutation { + createQuery( + environmentId: BigInt! 
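    # "BigInt!" is the argument's GraphQL type, shown here as a placeholder; pass your dbt Cloud environment ID as the value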
    metrics: [{name: "food_order_amount"}]
    groupBy: [{name: "metric_time"}, {name: "customer__customer_type"}]
  ) {
    queryId
  }
}
```

**Query with a time grain**

```graphql
mutation {
  createQuery(
    environmentId: BigInt!
    metrics: [{name: "order_total"}]
    groupBy: [{name: "metric_time", grain: MONTH}]
  ) {
    queryId
  }
}
```

Note that when you apply a time grain in a query, the output name of the time dimension is always the dimension name appended with a double underscore and the granularity level: `{time_dimension_name}__{DAY|WEEK|MONTH|QUARTER|YEAR}`. Even if you don't specify a granularity, the output still has one appended and defaults to the lowest available grain (usually daily for most data sources). To avoid unexpected results in the output data, specify a granularity whenever you use time dimensions.

**Query two metrics with a categorical dimension**

```graphql
mutation {
  createQuery(
    environmentId: BigInt!
    metrics: [{name: "food_order_amount"}, {name: "order_gross_profit"}]
    groupBy: [{name: "metric_time", grain: MONTH}, {name: "customer__customer_type"}]
  ) {
    queryId
  }
}
```

**Query with a where filter**

The `where` filter takes a list argument (or a string for a single input). Depending on the object you are filtering, there are a couple of parameters:

- `Dimension()` — Used for any categorical or time dimensions. If used for a time dimension, granularity is required. For example, `Dimension('metric_time').grain('week')` or `Dimension('customer__country')`.

- `Entity()` — Used for entities like primary and foreign keys, such as `Entity('order_id')`.

Note: If you prefer a more strongly typed `where` clause, you can optionally use `TimeDimension()` to separate out categorical dimensions from time ones. The `TimeDimension` input takes the time dimension name and also requires granularity. For example, `TimeDimension('metric_time', 'MONTH')`.

```graphql
mutation {
  createQuery(
    environmentId: BigInt!
    metrics: [{name: "order_total"}]
    groupBy: [{name: "customer__customer_type"}, {name: "metric_time", grain: MONTH}]
    where: [{sql: "{{ Dimension('customer__customer_type') }} = 'new'"}, {sql: "{{ Dimension('metric_time').grain('month') }} > '2022-10-01'"}]
  ) {
    queryId
  }
}
```

**Query with Order**

```graphql
mutation {
  createQuery(
    environmentId: BigInt!
    metrics: [{name: "order_total"}]
    groupBy: [{name: "metric_time", grain: MONTH}]
    orderBy: [{metric: {name: "order_total"}}, {groupBy: {name: "metric_time", grain: MONTH}, descending: true}]
  ) {
    queryId
  }
}
```

**Query with Limit**

```graphql
mutation {
  createQuery(
    environmentId: BigInt!
    metrics: [{name: "food_order_amount"}, {name: "order_gross_profit"}]
    groupBy: [{name: "metric_time", grain: MONTH}, {name: "customer__customer_type"}]
    limit: 10
  ) {
    queryId
  }
}
```

**Query with Explain**

This takes the same inputs as the `createQuery` mutation.

```graphql
mutation {
  compileSql(
    environmentId: BigInt!
    metrics: [{name: "food_order_amount"}, {name: "order_gross_profit"}]
    groupBy: [{name: "metric_time", grain: MONTH}, {name: "customer__customer_type"}]
  ) {
    sql
  }
}
```

### Output format and pagination

**Output format**

By default, the output is in Arrow format. You can switch to JSON format using the following parameter.
However, due to performance limitations, we recommend using the JSON parameter for testing and validation. The JSON received is a base64-encoded string. To access it, you can decode it using a base64 decoder. The JSON is created from pandas, which means you can change it back to a dataframe using `pandas.read_json(json, orient="table")`. Or you can work with the data directly using `json["data"]`, and find the table schema using `json["schema"]["fields"]`. Alternatively, you can pass `encoded: false` to the `jsonResult` field to get a raw JSON string directly.

```graphql
{
  query(environmentId: BigInt!, queryId: Int!, pageNum: Int! = 1) {
    sql
    status
    error
    totalPages
    arrowResult
    jsonResult(orient: PandasJsonOrient! = TABLE, encoded: Boolean! = true)
  }
}
```

The results default to the table orient, but you can change it to any [pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) supported value.

**Pagination**

By default, we return 1024 rows per page. If your result set exceeds this, you need to increase the page number using the `pageNum` option.

### Run a Python query

The `arrowResult` in the GraphQL query response is a byte dump, which isn't useful on its own. You can convert this byte data into an Arrow table using any Arrow-supported language. Refer to the following Python example, which shows how to query and decode the Arrow result:

```python
import base64

import pyarrow as pa
import requests

headers = {"Authorization": "Bearer <SERVICE_TOKEN>"}
query_result_request = """
{
  query(environmentId: 70, queryId: "12345678") {
    sql
    status
    error
    arrowResult
  }
}
"""

gql_response = requests.post(
    "https://semantic-layer.cloud.getdbt.com/api/graphql",
    json={"query": query_result_request},
    headers=headers,
)

"""
gql_response.json() =>
{
  "data": {
    "query": {
      "sql": "SELECT\n  ordered_at AS metric_time__day\n  , SUM(order_total) AS order_total\nFROM semantic_layer.orders orders_src_1\nGROUP BY\n  ordered_at",
      "status": "SUCCESSFUL",
      "error": null,
      "arrowResult": "arrow-byte-data"
    }
  }
}
"""

def to_arrow_table(byte_string: str) -> pa.Table:
    """Get a raw base64 string and convert to an Arrow Table."""
    with pa.ipc.open_stream(base64.b64decode(byte_string)) as reader:
        return pa.Table.from_batches(reader, reader.schema)


arrow_table = to_arrow_table(gql_response.json()["data"]["query"]["arrowResult"])

# Perform whatever functionality is available, like convert to a pandas table.
print(arrow_table.to_pandas())
"""
order_total ordered_at
          3 2023-08-07
        112 2023-08-08
         12 2023-08-09
       5123 2023-08-10
"""
```
diff --git a/website/docs/docs/dbt-cloud-apis/sl-jdbc.md b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md
new file mode 100644
index 00000000000..931666dd10c
--- /dev/null
+++ b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md
@@ -0,0 +1,367 @@
---
title: "JDBC"
id: sl-jdbc
description: "Integrate and use the JDBC API to query your metrics."
tags: [Semantic Layer, API]
---

import LegacyInfo from '/snippets/_legacy-sl-callout.md';

The dbt Semantic Layer Java Database Connectivity (JDBC) API enables users to query metrics and dimensions using the JDBC protocol, while also providing standard metadata functionality.

A JDBC driver is a software component enabling a Java application to interact with a data platform. Here's some more information about our JDBC API:

- The Semantic Layer JDBC API utilizes the open-source JDBC driver with ArrowFlight SQL protocol.
+- You can download the JDBC driver from [Maven](https://search.maven.org/remotecontent?filepath=org/apache/arrow/flight-sql-jdbc-driver/12.0.0/flight-sql-jdbc-driver-12.0.0.jar). +- The dbt Semantic Layer supports ArrowFlight SQL driver version 12.0.0 and higher. +- You can embed the driver into your application stack as needed, and you can use dbt Labs' [example project](https://github.com/dbt-labs/example-semantic-layer-clients) for reference. +- If you’re a partner or user building a homegrown application, you’ll need to install an AWS root CA to the Java Trust [documentation](https://www.amazontrust.com/repository/) (specific to Java and JDBC call). + +dbt Labs partners can use the JDBC API to build integrations in their tools with the dbt Semantic Layer + +## Using the JDBC API + +If you are a dbt user or partner with access to dbt Cloud and the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), you can [setup](/docs/use-dbt-semantic-layer/setup-sl) and test this API with data from your own instance by configuring the Semantic Layer and obtaining the right JDBC connection parameters described in this document. + +You *may* be able to use our JDBC API with tools that do not have an official integration with the dbt Semantic Layer. If the tool you use allows you to write SQL and either supports a generic JDBC driver option (such as DataGrip) or supports Dremio and uses ArrowFlightSQL driver version 12.0.0 or higher, you can access the Semantic Layer API. + +Refer to [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) for more info. + +## Authentication + +dbt Cloud authorizes requests to the dbt Semantic Layer API. You need to provide an environment ID, host, and [service account tokens](/docs/dbt-cloud-apis/service-tokens). + +## Connection parameters + +The JDBC connection requires a few different connection parameters. + +This is an example of a URL connection string and the individual components: + +``` +jdbc:arrow-flight-sql://semantic-layer.cloud.getdbt.com:443?&environmentId=202339&token=SERVICE_TOKEN +``` + +| JDBC parameter | Description | Example | +| -------------- | ----------- | ------- | +| `jdbc:arrow-flight-sql://` | The protocol for the JDBC driver. | `jdbc:arrow-flight-sql://` | +| `semantic-layer.cloud.getdbt.com` | The [access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your account's dbt Cloud region. You must always add the `semantic-layer` prefix before the access URL. | For dbt Cloud deployment hosted in North America, use `semantic-layer.cloud.getdbt.com` | +| `environmentId` | The unique identifier for the dbt production environment, you can retrieve this from the dbt Cloud URL
    when you navigate to **Environments** under **Deploy**. | If your URL ends with `.../environments/222222`, your `environmentId` is `222222`

    | +| `SERVICE_TOKEN` | dbt Cloud [service token](/docs/dbt-cloud-apis/service-tokens) with “Semantic Layer Only” and "Metadata Only" permissions. Create a new service token on the **Account Settings** page. | `token=SERVICE_TOKEN` | + +*Note — If you're testing locally on a tool like DataGrip, you may also have to provide the following variable at the end or beginning of the JDBC URL `&disableCertificateVerification=true`. + +## Querying the API for metric metadata + +The Semantic Layer JDBC API has built-in metadata calls which can provide a user with information about their metrics and dimensions. + +Refer to the following tabs for metadata commands and examples: + + + + + +Use this query to fetch all defined metrics in your dbt project: + +```bash +select * from {{ + semantic_layer.metrics() +}} +``` + + + + +Use this query to fetch all dimensions for a metric. + +Note, `metrics` is a required argument that lists one or multiple metrics in it. + +```bash +select * from {{ + semantic_layer.dimensions(metrics=['food_order_amount'])}} +``` + + + + + +Use this query to fetch dimension values for one or multiple metrics and single dimension. + +Note, `metrics` is a required argument that lists one or multiple metrics in it, and a single dimension. + +```bash +select * from {{ +semantic_layer.dimension_values(metrics=['food_order_amount'], group_by=['customer__customer_name'])}} +``` + + + + + +Use this query to fetch queryable granularities for a list of metrics. This API request allows you to only show the time granularities that make sense for the primary time dimension of the metrics (such as `metric_time`), but if you want queryable granularities for other time dimensions, you can use the `dimensions()` call, and find the column queryable_granularities. + +Note, `metrics` is a required argument that lists one or multiple metrics in it. + +```bash +select * from {{ + semantic_layer.queryable_granularities(metrics=['food_order_amount', 'order_gross_profit'])}} +``` + + + + + + + + + +Use this query to fetch available metrics given dimensions. This command is essentially the opposite of getting dimensions given a list of metrics. + +Note, `group_by` is a required argument that lists one or multiple dimensions in it. + +```bash +select * from {{ + semantic_layer.metrics_for_dimensions(group_by=['customer__customer_type']) + +}} +``` + + + + + +Use this example query to fetch available granularities for all time dimesensions (the similar queryable granularities API call only returns granularities for the primary time dimensions for metrics). The following call is a derivative of the `dimensions()` call and specifically selects the granularities field. + +```bash +select NAME, QUERYABLE_GRANULARITIES from {{ + semantic_layer.dimensions( + metrics=["order_total"] + ) +}} + +``` + + + + + +It may be useful in your application to expose the names of the time dimensions that represent `metric_time` or the common thread across all metrics. + +You can first query the `metrics()` argument to fetch a list of measures, then use the `measures()` call which will return the name(s) of the time dimensions that make up metric time. 
+ +```bash +select * from {{ + semantic_layer.measures(metrics=['orders']) +}} +``` + + + + +## Querying the API for metric values + +To query metric values, the following parameters are available: + +| Parameter | Description | Example | Required? | +| --------- | -----------| ------------ | -------------------- | +| `metrics` | The metric name as defined in your dbt metric configuration | `metrics=['revenue']` | Required | +| `group_by` | Dimension names or entities to group by. We require a reference to the entity of the dimension (other than for the primary time dimension), which is prepended to the dimension name with a double underscore. | `group_by=['user__country', 'metric_time']` | Optional | +| `grain` | A parameter specific to any time dimension that changes the grain of the data from the metric's default. | `group_by=[Dimension('metric_time')`
`grain('week\|day\|month\|quarter\|year')]` | Optional | +| `where` | A where clause that allows you to filter on dimensions and entities using parameters. This takes a filter list or a string. Inputs use the `Dimension` and `Entity` objects. Granularity is required if the `Dimension` is a time dimension. | `where="{{ Dimension('customer__country') }} = 'US'"` | Optional | +| `limit` | Limit the data returned | `limit=10` | Optional | +| `order_by` | Order the data returned by a particular field | `order_by=['order_gross_profit']`, use `-` for descending, or full object notation if the object is operated on: `order_by=[Metric('order_gross_profit').descending(True)]` | Optional | +| `compile` | If true, returns generated SQL for the data platform but does not execute | `compile=True` | Optional | + + + +## Note on time dimensions and `metric_time` + +You will notice that in the list of dimensions for all metrics, there is a dimension called `metric_time`. `metric_time` is a reserved keyword for the measure-specific aggregation time dimensions. For any time-series metric, the `metric_time` keyword should always be available for use in queries. This is a common dimension across *all* metrics in a semantic graph. + +You can look at a single metric or hundreds of metrics, and if you group by `metric_time`, it will always give you the correct time series. + +Additionally, when performing granularity calculations that are global (not specific to a particular time dimension), we recommend you always operate on `metric_time` and you will get the correct answer. + +Note that `metric_time` should be available in addition to any other time dimensions that are available for the metric(s). In the case where you are looking at one metric (or multiple metrics from the same data source), the values in the series for the primary time dimension and `metric_time` are equivalent. + + +## Examples + +Refer to the following examples to help you get started with the JDBC API. + +### Fetch metadata for metrics + +You can filter and add any SQL outside of the templating syntax. For example, you can use the following query to fetch the name and dimensions for a metric: + +```bash +select name, dimensions from {{ + semantic_layer.metrics() + }} + WHERE name='food_order_amount' +``` + +### Query common dimensions + +You can select common dimensions for multiple metrics. 
Use the following query to fetch the name and dimensions for multiple metrics: + +```bash +select * from {{ + semantic_layer.dimensions(metrics=['food_order_amount', 'order_gross_profit']) + }} +``` + +### Query grouped by time + +The following example query uses the [shorthand method](#faqs) to fetch `food_order_amount` and `order_gross_profit` grouped by time: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount','order_gross_profit'], + group_by=['metric_time']) + }} +``` + +### Query with a time grain + +Use the following example query to fetch multiple metrics with a change in time dimension granularities: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month')]) + }} +``` + +### Group by categorical dimension + +Use the following query to group by a categorical dimension: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month'), 'customer__customer_type']) + }} +``` + +### Query with where filters + +Where filters in the API accept either a filter list or a string. We recommend using the filter list for production applications, as this format realizes all available optimizations where possible. + +Where filters have a few objects that you can use: + +- `Dimension()` - Used for any categorical or time dimension. If used for a time dimension, granularity is required. Examples: `Dimension('metric_time').grain('week')` or `Dimension('customer__country')` + +- `Entity()` - Used for entities like primary and foreign keys. Example: `Entity('order_id')` + +Note: If you prefer a more explicit path to create the `where` clause, you can optionally use the `TimeDimension` feature. This helps separate out categorical dimensions from time-related ones. The `TimeDimension` input takes the time dimension name and also requires granularity, like this: `TimeDimension('metric_time', 'MONTH')`. 
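+As a minimal sketch (reusing the example metrics from this page), the `TimeDimension` input could be used inside a `where` filter like this:
+
+```bash
+select * from {{
+semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'],
+group_by=[Dimension('metric_time').grain('month')],
+where="{{ TimeDimension('metric_time', 'MONTH') }} >= '2017-03-09'")
+}}
+```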
+ + +- Use the following example to query using a `where` filter with the string format: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], +group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], +where="{{ Dimension('metric_time').grain('month') }} >= '2017-03-09' AND {{ Dimension('customer__customer_type') }} in ('new') AND {{ Entity('order_id') }} = 10") +}} +``` + +- (Recommended for better performance) Use the following example to query using a `where` filter with a filter list format: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], +group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], +where=["{{ Dimension('metric_time').grain('month') }} >= '2017-03-09'", "{{ Dimension('customer__customer_type') }} in ('new')", "{{ Entity('order_id') }} = 10"]) +}} +``` + +### Query with a limit + +Use the following example to query using `limit` or `order_by` clauses: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + limit=10) + }} +``` + +### Query with order by examples + +Order by can take a basic string that's a Dimension, Metric, or Entity, and this will default to ascending order. + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + limit=10, + order_by=['order_gross_profit']) + }} +``` + +For descending order, you can add a `-` sign in front of the object. However, you can only use this shorthand notation if you aren't operating on the object or using the full object notation. + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + limit=10, + order_by=[-'order_gross_profit']) + }} +``` + +If you are ordering by an object that's been operated on (for example, a changed granularity), or you are using the full object notation, descending order must look like: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('week')], + limit=10, + order_by=[Metric('order_gross_profit').descending(True), Dimension('metric_time').grain('week').descending(True)]) + }} +``` + +Similarly, this will yield ascending order: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('week')], + limit=10, + order_by=[Metric('order_gross_profit'), Dimension('metric_time').grain('week')]) + }} +``` + + +### Query with compile keyword + +Use the following example to query using the `compile` keyword: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], + compile=True) + }} +``` + +## FAQs + +- **Why do some dimensions use different syntax, like `metric_time` versus `Dimension('metric_time')`?**
+ When you select a dimension on its own, such as `metric_time`, you can use the shorthand method, which doesn't need the “Dimension” syntax. However, when you perform operations on the dimension, such as adding granularity, the object syntax `Dimension('metric_time')` is required. + +- **What does the double underscore `"__"` syntax in dimensions mean?**
    + The double underscore `"__"` syntax indicates a mapping from an entity to a dimension, as well as where the dimension is located. For example, `user__country` means someone is looking at the `country` dimension from the `user` table. + +- **What is the default output when adding granularity?**
    + The default output follows the format `{time_dimension_name}__{granularity_level}`. So for example, if the time dimension name is `ds` and the granularity level is yearly, the output is `ds__year`. + +## Related docs + +- [dbt Semantic Layer integration best practices](/guides/sl-partner-integration-guide) + diff --git a/website/docs/docs/dbt-cloud-apis/sl-manifest.md b/website/docs/docs/dbt-cloud-apis/sl-manifest.md new file mode 100644 index 00000000000..6ecac495869 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-manifest.md @@ -0,0 +1,100 @@ +--- +title: "Semantic manifest" +id: sl-manifest +description: "Learn about the semantic manifest.json file and how you can use artifacts to gain insights about your dbt Semantic Layer." +tags: [Semantic Layer, APIs] +sidebar_label: "Semantic manifest" +pagination_next: null +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +dbt creates an [artifact](/reference/artifacts/dbt-artifacts) file called the _Semantic Manifest_ (`semantic_manifest.json`), which MetricFlow requires to build and run metric queries properly for the dbt Semantic Layer. This artifact contains comprehensive information about your dbt Semantic Layer. It is an internal file that acts as the integration point with MetricFlow. + +By using the semantic manifest produced by dbt Core, MetricFlow will instantiate a data flow plan and generate SQL from Semantic Layer query requests. It's a valuable reference that you can use to understand the structure and details of your data models. + +Similar to the [`manifest.json` file](/reference/artifacts/manifest-json), the `semantic_manifest.json` also lives in the `/target` directory of your dbt project. This is where dbt stores various artifacts (such as compiled models and tests) generated during the execution of your project. + +## How it's produced + +The `semantic_manifest.json` is produced whenever your dbt project is parsed. The easiest way to generate the file yourself is to run `dbt parse`. Since `dbt run`, `dbt build`, and `dbt compile` all parse your dbt project, these commands will generate a semantic manifest as well. + + +## Top level keys + +Top-level keys for the semantic manifest are: +- `semantic_models` — Starting points of data with entities, dimensions, and measures, and correspond to models in your dbt project. +- `metrics` — Functions combining measures, constraints, and so on to define quantitative indicators. +- `project_configuration` — Contains information around your project configurations + +
    +Example target/semantic_manifest.json file + +```json +{ + "semantic_models": [ + { + "name": "semantic model name", + "defaults": null, + "description": "semantic model description", + "node_relation": { + "alias": "model alias", + "schema_name": "model schema", + "database": "model db", + "relation_name": "Fully qualified relation name" + }, + "entities": ["entities in the semantic model"], + "measures": ["measures in the semantic model"], + "dimensions": ["dimensions in the semantic model" ], + "metrics": [ + { + "name": "name of the metric", + "description": "metric description", + "type": "metric type", + "type_params": { + "measure": { + "name": "name for measure", + "filter": "filter for measure", + "alias": "alias for measure" + }, + "numerator": null, + "denominator": null, + "expr": null, + "window": null, + "grain_to_date": null, + "metrics": ["metrics used in defining the metric. this is used in derived metrics"], + "input_measures": [] + }, + "filter": null, + "metadata": null + } + ], + "project_configuration": { + "time_spine_table_configurations": [ + { + "location": "fully qualified table name for timespine", + "column_name": "date column", + "grain": "day" + } + ], + "metadata": null, + "dsi_package_version": {} + } +} + ] +} +``` + +
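+As a quick illustration (the `jq` filter is only one way to inspect the file and is not part of dbt; `dbt parse` and the file path are described above), you can regenerate the artifact and list the semantic models it contains:
+
+```bash
+# re-parse the project to produce target/semantic_manifest.json
+dbt parse
+
+# list the names of the semantic models defined in the artifact
+jq '.semantic_models[].name' target/semantic_manifest.json
+```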
    + +## Related docs + +- [dbt Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) +- [About dbt artifacts](/reference/artifacts/dbt-artifacts) + diff --git a/website/docs/docs/dbt-cloud-apis/user-tokens.md b/website/docs/docs/dbt-cloud-apis/user-tokens.md index e56d8b2f974..77e536b12a5 100644 --- a/website/docs/docs/dbt-cloud-apis/user-tokens.md +++ b/website/docs/docs/dbt-cloud-apis/user-tokens.md @@ -1,6 +1,7 @@ --- title: "User tokens" id: "user-tokens" +pagination_next: "docs/dbt-cloud-apis/service-tokens" --- ## User API tokens @@ -13,7 +14,7 @@ permissions of the user the that they were created for. You can find your User API token in the Profile page under the `API Access` label. - + ## FAQs diff --git a/website/docs/docs/dbt-cloud-environments.md b/website/docs/docs/dbt-cloud-environments.md index f61ec5ef72b..522a354be97 100644 --- a/website/docs/docs/dbt-cloud-environments.md +++ b/website/docs/docs/dbt-cloud-environments.md @@ -2,9 +2,10 @@ title: "dbt Cloud environments" id: "dbt-cloud-environments" description: "Learn about dbt Cloud's development environment to execute your project in the IDE" +pagination_next: null --- -An environment determines how dbt Cloud will execute your project in both the dbt Cloud IDE (for development) and scheduled jobs (for deployment). +An environment determines how dbt Cloud will execute your project in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) (for development) and scheduled jobs (for deployment). Critically, in order to execute dbt, environments define three variables: @@ -34,7 +35,7 @@ To create a new dbt Cloud development environment: ### Set developer credentials -To use the IDE, each developer will need to set up [personal development credentials](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#access-the-cloud-ide) to your warehouse connection in their **Profile Settings**. This allows you to set separate target information and maintain individual credentials to connect to your warehouse via the dbt Cloud IDE. +To use the dbt Cloud IDE or dbt Cloud CLI, each developer will need to set up [personal development credentials](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#access-the-cloud-ide) to your warehouse connection in their **Profile Settings**. This allows you to set separate target information and maintain individual credentials to connect to your warehouse. @@ -44,4 +45,4 @@ To use the IDE, each developer will need to set up [personal development credent Deployment environments in dbt Cloud are necessary to execute scheduled jobs and use other features. A dbt Cloud project can have multiple deployment environments, allowing for flexibility and customization. However, a dbt Cloud project can only have one deployment environment that represents the production source of truth. -To learn more about dbt Cloud deployment environments and how to configure them, visit the [Deployment environments](/docs/deploy/deploy-environments) page. For our best practices guide, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) for more info. +To learn more about dbt Cloud deployment environments and how to configure them, refer to the [Deployment environments](/docs/deploy/deploy-environments) page. For our best practices guide, read [dbt Cloud environment best practices](/guides/set-up-ci) for more info. 
diff --git a/website/docs/docs/dbt-support.md b/website/docs/docs/dbt-support.md index a6e9262200c..40968b9d763 100644 --- a/website/docs/docs/dbt-support.md +++ b/website/docs/docs/dbt-support.md @@ -1,36 +1,66 @@ --- title: "dbt support" id: "dbt-support" +pagination_next: null +pagination_prev: null --- +Support for dbt is available to all users through the following channels: + +- Dedicated dbt Support team (dbt Cloud users). +- [The Community Forum](https://discourse.getdbt.com/). +- [dbt Community slack](https://www.getdbt.com/community/join-the-community/). + ## dbt Core support -If you're developing in the command line (CLI) and have questions or need some help — reach out to the helpful dbt community through [the Community Forum](https://discourse.getdbt.com/) or [dbt Community slack](https://www.getdbt.com/community/join-the-community/). +If you're developing on the command line (CLI) and have questions or need some help — reach out to the helpful dbt community through [the Community Forum](https://discourse.getdbt.com/) or [dbt Community slack](https://www.getdbt.com/community/join-the-community/). ## dbt Cloud support -We want to help you work through implementing and utilizing dbt Cloud at your organization. Have a question you can't find an answer to in [our docs](https://docs.getdbt.com/) or [the Community Forum](https://discourse.getdbt.com/)? Our Support team is here to `dbt help` you! -Check out our guide on [getting help](/community/resources/getting-help) - half of the problem is often knowing where to look... and how to ask good questions! +The global dbt Support team is available to dbt Cloud customers by email or in-product live chat. We want to help you work through implementing and utilizing dbt Cloud at your organization. Have a question you can't find an answer to in [our docs](https://docs.getdbt.com/) or [the Community Forum](https://discourse.getdbt.com/)? Our Support team is here to `dbt help` you! + +- **Enterprise plans** — Priority [support](#severity-level-for-enterprise-support), options for custom support coverage hours, implementation assistance, dedicated management, and dbt Labs security reviews depending on price point. +- **Developer and Team plans** — 24x5 support (no service level agreement (SLA); [contact Sales](https://www.getdbt.com/pricing/) for Enterprise plan inquiries). +- **Support team help** — Assistance with dbt Cloud questions, like project setup, login issues, error understanding, setup private packages, link to a new GitHub account, and so on. +- **Resource guide** — Check the [guide](/community/resources/getting-help) for effective help-seeking strategies. + +
    +Example of common support questions +Types of dbt Cloud-related questions our Support team can assist you with, regardless of your dbt Cloud plan:

    +How do I...
    + - set up a dbt Cloud project?
    + - set up a private package in dbt Cloud?
    + - configure custom branches on git repos?
    + - link dbt to a new GitHub account?

    +Help! I can't...
    + - log in.
    + - access logs.
    + - update user groups.

    +I need help understanding...
    + - why this run failed.
+ - why I am getting this error message in dbt Cloud.
    + - why my CI jobs are not kicking off as expected.
    +
    -Types of dbt Cloud-related questions our Support team can assist you with, regardless of your dbt Cloud plan: -- **How do I...** - - set up a dbt Cloud project? - - set up a private package in dbt Cloud? - - configure custom branches on git repos? - - link dbt to a new github account? -- **Help! I can't...** - - log in. - - access logs. - - update user groups. -- **I need help understanding...** - - why this run failed. - - why I am getting this error message in dbt Cloud. - - why my CI jobs are not kicking off as expected. + +## dbt Cloud Enterprise accounts -### dbt Cloud Enterprise accounts +Basic assistance with dbt project troubleshooting. +Help with errors and issues in macros, models, and dbt Labs' packages. +For strategic advice, expansion, and project setup, consult Solutions Architect and Sales Director. -For customers on a dbt Cloud Enterprise plan, we **also** offer basic assistance in troubleshooting issues with your dbt project. +For customers on a dbt Cloud Enterprise plan, we **also** offer basic assistance in troubleshooting issues with your dbt project: - **Something isn't working the way I would expect it to...** - in a macro I created... - in an incremental model I'm building... @@ -48,5 +78,20 @@ Types of questions you should ask your Solutions Architect and Sales Director: - Here is our data road map for the next year - can we talk through how dbt fits into it and what features we may not be utilizing that can help us achieve our goals? - It is time for our contract renewal, what options do I have? +### Severity level for Enterprise support + +Support tickets are assigned a severity level based on the impact of the issue on your business. The severity level is assigned by dbt Labs, and the level assigned determines the priority level of support you will receive. For specific ticket response time or other questions that relate to your Enterprise account’s SLA, please refer to your Enterprise contract. + +| Severity Level | Description | +| -------------- | ----------- | +| Severity Level 1 | Any Error which makes the use or continued use of the Subscription or material features impossible; Subscription is not operational, with no alternative available. | +| Severity Level 2 | Feature failure, without a workaround, but Subscription is operational. | +| Severity Level 3 | Feature failure, but a workaround exists. | +| Severity Level 4 | Error with low-to-no impact on Client’s access to or use of the Subscription, or Client has a general question or feature enhancement request. | + +## External help -When you need help writing SQL, reviewing the overall performance of your project, or want someone to actually help build your dbt project, check out our list of [dbt Preferred Consulting Providers](https://www.getdbt.com/ecosystem/) or our [Services](https://www.getdbt.com/dbt-labs/services/) page! +For SQL writing, project performance review, or project building, refer to dbt Preferred Consulting Providers and dbt Labs' Services. +For help writing SQL, reviewing the overall performance of your project, or want someone to actually help build your dbt project, refer to the following pages: +- List of [dbt Preferred Consulting Providers](https://www.getdbt.com/ecosystem/). +- dbt Labs' [Services](https://www.getdbt.com/dbt-labs/services/). 
diff --git a/website/docs/docs/dbt-versions/core-upgrade/00-upgrading-to-v1.7.md b/website/docs/docs/dbt-versions/core-upgrade/00-upgrading-to-v1.7.md new file mode 100644 index 00000000000..9ebd3c64cf3 --- /dev/null +++ b/website/docs/docs/dbt-versions/core-upgrade/00-upgrading-to-v1.7.md @@ -0,0 +1,70 @@ +--- +title: "Upgrading to v1.7 (latest)" +id: upgrading-to-v1.7 +description: New features and changes in dbt Core v1.7 +displayed_sidebar: "docs" +--- + +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + +## Resources + +- [Changelog](https://github.com/dbt-labs/dbt-core/blob/8aaed0e29f9560bc53d9d3e88325a9597318e375/CHANGELOG.md) +- [CLI Installation guide](/docs/core/installation) +- [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) +- [Release schedule](https://github.com/dbt-labs/dbt-core/issues/8260) + +## What to know before upgrading + +dbt Labs is committed to providing backward compatibility for all versions 1.x, with the exception of any changes explicitly mentioned below. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). + +### Behavior changes + +dbt Core v1.7 expands the amount of sources you can configure freshness for. Previously, freshness was limited to sources with a `loaded_at_field`; now, freshness can be generated from warehouse metadata tables when available. + +As part of this change, the `loaded_at_field` is no longer required to generate source freshness. If a source has a `freshness:` block, dbt will attempt to calculate freshness for that source: +- If a `loaded_at_field` is provided, dbt will calculate freshness via a select query (previous behavior). +- If a `loaded_at_field` is _not_ provided, dbt will calculate freshness via warehouse metadata tables when possible (new behavior). + +This is a relatively small behavior change, but worth calling out in case you notice that dbt is calculating freshness for _more_ sources than before. To exclude a source from freshness calculations, you have two options: +- Don't add a `freshness:` block. +- Explicitly set `freshness: null` + +## New and changed features and functionality + +- [`dbt docs generate`](/reference/commands/cmd-docs) now supports `--select` to generate [catalog metadata](/reference/artifacts/catalog-json) for a subset of your project. Currently available for Snowflake and Postgres only, but other adapters are coming soon. +- [Source freshness](/docs/deploy/source-freshness) can now be generated from warehouse metadata tables, currently Snowflake only, but other adapters that have metadata tables are coming soon. + +### MetricFlow enhancements + +- Automatically create metrics on measures with [`create_metric: true`](/docs/build/semantic-models). +- Optional [`label`](/docs/build/semantic-models) in semantic_models, measures, dimensions and entities. +- New configurations for semantic models - [enable/disable](/reference/resource-configs/enabled), [group](/reference/resource-configs/group), and [meta](/reference/resource-configs/meta). +- Support `fill_nulls_with` and `join_to_timespine` for metric nodes. +- `saved_queries` extends governance beyond the semantic objects to their consumption. + +### For consumers of dbt artifacts (metadata) + +- The [manifest](/reference/artifacts/manifest-json) schema version has been updated to v11. +- The [run_results](/reference/artifacts/run-results-json) schema version has been updated to v5. 
+- There are a few specific changes to the [catalog.json](/reference/artifacts/catalog-json): + - Added [node attributes](/reference/artifacts/run-results-json) related to compilation (`compiled`, `compiled_code`, `relation_name`) to the `catalog.json`. + - The nodes dictionary in the `catalog.json` can now be "partial" if `dbt docs generate` is run with a selector. + +### Model governance + +dbt Core v1.5 introduced model governance which we're continuing to refine. v1.7 includes these additional features and functionality: + +- **[Breaking change detection](/reference/resource-properties/versions#detecting-breaking-changes) for models with contracts enforced:** When dbt detects a breaking change to a model with an enforced contract during state comparison, it will now raise an error for versioned models and a warning for models that are not versioned. +- **[Set `access` as a config](/reference/resource-configs/access):** You can now set a model's `access` within config blocks in the model's file or in the `dbt_project.yml` for an entire subfolder at once. +- **[Type aliasing for model contracts](/reference/resource-configs/contract):** dbt will use each adapter's built-in type aliasing for user-provided data types—meaning you can now write `string` always, and dbt will translate to `text` on Postgres/Redshift. This is "on" by default, but you can opt-out. +- **[Raise warning for numeric types](/reference/resource-configs/contract):** Because of issues when putting `numeric` in model contracts without considering that default values such as `numeric(38,0)` might round decimals accordingly. dbt will now warn you if it finds a numeric type without specified precision/scale. + +### Quick hits + +With these quick hits, you can now: +- Configure a [`delimiter`](/reference/resource-configs/delimiter) for a seed file. +- Use packages with the same git repo and unique subdirectory. +- Access the `date_spine` macro directly from dbt-core (moved over from dbt-utils). diff --git a/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md b/website/docs/docs/dbt-versions/core-upgrade/01-upgrading-to-v1.6.md similarity index 68% rename from website/docs/guides/migration/versions/01-upgrading-to-v1.6.md rename to website/docs/docs/dbt-versions/core-upgrade/01-upgrading-to-v1.6.md index ac3d7348ef9..d36cc544814 100644 --- a/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md +++ b/website/docs/docs/dbt-versions/core-upgrade/01-upgrading-to-v1.6.md @@ -1,8 +1,19 @@ --- -title: "Upgrading to v1.6 (latest)" +title: "Upgrading to v1.6" description: New features and changes in dbt Core v1.6 +id: "upgrading-to-v1.6" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + +dbt Core v1.6 has three significant areas of focus: +1. Next milestone of [multi-project deployments](https://github.com/dbt-labs/dbt-core/discussions/6725): improvements to contracts, groups/access, versions; and building blocks for cross-project `ref` +1. Semantic layer re-launch: dbt Core and [MetricFlow](https://docs.getdbt.com/docs/build/about-metricflow) integration +1. 
Mechanisms to support mature deployment at scale (`dbt clone` and `dbt retry`) + ## Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.6.latest/CHANGELOG.md) @@ -16,13 +27,22 @@ dbt Labs is committed to providing backward compatibility for all versions 1.x, ### Behavior changes +:::info Action required if your project defines `metrics` + +The [spec for metrics](https://github.com/dbt-labs/dbt-core/discussions/7456) has changed and now uses [MetricFlow](/docs/build/about-metricflow). + +::: + +If your dbt project defines metrics, you must migrate to dbt v1.6 because the YAML spec has moved from dbt_metrics to MetricFlow. Any tests you have won't compile on v1.5 or older. + - dbt Core v1.6 does not support Python 3.7, which reached End Of Life on June 23. Support Python versions are 3.8, 3.9, 3.10, and 3.11. -- As part of the Semantic layer re-launch (in beta), the spec for `metrics` has changed significantly. Migration guide coming soon: https://github.com/dbt-labs/docs.getdbt.com/pull/3705 +- As part of the [dbt Semantic layer](/docs/use-dbt-semantic-layer/dbt-sl) re-launch (in beta), the spec for `metrics` has changed significantly. Refer to the [migration guide](/guides/sl-migration) for more info on how to migrate to the re-launched dbt Semantic Layer. - The manifest schema version is now v10. +- dbt Labs is ending support for Homebrew installation of dbt-core and adapters. See [the discussion](https://github.com/dbt-labs/dbt-core/discussions/8277) for more details. ### For consumers of dbt artifacts (metadata) -The [manifest](/reference/artifacts/manifest-json) schema version has updated to `v10`. Specific changes: +The [manifest](/reference/artifacts/manifest-json) schema version has been updated to `v10`. Specific changes: - Addition of `semantic_models` and changes to `metrics` attributes - Addition of `deprecation_date` as a model property - Addition of `on_configuration_change` as default node configuration (to support materialized views) @@ -35,14 +55,19 @@ For more detailed information and to ask questions, please read and comment on t ## New and changed documentation +### MetricFlow + +- [**Build your metrics**](/docs/build/build-metrics-intro) with MetricFlow, a key component of the dbt Semantic Layer. You can define your metrics and build semantic models with MetricFlow, available on the command line (CLI) for dbt Core v1.6 beta or higher. + ### Materialized views Supported on: - [Postgres](/reference/resource-configs/postgres-configs#materialized-view) - [Redshift](/reference/resource-configs/redshift-configs#materialized-view) -- Snowflake (docs forthcoming) +- [Snowflake](/reference/resource-configs/snowflake-configs#dynamic-tables) +- [Databricks](/reference/resource-configs/databricks-configs#materialized-views-and-streaming-tables) -Support for BigQuery and Databricks forthcoming. +Support for BigQuery coming soon. ### New commands for mature deployment @@ -71,3 +96,5 @@ More consistency and flexibility around packages. 
Resources defined in a package - [`dbt debug --connection`](/reference/commands/debug) to test just the data platform connection specified in a profile - [`dbt docs generate --empty-catalog`](/reference/commands/cmd-docs) to skip catalog population while generating docs - [`--defer-state`](/reference/node-selection/defer) enables more-granular control +- [`dbt ls`](/reference/commands/list) adds the Semantic model selection method to allow for `dbt ls -s "semantic_model:*"` and the ability to execute `dbt ls --resource-type semantic_model`. + diff --git a/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md b/website/docs/docs/dbt-versions/core-upgrade/02-upgrading-to-v1.5.md similarity index 78% rename from website/docs/guides/migration/versions/02-upgrading-to-v1.5.md rename to website/docs/docs/dbt-versions/core-upgrade/02-upgrading-to-v1.5.md index 5283070217c..dded8a690fe 100644 --- a/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md +++ b/website/docs/docs/dbt-versions/core-upgrade/02-upgrading-to-v1.5.md @@ -1,8 +1,14 @@ --- title: "Upgrading to v1.5" description: New features and changes in dbt Core v1.5 +id: "upgrading-to-v1.5" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + dbt Core v1.5 is a feature release, with two significant additions: 1. [**Model governance**](/docs/collaborate/govern/about-model-governance) — access, contracts, versions — the first phase of [multi-project deployments](https://github.com/dbt-labs/dbt-core/discussions/6725) 2. A Python entry point for [**programmatic invocations**](/reference/programmatic-invocations), at parity with the CLI @@ -56,7 +62,50 @@ models: tests: [] # todo! add tests later config: ... ``` -Some options that could previously be specified before a sub-command can now only be specified afterward. For example, `dbt --profiles-dir . run` isn't valid anymore, and instead, you need to use `dbt run --profiles-dir .` + +Some options that could previously be specified _after_ a subcommand can now only be specified _before_. This includes the inverse of the option, `--write-json` and `--no-write-json`, for example. The list of affected options are: + +
    +List of affected options + +```bash +--cache-selected-only | --no-cache-selected-only +--debug, -d | --no-debug +--deprecated-print | --deprecated-no-print +--enable-legacy-logger | --no-enable-legacy-logger +--fail-fast, -x | --no-fail-fast +--log-cache-events | --no-log-cache-events +--log-format +--log-format-file +--log-level +--log-level-file +--log-path +--macro-debugging | --no-macro-debugging +--partial-parse | --no-partial-parse +--partial-parse-file-path +--populate-cache | --no-populate-cache +--print | --no-print +--printer-width +--quiet, -q | --no-quiet +--record-timing-info, -r +--send-anonymous-usage-stats | --no-send-anonymous-usage-stats +--single-threaded | --no-single-threaded +--static-parser | --no-static-parser +--use-colors | --no-use-colors +--use-colors-file | --no-use-colors-file +--use-experimental-parser | --no-use-experimental-parser +--version, -V, -v +--version-check | --no-version-check +--warn-error +--warn-error-options +--write-json | --no-write-json + +``` + +
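+A brief sketch of the placement rules described above and in the note that follows, using `--no-write-json` (from the list above) and `--profiles-dir` as examples:
+
+```bash
+# global options from the list above go before the subcommand
+dbt --no-write-json run
+
+# most other options, such as --profiles-dir, now go after the subcommand
+dbt run --profiles-dir .
+```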
    + + +Additionally, some options that could be previously specified _before_ a subcommand can now only be specified _after_. Any option _not_ in the above list must appear _after_ the subcommand from v1.5 and later. For example, `--profiles-dir`. The built-in [collect_freshness](https://github.com/dbt-labs/dbt-core/blob/1.5.latest/core/dbt/include/global_project/macros/adapters/freshness.sql) macro now returns the entire `response` object, instead of just the `table` result. If you're using a custom override for `collect_freshness`, make sure you're also returning the `response` object; otherwise, some of your dbt commands will never finish. For example: @@ -105,4 +154,4 @@ Run `dbt --help` to see new & improved help documentation :) - The [`version: 2` top-level key](/reference/project-configs/version) is now **optional** in all YAML files. Also, the [`config-version: 2`](/reference/project-configs/config-version) and `version:` top-level keys are now optional in `dbt_project.yml` files. - [Events and logging](/reference/events-logging): Added `node_relation` (`database`, `schema`, `identifier`) to the `node_info` dictionary, available on node-specific events - Support setting `--project-dir` via environment variable: [`DBT_PROJECT_DIR`](/reference/dbt_project.yml) -- More granular [configurations](/reference/global-configs/about-global-configs) for logging (to set log format, log levels, and colorization) and cache population +- More granular configurations for logging (to set [log format](/reference/global-configs/logs#log-formatting), [log levels](/reference/global-configs/logs#log-level), and [colorization](/reference/global-configs/logs#color)) and [cache population](/reference/global-configs/cache#cache-population) diff --git a/website/docs/guides/migration/versions/03-upgrading-to-dbt-utils-v1.0.md b/website/docs/docs/dbt-versions/core-upgrade/03-upgrading-to-dbt-utils-v1.0.md similarity index 99% rename from website/docs/guides/migration/versions/03-upgrading-to-dbt-utils-v1.0.md rename to website/docs/docs/dbt-versions/core-upgrade/03-upgrading-to-dbt-utils-v1.0.md index 72c6fc3c968..a7b302c9a58 100644 --- a/website/docs/guides/migration/versions/03-upgrading-to-dbt-utils-v1.0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/03-upgrading-to-dbt-utils-v1.0.md @@ -3,6 +3,10 @@ title: "Upgrading to dbt utils v1.0" description: New features and breaking changes to consider as you upgrade to dbt utils v1.0. --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + # Upgrading to dbt utils v1.0 For the first time, [dbt utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) is crossing the major version boundary. 
From [last month’s blog post](https://www.getdbt.com/blog/announcing-dbt-v1.3-and-utils/): diff --git a/website/docs/guides/migration/versions/04-upgrading-to-v1.4.md b/website/docs/docs/dbt-versions/core-upgrade/04-upgrading-to-v1.4.md similarity index 97% rename from website/docs/guides/migration/versions/04-upgrading-to-v1.4.md rename to website/docs/docs/dbt-versions/core-upgrade/04-upgrading-to-v1.4.md index 3537eb1677a..6c6d96b2326 100644 --- a/website/docs/guides/migration/versions/04-upgrading-to-v1.4.md +++ b/website/docs/docs/dbt-versions/core-upgrade/04-upgrading-to-v1.4.md @@ -1,7 +1,14 @@ --- title: "Upgrading to v1.4" description: New features and changes in dbt Core v1.4 +id: "upgrading-to-v1.4" +displayed_sidebar: "docs" --- + +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.4.latest/CHANGELOG.md) diff --git a/website/docs/guides/migration/versions/05-upgrading-to-v1.3.md b/website/docs/docs/dbt-versions/core-upgrade/05-upgrading-to-v1.3.md similarity index 97% rename from website/docs/guides/migration/versions/05-upgrading-to-v1.3.md rename to website/docs/docs/dbt-versions/core-upgrade/05-upgrading-to-v1.3.md index 5fdf559a267..f66d9bb9706 100644 --- a/website/docs/guides/migration/versions/05-upgrading-to-v1.3.md +++ b/website/docs/docs/dbt-versions/core-upgrade/05-upgrading-to-v1.3.md @@ -1,7 +1,14 @@ --- title: "Upgrading to v1.3" description: New features and changes in dbt Core v1.3 +id: "upgrading-to-v1.3" +displayed_sidebar: "docs" --- + +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.3.latest/CHANGELOG.md) diff --git a/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md b/website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.2.md similarity index 96% rename from website/docs/guides/migration/versions/06-upgrading-to-v1.2.md rename to website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.2.md index 91ffadf9093..16825ff4e2b 100644 --- a/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md +++ b/website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.2.md @@ -1,7 +1,14 @@ --- title: "Upgrading to v1.2" description: New features and changes in dbt Core v1.2 +id: "upgrading-to-v1.2" +displayed_sidebar: "docs" --- + +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.2.latest/CHANGELOG.md) diff --git a/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md b/website/docs/docs/dbt-versions/core-upgrade/07-upgrading-to-v1.1.md similarity index 94% rename from website/docs/guides/migration/versions/07-upgrading-to-v1.1.md rename to website/docs/docs/dbt-versions/core-upgrade/07-upgrading-to-v1.1.md index 131ecc97657..403264a46e6 100644 --- a/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md +++ b/website/docs/docs/dbt-versions/core-upgrade/07-upgrading-to-v1.1.md @@ -1,7 +1,14 @@ --- title: "Upgrading to v1.1" description: New features and changes in dbt Core v1.1 +id: "upgrading-to-v1.1" +displayed_sidebar: "docs" --- + +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.1.latest/CHANGELOG.md) @@ -14,7 +21,7 @@ There are no breaking changes for code in dbt projects and packages. 
We are comm ### For maintainers of adapter plugins -We have reworked the testing suite for adapter plugin functionality. For details on the new testing suite, see: [Testing a new adapter](/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter). +We have reworked the testing suite for adapter plugin functionality. For details on the new testing suite, refer to the "Test your adapter" step in the [Build, test, document, and promote adapters](/guides/adapter-creation) guide. The abstract methods `get_response` and `execute` now only return `connection.AdapterReponse` in type hints. Previously, they could return a string. We encourage you to update your methods to return an object of class `AdapterResponse`, or implement a subclass specific to your adapter. This also gives you the opportunity to add fields specific to your adapter's query execution, such as `rows_affected` or `bytes_processed`. diff --git a/website/docs/guides/migration/versions/08-upgrading-to-v1.0.md b/website/docs/docs/dbt-versions/core-upgrade/08-upgrading-to-v1.0.md similarity index 90% rename from website/docs/guides/migration/versions/08-upgrading-to-v1.0.md rename to website/docs/docs/dbt-versions/core-upgrade/08-upgrading-to-v1.0.md index 9fc7991c087..3f45e44076c 100644 --- a/website/docs/guides/migration/versions/08-upgrading-to-v1.0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/08-upgrading-to-v1.0.md @@ -1,7 +1,14 @@ --- title: "Upgrading to v1.0" description: New features and changes in dbt Core v1.0 +id: "upgrading-to-v1.0" +displayed_sidebar: "docs" --- + +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Discourse](https://discourse.getdbt.com/t/3180) @@ -44,7 +51,7 @@ Global project macros have been reorganized, and some old unused macros have bee ### For users of adapter plugins -- **BigQuery:** Support for [ingestion-time-partitioned tables](/guides/legacy/creating-date-partitioned-tables) has been officially deprecated in favor of modern approaches. Use `partition_by` and incremental modeling strategies instead. +- **BigQuery:** Support for ingestion-time-partitioned tables has been officially deprecated in favor of modern approaches. Use `partition_by` and incremental modeling strategies instead. For more information, refer to [Incremental models](/docs/build/incremental-models). ### For maintainers of plugins + other integrations @@ -64,9 +71,9 @@ Several under-the-hood changes from past minor versions, tagged with deprecation ## New features and changed documentation - Add [metrics](/docs/build/metrics), a new node type -- [Generic tests](/guides/best-practices/writing-custom-generic-tests) can be defined in `tests/generic` (new), in addition to `macros/` (as before) +- [Generic tests](/best-practices/writing-custom-generic-tests) can be defined in `tests/generic` (new), in addition to `macros/` (as before) - [Parsing](/reference/parsing): partial parsing and static parsing have been turned on by default. - [Global configs](/reference/global-configs/about-global-configs) have been standardized. Related updates to [global CLI flags](/reference/global-cli-flags) and [`profiles.yml`](/docs/core/connect-data-platform/profiles.yml). - [The `init` command](/reference/commands/init) has a whole new look and feel. It's no longer just for first-time users. -- Add `result:` subselectors for smarter reruns when dbt models have errors and tests fail. 
See examples: [Pro-tips for Workflows](/guides/legacy/best-practices#pro-tips-for-workflows) +- Add `result:` subselectors for smarter reruns when dbt models have errors and tests fail. See examples: [Pro-tips for Workflows](/best-practices/best-practice-workflows#pro-tips-for-workflows) - Secret-prefixed [env vars](/reference/dbt-jinja-functions/env_var) are now allowed only in `profiles.yml` + `packages.yml` diff --git a/website/docs/guides/migration/versions/09-upgrading-to-v0.21.md b/website/docs/docs/dbt-versions/core-upgrade/09-upgrading-to-v0.21.md similarity index 97% rename from website/docs/guides/migration/versions/09-upgrading-to-v0.21.md rename to website/docs/docs/dbt-versions/core-upgrade/09-upgrading-to-v0.21.md index e5fbdf3fc7c..d5b429132cd 100644 --- a/website/docs/guides/migration/versions/09-upgrading-to-v0.21.md +++ b/website/docs/docs/dbt-versions/core-upgrade/09-upgrading-to-v0.21.md @@ -1,8 +1,15 @@ --- title: "Upgrading to v0.21" +id: "upgrading-to-v0.21" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + + :::caution Unsupported version dbt Core v0.21 has reached the end of critical support. No new patch versions will be released, and it will stop running in dbt Cloud on June 30, 2022. Read ["About dbt Core versions"](/docs/dbt-versions/core) for more details. ::: diff --git a/website/docs/guides/migration/versions/10-upgrading-to-v0.20.md b/website/docs/docs/dbt-versions/core-upgrade/10-upgrading-to-v0.20.md similarity index 94% rename from website/docs/guides/migration/versions/10-upgrading-to-v0.20.md rename to website/docs/docs/dbt-versions/core-upgrade/10-upgrading-to-v0.20.md index 8b33bfa3879..9ff5695d5dc 100644 --- a/website/docs/guides/migration/versions/10-upgrading-to-v0.20.md +++ b/website/docs/docs/dbt-versions/core-upgrade/10-upgrading-to-v0.20.md @@ -1,8 +1,13 @@ --- title: "Upgrading to v0.20" - +id: "upgrading-to-v0.20" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + :::caution Unsupported version dbt Core v0.20 has reached the end of critical support. No new patch versions will be released, and it will stop running in dbt Cloud on June 30, 2022. Read ["About dbt Core versions"](/docs/dbt-versions/core) for more details. ::: @@ -28,7 +33,7 @@ dbt Core v0.20 has reached the end of critical support. 
No new patch versions wi - [Test Configs](/reference/test-configs) - [Test properties](/reference/resource-properties/tests) - [Node Selection](/reference/node-selection/syntax) (with updated [test selection examples](/reference/node-selection/test-selection-examples)) -- [Writing custom generic tests](/guides/best-practices/writing-custom-generic-tests) +- [Writing custom generic tests](/best-practices/writing-custom-generic-tests) ### Elsewhere in Core - [Parsing](/reference/parsing): rework of partial parsing, introduction of experimental parser diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-11-0.md similarity index 95% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-11-0.md index e307c46fdf9..e91dde4c923 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-11-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.11.0" id: "upgrading-to-0-11-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ## Schema.yml v2 syntax dbt v0.11.0 adds an auto-generated docs site to your dbt project. To make effective use of the documentation site, you'll need to use the new "version 2" schema.yml syntax. For a full explanation of the version 2 syntax, check out the [schema.yml Files](/reference/configs-and-properties) section of the documentation. diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-12-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-12-0.md similarity index 76% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-12-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-12-0.md index 60900d3c1a4..b3d4e9d9bcb 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-12-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-12-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.12.0" id: "upgrading-to-0-12-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ## End of support Support for the `repositories:` block in `dbt_project.yml` (deprecated in 0.10.0) was removed. 
diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-13-0.md similarity index 94% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-13-0.md index 14a70e177e8..bb15d1a73b0 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-13-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.13.0" id: "upgrading-to-0-13-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ## Breaking changes ### on-run-start and on-run-end diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-14-0.md similarity index 99% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-14-0.md index 3b9c8560230..036a9a2aedf 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-14-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.14.0" id: "upgrading-to-0-14-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + This guide outlines migration instructions for: 1. [Upgrading archives to snapshots](#upgrading-to-snapshot-blocks) diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-14-1.md similarity index 98% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-14-1.md index a81740d5a68..215385acf0f 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-14-1.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.14.1" id: "upgrading-to-0-14-1" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + The dbt v0.14.1 release _does not_ contain any breaking code changes for users upgrading from v0.14.0. If you are upgrading from a version less than 0.14.0, consult the [Upgrading to 0.14.0](upgrading-to-0-14-0) migration guide. The following section contains important information for users of the `check` strategy on Snowflake and BigQuery. Action may be required in your database. 
## Changes to the Snapshot "check" algorithm diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-15-0.md similarity index 85% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-15-0.md index 02ab297c07a..5eba212590f 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-15-0.md @@ -1,10 +1,16 @@ --- title: "Upgrading to 0.15.0" id: "upgrading-to-0-15-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + The dbt v0.15.0 release contains a handful of breaking code changes for users upgrading from v0.14.0. + ## Breaking changes ### Stricter YML compilation @@ -20,7 +26,7 @@ expect this field will now return errors. See the latest ### Custom materializations -All materializations must now [manage dbt's Relation cache](/guides/advanced/creating-new-materializations#update-the-relation-cache). +All materializations must now manage dbt's Relation cache. For more information, refer to [Create new materializations](/guides/create-new-materializations). ### dbt Server diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-16-0.md similarity index 98% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-16-0.md index a34f23c4c89..076e6fc4e88 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-16-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.16.0" id: "upgrading-to-0-16-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + dbt v0.16.0 contains many new features, bug fixes, and improvements. This guide covers all of the important information to consider when upgrading from an earlier version of dbt to 0.16.0. diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-17-0.md similarity index 98% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-17-0.md index 1f891ebc0f4..5b863777df9 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-17-0.md @@ -1,9 +1,14 @@ --- title: "Upgrading to 0.17.0" id: "upgrading-to-0-17-0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + dbt v0.17.0 makes compilation more consistent, improves performance, and fixes a number of bugs. 
## Articles: diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-18-0.md similarity index 97% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-18-0.md index 8092ad807b8..545bfd41ac6 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-18-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.18.0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/dev/marian-anderson/CHANGELOG.md) diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-19-0.md similarity index 96% rename from website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md rename to website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-19-0.md index 0dd428780e0..db825d8af9c 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md +++ b/website/docs/docs/dbt-versions/core-upgrade/11-Older versions/upgrading-to-0-19-0.md @@ -1,8 +1,13 @@ --- title: "Upgrading to 0.19.0" +displayed_sidebar: "docs" --- +import UpgradeMove from '/snippets/_upgrade-move.md'; + + + ### Resources - [Discourse](https://discourse.getdbt.com/t/1951) @@ -23,7 +28,7 @@ See the docs below for more details. We don't expect these to require action in #### Deprecations -Removed support for `config-version: 1` of dbt_project.yml, which was deprecated in v0.17.0. Use `config-version: 2` in all projects and installed packages. Otherwise, dbt will raise an error. See docs on [config-version](/reference/project-configs/config-version) and the [v0.17.0 Migration Guide](/guides/migration/versions) for details. +Removed support for `config-version: 1` of dbt_project.yml, which was deprecated in v0.17.0. Use `config-version: 2` in all projects and installed packages. Otherwise, dbt will raise an error. See docs on [config-version](/reference/project-configs/config-version) and the [v0.17.0 Migration Guide](/docs/dbt-versions/core-upgrade) for details. ### For dbt plugin maintainers diff --git a/website/docs/docs/dbt-versions/core-versions.md b/website/docs/docs/dbt-versions/core-versions.md index 2a5ce6daeb7..2467f3c946b 100644 --- a/website/docs/docs/dbt-versions/core-versions.md +++ b/website/docs/docs/dbt-versions/core-versions.md @@ -2,6 +2,8 @@ title: "About dbt Core versions" id: "core" description: "Learn about semantic versioning for dbt Core, and how long those versions are supported." +pagination_next: "docs/dbt-versions/upgrade-core-in-cloud" +pagination_prev: null --- dbt Core releases follow [semantic versioning](https://semver.org/) guidelines. For more on how we use semantic versions, see [How dbt Core uses semantic versioning](#how-dbt-core-uses-semantic-versioning). @@ -82,7 +84,7 @@ Like many software projects, dbt Core releases follow [semantic versioning](http We are committed to avoiding breaking changes in minor versions for end users of dbt. 
There are two types of breaking changes that may be included in minor versions: -- Changes to the [Python interface for adapter plugins](/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter). These changes are relevant _only_ to adapter maintainers, and they will be clearly communicated in documentation and release notes. +- Changes to the Python interface for adapter plugins. These changes are relevant _only_ to adapter maintainers, and they will be clearly communicated in documentation and release notes. For more information, refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide. - Changes to metadata interfaces, including [artifacts](/docs/deploy/artifacts) and [logging](/reference/events-logging), signalled by a version bump. Those version upgrades may require you to update external code that depends on these interfaces, or to coordinate upgrades between dbt orchestrations that share metadata, such as [state-powered selection](/reference/node-selection/syntax#about-node-selection). ### How we version adapter plugins diff --git a/website/docs/docs/dbt-versions/experimental-features.md b/website/docs/docs/dbt-versions/experimental-features.md index 5ed0cf037ca..a621bd4ac44 100644 --- a/website/docs/docs/dbt-versions/experimental-features.md +++ b/website/docs/docs/dbt-versions/experimental-features.md @@ -3,6 +3,7 @@ title: "Preview new and experimental features in dbt Cloud" id: "experimental-features" sidebar_label: "Preview new dbt Cloud features" description: "Gain early access to many new dbt Labs experimental features by enabling this in your profile." +pagination_next: null --- dbt Labs often tests experimental features before deciding to continue on the [Product lifecycle](https://docs.getdbt.com/docs/dbt-versions/product-lifecycles#dbt-cloud). diff --git a/website/docs/docs/dbt-versions/release-notes.md b/website/docs/docs/dbt-versions/release-notes.md index db25af163ae..6f7be90e60d 100644 --- a/website/docs/docs/dbt-versions/release-notes.md +++ b/website/docs/docs/dbt-versions/release-notes.md @@ -2,6 +2,8 @@ title: "About dbt Cloud Release Notes" id: "dbt-cloud-release-notes" description: "Release notes for dbt Cloud" +pagination_next: null +pagination_prev: null --- dbt provides release notes for dbt Cloud so you can see recent and historical changes. Generally, you'll see release notes for these changes: diff --git a/website/docs/docs/dbt-versions/release-notes/02-Nov-2023/job-notifications-rn.md b/website/docs/docs/dbt-versions/release-notes/02-Nov-2023/job-notifications-rn.md new file mode 100644 index 00000000000..660129513d7 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/02-Nov-2023/job-notifications-rn.md @@ -0,0 +1,17 @@ +--- +title: "Enhancement: Email and Slack job notifications" +description: "November 2023: New quality-of-life improvements for setting up and administering email and Slack job notifications" +sidebar_label: "Enhancement: Job notifications" +sidebar_position: 10 +tags: [Nov-2023] +--- + +There are new quality-of-life improvements in dbt Cloud for email and Slack notifications about your jobs: + +- You can add external email addresses and send job notifications to them. External emails can be: + - Addresses that are outside of your dbt Cloud account + - Third-party integration addresses for configuring notifications to services like Microsoft Teams or PagerDuty +- You can configure notifications for multiple Slack channels. Previously, you could only configure one Slack channel.
+- Any account admin can now edit Slack notifications, not just the person who created them. + +To learn more, check out [Job notifications](/docs/deploy/job-notifications). \ No newline at end of file diff --git a/website/docs/docs/dbt-versions/release-notes/02-Nov-2023/microsoft-fabric-support-rn.md b/website/docs/docs/dbt-versions/release-notes/02-Nov-2023/microsoft-fabric-support-rn.md new file mode 100644 index 00000000000..13aefa80ffc --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/02-Nov-2023/microsoft-fabric-support-rn.md @@ -0,0 +1,18 @@ +--- +title: "New: Public Preview of Microsoft Fabric support in dbt Cloud" +description: "November 2023: Public Preview now available for Microsoft Fabric in dbt Cloud" +sidebar_label: "New: Public Preview of Microsoft Fabric support" +sidebar_position: 09 +tags: [Nov-2023] +--- + +Public Preview is now available in dbt Cloud for Microsoft Fabric! + +To learn more, check out the [Quickstart for dbt Cloud and Microsoft Fabric](/guides/microsoft-fabric?step=1). The guide walks you through: + +- Loading the Jaffle Shop sample data (provided by dbt Labs) into your Microsoft Fabric warehouse. +- Connecting dbt Cloud to Microsoft Fabric. +- Turning a sample query into a model in your dbt project. A model in dbt is a SELECT statement. +- Adding tests to your models. +- Documenting your models. +- Scheduling a job to run. \ No newline at end of file diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/api-v2v3-limit.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/api-v2v3-limit.md new file mode 100644 index 00000000000..9768886d5fb --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/api-v2v3-limit.md @@ -0,0 +1,15 @@ +--- +title: "API results limited to `100`" +id: "apiv3-limit" +description: "Oct 2023: In order to enhance the efficiency and stability of our services, we will limit all API results to `100` records. This limit is applicable to multi-tenant instances only." +sidebar_label: "Update: API results limited to `100`" +sidebar_position: 04 +tags: [Oct-2023, API] +--- + + +Beginning December 1, 2023, the [Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) v2 and v3 will expect you to limit all "list" or `GET` API methods to 100 results per API request. This limit enhances the efficiency and stability of our services. If you need to handle more than 100 results, then use the `limit` and `offset` query parameters to paginate those results; otherwise, you will receive an error. + +This maximum limit applies to [multi-tenant instances](/docs/cloud/about-cloud/regions-ip-addresses) only, and _does not_ apply to single tenant instances. + +Refer to the [API v3 Pagination](https://docs.getdbt.com/dbt-cloud/api-v3#/) or [API v2 Pagination](https://docs.getdbt.com/dbt-cloud/api-v2#/) sections for more information on how to paginate your API responses.
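The pagination requirement above is easiest to see with a small script. The following is a minimal Python sketch of the `limit`/`offset` pattern the note describes; the host, the `/runs/` endpoint, the token header format, and the `data` response key are assumptions for illustration rather than a definitive reference, so confirm them against the Administrative API docs for your instance.

```python
import requests

# Hypothetical values for illustration only.
API_BASE = "https://cloud.getdbt.com/api/v2"   # assumed multi-tenant host
ACCOUNT_ID = 12345                             # placeholder account ID
TOKEN = "<service-token>"                      # placeholder; never hardcode real tokens

def list_all_runs(page_size: int = 100) -> list:
    """Page through a list endpoint 100 records at a time using limit/offset."""
    headers = {"Authorization": f"Token {TOKEN}"}
    runs, offset = [], 0
    while True:
        resp = requests.get(
            f"{API_BASE}/accounts/{ACCOUNT_ID}/runs/",
            headers=headers,
            params={"limit": page_size, "offset": offset},
        )
        resp.raise_for_status()
        page = resp.json().get("data", [])
        runs.extend(page)
        if len(page) < page_size:  # a short page means there are no more results
            break
        offset += page_size
    return runs
```

Because requests for more than 100 records return an error after December 1, 2023, the sketch keeps the page size at the cap and advances `offset` instead of asking for a larger page.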
diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/cloud-cli-pp.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/cloud-cli-pp.md new file mode 100644 index 00000000000..d96b82636f8 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/cloud-cli-pp.md @@ -0,0 +1,31 @@ +--- +title: "New: dbt Cloud CLI in Public Preview" +description: "October 2023: Learn about the new dbt Cloud CLI development experience, now in public preview," +sidebar_position: 04 +sidebar_label: "New: dbt Cloud CLI in Public Preview" +tags: [Oct-2023, CLI, dbt Cloud] +date: 2023-10-17 +--- + +We are excited to announce the dbt Cloud CLI, the **unified command line for dbt**, is available in public preview. It’s a local development experience, powered by dbt Cloud. It’s easy to get started: `pip3 install dbt` or `brew install dbt` and you’re ready to go. + +We will continue to invest in the dbt Cloud IDE as the easiest and most accessible way to get started using dbt, especially for data analysts who have never developed software using the command line before. We will keep improving the speed, stability, and feature richness of the IDE, as we have been [all year long](https://www.getdbt.com/blog/improvements-to-the-dbt-cloud-ide/). + +We also know that many people developing in dbt have a preference for local development, where they can use their favorite terminal, text editor, keybindings, color scheme, and so on. This includes people with data engineering backgrounds, as well as those analytics engineers who started writing code in the dbt Cloud IDE and have expanded their skills. + +The new dbt Cloud CLI offers the best of both worlds, including: + +- The power of developing against the dbt Cloud platform +- The flexibility of your own local setup + +Run whichever community-developed plugins, pre-commit hooks, or other arbitrary scripts you like. + +Some of the unique capabilities of this dbt Cloud CLI include: + +- Automatic deferral of build artifacts to your Cloud project's production environment +- Secure credential storage in the dbt Cloud platform +- Support for dbt Mesh ([cross-project `ref`](/docs/collaborate/govern/project-dependencies)) +- Development workflow for dbt Semantic Layer +- Speedier, lower cost builds + +Refer to [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) to learn more. diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/custom-branch-fix-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/custom-branch-fix-rn.md new file mode 100644 index 00000000000..06550b7d863 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/custom-branch-fix-rn.md @@ -0,0 +1,14 @@ +--- +title: "Fix: Default behavior for CI job runs without a custom branch" +description: "October 2023: CI job runs now default to the main branch of the Git repository when a custom branch isn't set" +sidebar_label: "Fix: Default behavior for CI job runs without a custom branch" +tags: [Oct-2023, CI] +date: 2023-10-06 +sidebar_position: 08 +--- + +If you don't set a [custom branch](/docs/dbt-cloud-environments#custom-branch-behavior) for your dbt Cloud environment, it now defaults to the default branch of your Git repository (for example, `main`). Previously, [CI jobs](/docs/deploy/ci-jobs) would run for pull requests (PRs) that were opened against _any branch_ or updated with new commits if the **Custom Branch** option wasn't set.
+ +## Azure DevOps + +Your Git pull requests (PRs) might not trigger against your default branch if you're using Azure DevOps and the default branch isn't `main` or `master`. To resolve this, [set up a custom branch](/faqs/Environments/custom-branch-settings) with the branch you want to target. diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/dbt-deps-auto-install.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/dbt-deps-auto-install.md new file mode 100644 index 00000000000..80963a9d550 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/dbt-deps-auto-install.md @@ -0,0 +1,21 @@ +--- +title: "Enhancement: dbt Cloud auto-installs 'dbt deps' on startup" +description: "October 2023: The dbt Cloud IDE and dbt Cloud CLI auto-handle 'dbt deps' on startup; manual run needed for 'packages.yml' changes. Available for multi-tenant users (single-tenant support coming soon) and applies to all dbt versions." +sidebar_label: "Enhancement: dbt Cloud auto-installs 'dbt deps' on startup" +tags: [Oct-2023, IDE] +date: 2023-10-17 +sidebar_position: 06 +--- + +The dbt Cloud IDE and dbt Cloud CLI now automatically install `dbt deps` when your environment starts or when necessary. Previously, they would prompt you to run `dbt deps` during initialization. + +This improved workflow is available to all multi-tenant dbt Cloud users (single-tenant support coming next week) and applies to all dbt versions. + +However, you should still run the `dbt deps` command in these situations: + +- When you make changes to the `packages.yml` or `dependencies.yml` file during a session +- When you update the package version in the `packages.yml` or `dependencies.yml` file. +- If you edit the `dependencies.yml` file and the number of packages remains the same, run `dbt deps`. (Note that this is a known bug dbt Labs will fix in the future.) + + + diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/explorer-public-preview-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/explorer-public-preview-rn.md new file mode 100644 index 00000000000..ebf5add8d03 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/explorer-public-preview-rn.md @@ -0,0 +1,13 @@ +--- +title: "New: dbt Explorer Public Preview" +description: "October 2023: dbt Explorer is now available in Public Preview. You can use it to understand, improve, and leverage your dbt projects." +sidebar_label: "New: dbt Explorer Public Preview" +tags: [Oct-2023, Explorer] +date: 2023-10-13 +sidebar_position: 07 +--- + +On Oct 17, 2023, a Public Preview of dbt Explorer will become available to dbt Cloud customers. With dbt Explorer, you can view your project's resources (such as models, tests, and metrics) and their lineage — including interactive DAGs — to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources. + +For details, refer to [Explore your dbt projects](/docs/collaborate/explore-projects).
+ diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/native-retry-support-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/native-retry-support-rn.md new file mode 100644 index 00000000000..20e56879940 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/native-retry-support-rn.md @@ -0,0 +1,15 @@ +--- +title: "Enhancement: Native support for the dbt retry command" +description: "October 2023: Rerun errored jobs from start or from the failure point" +sidebar_label: "Enhancement: Support for dbt retry" +tags: [Oct-2023, Scheduler] +date: 2023-10-06 +sidebar_position: 10 +--- + +Previously in dbt Cloud, you could only rerun an errored job from start but now you can also rerun it from its point of failure. + +You can view which job failed to complete successfully, which command failed in the run step, and choose how to rerun it. To learn more, refer to [Retry jobs](/docs/deploy/retry-jobs). + + + diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/product-docs-sept-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/product-docs-sept-rn.md new file mode 100644 index 00000000000..3fdaa0eafe8 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/product-docs-sept-rn.md @@ -0,0 +1,38 @@ +--- +title: "September 2023 product docs updates" +id: "product-docs-sept" +description: "September 2023: The Product docs team merged 107 PRs, made various updates to dbt Cloud and Core, such as GAing continuous integration jobs, Semantic Layer GraphQL API doc, a new community plugin, and more" +sidebar_label: "Update: Product docs changes" +tags: [Sept-2023, product-docs] +date: 2023-10-10 +sidebar_position: 09 +--- + +Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun! First, we’d like to thank the 15 new community contributors to docs.getdbt.com. We merged [107 PRs](https://github.com/dbt-labs/docs.getdbt.com/pulls?q=is%3Apr+merged%3A2023-09-01..2023-09-31) in September. + +Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/): + +* Migrated docs.getdbt.com from Netlify to Vercel. + +## ☁ Cloud projects +- Continuous integration jobs are now generally available and no longer in beta! +- Added [Postgres PrivateLink set up page](/docs/cloud/secure/postgres-privatelink) +- Published beta docs for [dbt Explorer](/docs/collaborate/explore-projects). +- Added a new Semantic Layer [GraphQL API doc](/docs/dbt-cloud-apis/sl-graphql) and updated the [integration docs](/docs/use-dbt-semantic-layer/avail-sl-integrations) to include Hex. Responded to dbt community feedback and clarified MetricFlow use cases for dbt Core and dbt Cloud. +- Added an [FAQ](/faqs/Git/git-migration) describing how to migrate from one git provider to another in dbt Cloud. +- Clarified an example and added a [troubleshooting section](/docs/cloud/connect-data-platform/connect-snowflake#troubleshooting) to Snowflake connection docs to address common errors and provide solutions. + + +## 🎯 Core projects + +- Deprecated dbt Core v1.0 and v1.1 from the docs. +- Added configuration instructions for the [AWS Glue](/docs/core/connect-data-platform/glue-setup) community plugin. +- Revised the dbt Core quickstart, making it easier to follow. Divided this guide into steps that align with the [other guides](/guides/manual-install?step=1).
+ +## New 📚 Guides, ✏️ blog posts, and FAQs + +Added a [style guide template](/best-practices/how-we-style/6-how-we-style-conclusion#style-guide-template) that you can copy & paste to make sure you adhere to best practices when styling dbt projects! + +## Upcoming changes + +Stay tuned for a flurry of releases in October and a filterable guides section that will make guides easier to find! diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/sl-ga.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/sl-ga.md new file mode 100644 index 00000000000..a81abec5d42 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/sl-ga.md @@ -0,0 +1,27 @@ +--- +title: "Update: dbt Cloud Semantic Layer is Generally Available" +description: "October 2023: dbt Cloud Semantic Layer is Generally Available for all users" +sidebar_label: "Update: dbt Cloud Semantic Layer is GA" +sidebar_position: 05 +date: 2023-10-17 +tags: [Oct-2023] +--- + +:::important +If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher and [migrate](/guides/sl-migration) to the latest Semantic Layer. +::: + +dbt Labs is thrilled to announce that the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) is now generally available. It offers consistent data organization, improved governance, reduced costs, enhanced efficiency, and accessible data for better decision-making and collaboration across organizations. + +It aims to bring the best of modeling and semantics to downstream applications by introducing: + +- Brand new [integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) such as Tableau, Google Sheets, Hex, Mode, and Lightdash. +- New [Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) using GraphQL and JDBC to query metrics and build integrations. +- dbt Cloud [multi-tenant regional](/docs/cloud/about-cloud/regions-ip-addresses) support for North America, EMEA, and APAC. Single-tenant support coming soon. +- Use the APIs to call an export (a way to build tables in your data platform), then access them in your preferred BI tool. Starting from dbt v1.7 or higher, you will be able to schedule exports as part of your dbt job. + + + +The dbt Semantic Layer is available to [dbt Cloud Team or Enterprise](https://www.getdbt.com/) multi-tenant plans on dbt v1.6 or higher. +- Team and Enterprise customers can use 1,000 Queried Metrics per month for no additional cost on a limited trial basis, subject to reasonable use limitations. Refer to [Billing](/docs/cloud/billing#what-counts-as-a-queried-metric) for more information. +- dbt Cloud Developer plans and dbt Core users can define metrics but won't be able to query them with integrated tools. diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md new file mode 100644 index 00000000000..a8ae1ade65b --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md @@ -0,0 +1,42 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "September 2023: dbt Cloud now has two types of jobs -- deploy jobs and CI jobs -- with streamlined setup and improved efficiency. 
" +sidebar_label: "Update: Improvements to dbt jobs" +tags: [Sept-2023, CI] +date: 2023-09-11 +sidebar_position: 10 +--- + +dbt Cloud now has two distinct job types: [deploy jobs](/docs/deploy/deploy-jobs) for building production data assets, and [continuous integration (CI) jobs](/docs/deploy/ci-jobs) for checking code changes. These jobs perform fundamentally different tasks so dbt Labs improved the setup experience with better defaults for each. + +With two types of jobs, instead of one generic type, we can better guide you through the setup flow. Best practices are built into the default settings so you can go from curious to being set up in seconds. + + + +And, we now have more efficient state comparisons on CI checks: never waste a build or test on code that hasn’t been changed. We now diff between the Git pull request (PR) code and what’s running in production more efficiently with the introduction of deferral to an environment versus a job. To learn more, refer to [Continuous integration in dbt Cloud](/docs/deploy/continuous-integration). + +Below is a comparison table that describes how deploy jobs and CI jobs behave differently: + +| | Deploy Jobs | CI Jobs | +| --- | --- | --- | +| Purpose | Builds production data assets. | Builds and tests new code before merging changes into production. | +| Trigger types | Triggered by a schedule or by API. | Triggered by a commit to a PR or by API. | +| Destination | Builds into a production database and schema. | Builds into a staging database and ephemeral schema, lived for the lifetime of the PR. | +| Execution mode | Runs execute sequentially, so as to not have collisions on the underlying DAG. | Runs execute in parallel to promote team velocity. | +| Efficiency run savings | Detects over-scheduled jobs and cancels unnecessary runs to avoid queue clog. | Cancels existing runs when a newer commit is pushed to avoid redundant work. | +| State comparison | Only sometimes needs to detect state. | Almost always needs to compare state against the production environment to build on modified code and its dependents. | + + +## What you need to update + +- If you want to set up a CI environment for your jobs, dbt Labs recommends that you create your CI job in a dedicated [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. To learn more about these environment best practices, refer to the guide [Get started with continuous integration tests](/guides/set-up-ci). + +- If you had set up a CI job before October 2, 2023, the job might've been misclassified as a deploy job with this update. Below describes how to fix the job type: + + If you used the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) API endpoint but didn't set `"triggers":triggers.git_provider_webhook`, the job was misclassified as a deploy job and you must re-create it as described in [Trigger a CI job with the API](/docs/deploy/ci-jobs#trigger-a-ci-job-with-the-api). + + If you used the dbt Cloud UI but didn't enable the **Run on Pull Requests** option that was in the **Continuous Integration** (CI) tab, the job was misclassified as a deploy job and you must re-create it as described in [Set up CI jobs](/docs/deploy/ci-jobs#set-up-ci-jobs). + + To check for the job type, review your CI jobs in dbt Cloud's [Run History](/docs/deploy/run-visibility#run-history) and check for the **CI Job** tag below the job name. If it doesn't have this tag, it was misclassified and you need to re-create the job. 
+ + diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md new file mode 100644 index 00000000000..174de2bdaaf --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md @@ -0,0 +1,16 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "September 2023: Improved deletion of temporary schemas" +sidebar_label: "Update: Improved automatic deletion of temporary schemas" +tags: [Sept-2023, CI] +date: 2023-09-18 +sidebar_position: 08 +--- + +Temporary schemas are now being automatically deleted (dropped) for all adapters (like Databricks), PrivateLink connections, and environment variables in connection strings. + +dbt Labs has rearchitected how schema deletion works for [continuous integration (CI)](/docs/deploy/continuous-integration) runs. We created a new service to delete any schema with a prefix of `dbt_cloud_pr_` that's been generated by a PR run. + +However, temporary schemas will not be automatically deleted if: +- Your project overrides the [generate_schema_name macro](/docs/build/custom-schemas) but it doesn't contain the required prefix `dbt_cloud_pr_`. For details, refer to [Troubleshooting](/docs/deploy/ci-jobs#troubleshooting). +- You're using a [non-native Git integration](/docs/deploy/ci-jobs#trigger-a-ci-job-with-the-api). This is because automatic deletion relies on incoming webhooks from Git providers, which are only available through the native integrations. diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md new file mode 100644 index 00000000000..e8fb9539c50 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md @@ -0,0 +1,43 @@ +--- +title: "Summer 2023 product docs updates" +id: "product-docs-summer" +description: "Summer 2023: The Product docs team merged 256 PRs, made various updates to dbt Cloud and Core, such as adding What's New, writing Semantic Layer beta docs, releasing dbt 1.6 docs, and more!" +sidebar_label: "Update: Product docs changes" +tags: [July-2023, Aug-2023, product-docs] +date: 2023-09-13 +sidebar_position: 09 +--- + +Hello from dbt's Product Documentation team (the stewards of the docs.getdbt.com site): @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun. What a busy summer! We merged 256 PRs between July 1st and August 31st. + +We'd like to recognize all of the docs and support from our partner team, Developer Experience: @jasnonaz @gwenwindflower @dbeatty10 @dataders @joellabes @Jstein77 @dave-connors-3! + +We'd also like to give a special thanks to the 22 community members who contributed to the [dbt Product docs](https://docs.getdbt.com) for the first time. :pray: Based on feedback from the dbt community, we made these changes: + +- Added a [permissions table](/docs/cloud/manage-access/enterprise-permissions) for Enterprise accounts +- Added a [browser session page](/docs/cloud/about-cloud/browsers#browser-sessions) that clarifies dbt Cloud’s browser session time and when it logs users off. + +You can provide feedback by opening a pull request or issue in [our repo](https://github.com/dbt-labs/docs.getdbt.com) or reaching out in the dbt community Slack channel [#dbt-product-docs](https://getdbt.slack.com/archives/C0441GSRU04).
+ +## :zap: General docs projects + +* Added the ability to collapse sections you’re not currently looking at. There were quite a few people who wanted this, and it bugged us too, so we were happy to get this shipped! +* Introduced the idea of [“Trusted” adapters](/docs/supported-data-platforms#types-of-adapters). + +## ☁ Cloud projects + +* The **What’s new?** product update widget is back in the dbt Cloud UI! The Docs team will begin updating the content to keep you informed about new features. +* Launched the re-released [Semantic Layer beta docs](/docs/use-dbt-semantic-layer/dbt-sl), which introduce users to the new API, a new guide to set up MetricFlow and the new Semantic Layer, and a revamped ‘Use the dbt Semantic Layer’ section. +* Updated [Admin API v2 and v3](/docs/dbt-cloud-apis/admin-cloud-api) to help you understand the differences between them and which version includes the endpoints you use. +* To improve discoverability, the docs team made changes to the [deploy dbt sidebar](/docs/deploy/deployments). We added cards and aligned better with the dbt Cloud UI and the way it’s used. +* Deprecated legacy job schemas in the [Discovery API](/docs/dbt-cloud-apis/discovery-api). +* Added a page to describe [experimental and beta features](/docs/dbt-versions/experimental-features) in dbt Cloud and what you need to know about them. +* Added a section to introduce a new beta feature [**Extended Attributes**](/docs/dbt-cloud-environments#extended-attributes-beta), which allows users to set a flexible `profiles.yml` snippet in their dbt Cloud Environment settings. +## 🎯 Core projects + +* We released [dbt 1.6](/docs/dbt-versions/core-upgrade/upgrading-to-v1.6)! We added docs for the new commands `dbt retry` and `dbt clone`. + +## New 📚 Guides, ✏️ blog posts, and FAQs +* Check out how these community members use the dbt community in the [Community spotlight](/community/spotlight). +* Blog posts published this summer include [Optimizing Materialized Views with dbt](/blog/announcing-materialized-views), [Data Vault 2.0 with dbt Cloud](/blog/data-vault-with-dbt-cloud), and [Create dbt Documentation and Tests 10x faster with ChatGPT](/blog/create-dbt-documentation-10x-faster-with-chatgpt) +- We now have two new best practice guides: [How we build our metrics](/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) and [Set up Continuous Integration](/guides/set-up-ci). diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md new file mode 100644 index 00000000000..0b588376c34 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md @@ -0,0 +1,15 @@ +--- +title: "Update: Removing old (prerelease) versions of dbt from dbt Cloud when (latest) is available" +description: "Sept 2023: Improving the version selection options by removing prerelease versions whenever the latest version is available." +sidebar_label: "Update: Removing old prerelease versions from dbt Cloud" +tags: [Sept-2023, Versions] +date: 2023-09-26 +sidebar_position: 07 +--- + +Previously, when dbt Labs released a new [version](/docs/dbt-versions/core#how-dbt-core-uses-semantic-versioning) in dbt Cloud, the older patch _prerelease_ version and the _latest_ version remained as options in the dropdown menu available in the **Environment settings**.
Now, when the _latest_ version is released, the _prerelease_ version will be removed and all customers remaining on it will be migrated seamlessly. There will be no interruptions to service when this migration occurs. + +To see which version you are currently using and to upgrade, select **Deploy** in the top navigation bar and select **Environments**. Choose the preferred environment and click **Settings**. Click **Edit** to make a change to the current dbt version. dbt Labs recommends always using the latest version whenever possible to take advantage of new features and functionality. + + + \ No newline at end of file diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md new file mode 100644 index 00000000000..cd088b92fab --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md @@ -0,0 +1,126 @@ +--- +title: "Deprecation: Query patterns and endpoints in the dbt Cloud Discovery API" +description: "August 2023: Learn about the upcoming deprecation of certain endpoints and query patterns in the Discovery API." +sidebar_position: 6 +sidebar_label: "Deprecation: Certain Discovery API endpoints and query patterns" +tags: [Aug-2023, API] +date: 2023-08-31 +--- + +dbt Labs has deprecated and will be deprecating certain query patterns and replacing them with new conventions to enhance the performance of the dbt Cloud [Discovery API](/docs/dbt-cloud-apis/discovery-api). + +All these changes will be in effect on _September 7, 2023_. + +We understand that these changes might require adjustments to your existing integration with the Discovery API. Please [contact us](mailto:support@getdbt.com) with any questions. We're here to help you during this transition period. + +## Job-based queries + +Job-based queries that use the data type `Int` for IDs will be deprecated. They will be marked as deprecated in the [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql). The new convention will be for you to use the data type `BigInt` instead. + +This change will be in effect starting September 7, 2023. + + +Example of query before deprecation: + +```graphql +query ($jobId: Int!) { + models(jobId: $jobId){ + uniqueId + } +} +``` + +Example of query after deprecation: + +```graphql +query ($jobId: BigInt!) { + job(id: $jobId) { + models { + uniqueId + } + } +} +``` + +## modelByEnvironment queries + +The `modelByEnvironment` object has been renamed and moved into the `environment` object. This change is in effect and has been since August 15, 2023. + +Example of query before deprecation: + +```graphql +query ($environmentId: Int!, $uniqueId: String) { + modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } +} +``` + +Example of query after deprecation: + +```graphql +query ($environmentId: BigInt!, $uniqueId: String) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } + } + } +} +``` + + +## Environment and account queries + +Environment and account queries that use `Int` as a data type for ID have been deprecated. IDs must now be in `BigInt`. This change is in effect and has been since August 15, 2023. + + +Example of query before deprecation: + +```graphql +query ($environmentId: Int!, $first: Int!) 
{ + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + + +Example of query after deprecation: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + + diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md new file mode 100644 index 00000000000..10baa5cd6d7 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md @@ -0,0 +1,38 @@ +--- +title: "Update: Cloud IDE v1.2 includes a new service" +description: "August 2023: Cloud IDE now uses dbt-server to provide more reliable service and dbt Core feature parity, including support for commands like `dbt list`." +sidebar_label: "Update: Cloud IDE v1.2" +tags: [Aug-2023, IDE] +date: 2023-08-03 +sidebar_position: 8 +--- + +We're excited to announce that we replaced the backend service that powers the Cloud IDE with a more reliable server -- dbt-server. Because this release contains foundational changes, IDE v1.2 requires dbt v1.6 or higher. This significant update follows the rebuild of the IDE frontend last year. We're committed to improving the IDE to provide you with a better experience. + +Previously, the Cloud IDE used dbt-rpc, an outdated service that was unable to stay up-to-date with changes from dbt-core. The dbt-rpc integration used legacy dbt-core entry points and logging systems, causing it to be sluggish, brittle, and poorly tested. The Core team had been working around this outdated technology to avoid breaking it, which prevented them from developing with velocity and confidence. + +## New features + +- **Better dbt-core parity:** The Cloud IDE has better command parity with dbt-core, including support for commands like `dbt list` and improved treatment of flags like `--vars`, `--fail-fast`, etc. +- **Improved maintainability:** With the new dbt-server, it's easier to fix bugs and improve the overall quality of the product. With dbt-rpc, fixing bugs was a time-consuming and challenging process that required extensive testing. With the new service, we can identify and fix bugs more quickly, resulting in a more stable and reliable IDE. +- **A more reliable service:** Simplified architecture that's less prone to failure. + +### Product refinements + +- Improved `Preview` capabilities with Core v1.6 + IDE v1.2. [This Loom](https://www.loom.com/share/12838feb77bf463c8585fc1fc6aa161b) provides more information. 
+ +### Bug fixes + +- Global page can become "inert" and stop handling clicks +- Switching back and forth between files in the git diff view can cause overwrite +- Browser gets stuck during markdown preview for doc with large table +- Editor right click menu is offset +- Unable to Cancel on the Save New File component when Closing All Files in the IDE +- Mouse flicker in the modal's file tree makes it difficult to select a folder where you want to save a new file +- Snapshots not showing in Lineage when inside a subfolder with a mixed-case name +- Tooltips do not work for Format and Save +- When a dbt invocation is in progress or if parsing is ongoing, attempting to switch branches will cause the `Git Branch` dropdown to close automatically + +### Known issues + +- `{{this}}` function does not display properly in preview/compile with dbt-server diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md new file mode 100644 index 00000000000..f44fd57aa4a --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md @@ -0,0 +1,65 @@ +--- +title: "Enhancement: Revamped dbt Semantic Layer available in public beta" +description: "August 2023: The revamped dbt Semantic Layer, now available in public beta, introduces new semantic components and evolves the semantic layer's capability." +sidebar_label: "Enhancement: Revamped dbt Semantic Layer in public beta" +tags: [Aug-2023, dbt Semantic Layer] +date: 2023-08-03 +sidebar_position: 7 +--- + +:::important +If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/sl-migration) for more info. +::: + +dbt Labs is thrilled to announce the re-release of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), now available in [public beta](#public-beta). It aims to bring the best of modeling and semantics to downstream applications by introducing: + +- [MetricFlow](/docs/build/about-metricflow), a framework for constructing performant and legible SQL from an all-new set of semantic constructs, which include semantic models, entities, and metrics. +- New Semantic Layer infrastructure that enables support for more data platforms (Snowflake, Databricks, BigQuery, Redshift, and soon more), along with improved performance. +- New and improved [developer workflows](/guides/sl-migration), governance, and collaboration features. +- New [Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) using JDBC to query metrics and build integrations. + +With semantics at its core, the dbt Semantic Layer marks a crucial milestone towards a new era of centralized logic and data applications. + + + +## Enhanced dbt Semantic Layer + +What sets the dbt Semantic Layer apart is its ability to centralize logic for many downstream data applications, streamlining access and governance and enabling more efficient utilization of data models. It provides a consistent view of data while simplifying complex tasks in downstream applications and reducing the costs of and barriers to data access.
+ +We are excited to present several important capabilities with the enhanced dbt Semantic Layer: + +- **Consistent organization**: Provides a consistent view of data, ensuring that metrics and definitions match across the organization and the breadth of interfaces where data is consumed. This fosters trust in data and drives better decision-making by eliminating inconsistencies and errors that come up when individual users define metrics independently. + +- **Improved governance**: The dbt Semantic Layer ensures proper governance and auditing of data changes, providing an auditable record of modifications and clear ownership. This saves time by making it clear who can create and manage new metrics, ensuring accountability and data integrity. + +- **Reduced costs**: The dbt Semantic Layer simplifies complex tasks, such as bridging entities across a semantic graph. Users often duplicate slices and dices of data and make them available in a data platform, which is difficult to manage and drives up compute costs. The dbt Semantic Layer minimizes duplication of work and reduces computational costs, allowing users to focus on analyzing data rather than navigating intricate technical processes or duplicating work. + +- **Enhanced efficiency**: With the dbt Semantic Layer, data teams can create and update metrics using a new set of validations that make defining and iterating on metrics efficient. The streamlined development workflow makes it simpler for a data team to serve large organizations with broad data needs. + +- **Accessible data**: Defining common metrics and dimensions and making them joinable makes access simpler for users with less expertise in the specifics of a company's data modeling work. This creates opportunities to leverage data insights, fostering collaboration and driving innovation in a more inclusive data environment. + +By bringing these enhancements to the dbt Semantic Layer, we enable organizations of all sizes and industries to leverage the power of semantics in their data workflows. + +## Public beta + +The dbt Semantic Layer is currently available as a public beta, which means: + +- **Who** — To experience the new dbt Semantic Layer, you must be on a dbt Cloud [Team and Enterprise](https://www.getdbt.com/pricing/) multi-tenant dbt Cloud plan, [hosted](/docs/cloud/about-cloud/regions-ip-addresses) in North America and on dbt v1.6 and higher. Look out for announcements on removing the location requirement soon. + + - Developer plans or dbt Core users can use MetricFlow to define and test metrics using the dbt MetricFlow CLI only. + +- **What** — Public beta provides early access to new features. The dbt Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to general availability later this year. We may also introduce new functionality that isn't backwards compatible. We provide support, and relevant service level objectives (SLOs) apply. If you have any questions on pricing, please reach out to your account representative. + +- **When** — Public beta starts on August 1st, 2023. + +- **Where** — You can experience the dbt Semantic Layer in dbt Cloud. Public beta is enabled at the account level so you don’t need to worry about enabling it per user.
+ +## Next steps + +To experience the universal dbt Semantic Layer and its enhanced beta capabilities, check out: + +- [Introducing the new dbt Semantic Layer](https://www.getdbt.com/blog/introducing-new-look-dbt-semantic-layer) +- [dbt Semantic Layer docs](/docs/use-dbt-semantic-layer/dbt-sl) +- [dbt Semantic Layer get started guide](/docs/use-dbt-semantic-layer/quickstart-sl) +- [Build your metrics with MetricFlow](/docs/build/build-metrics-intro) + diff --git a/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md b/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md index 0f88f1d2fa8..ba82234c0b5 100644 --- a/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md +++ b/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md @@ -2,7 +2,7 @@ title: "Enhancement: Faster run starts and unlimited job concurrency" description: "We have enhanced the dbt Cloud Scheduler by reducing prep time for all accounts and provided unlimited job concurrency for Enterprise accounts." sidebar_label: "Enhancement: Faster run starts and unlimited job concurrency" -tags: [07-2023, scheduler] +tags: [July-2023, scheduler] date: 2023-07-06 sidebar_position: 10 --- diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md index c4caf42f355..fa02a6d9bd8 100644 --- a/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md @@ -1,17 +1,17 @@ --- title: "Update: Improvements to dbt Cloud continuous integration" -description: "dbt Cloud's CI checks now run in parallel, will not block production runs, and stale runs are automatically cancelled when a newer commit is pushed." +description: "dbt Cloud's CI checks now run in parallel, will not block production runs, and stale runs are automatically canceled when a newer commit is pushed." sidebar_label: "Update: Improvements to continuous integration" tags: [June-2023, CI] date: 2023-06-20 sidebar_position: 8 --- -dbt Cloud Slim CI is a critical part of the analytics engineering workflow. Large teams rely on process to ensure code quality is high, and they look to dbt Cloud CI to automate testing code changes in an efficient way, enabling speed while keep the bar high. With status checks directly posted to their dbt PRs, developers gain the confidence that their code changes will work as expected in production, and once you’ve grown accustomed to seeing that green status check in your PR, you won’t be able to work any other way. +dbt Cloud CI is a critical part of the analytics engineering workflow. Large teams rely on process to ensure code quality is high, and they look to dbt Cloud CI to automate testing code changes in an efficient way, enabling speed while keeping the bar high. With status checks directly posted to their dbt PRs, developers gain the confidence that their code changes will work as expected in production, and once you’ve grown accustomed to seeing that green status check in your PR, you won’t be able to work any other way.
dbt Cloud aims to make each CI check as efficient as possible, so as to not waste any data warehouse resources. As soon as the Slim CI run completes, its status posts directly back to the PR in GitHub, GitLab, or Azure DevOps, depending on which Git provider you’re using. Teams can set up guardrails to let only PRs with successful CI checks be approved for merging, and the peer review process is greatly streamlined because dbt Cloud does the first testing pass. +What separates dbt Cloud CI from other CI providers is its ability to keep track of state of what’s running in your production environment, so that when you run a CI job, only the modified data assets in your pull request and their downstream dependencies get built and tested in a staging schema. dbt Cloud aims to make each CI check as efficient as possible, so as to not waste any data warehouse resources. As soon as the CI run completes, its status posts directly back to the PR in GitHub, GitLab, or Azure DevOps, depending on which Git provider you’re using. Teams can set up guardrails to let only PRs with successful CI checks be approved for merging, and the peer review process is greatly streamlined because dbt Cloud does the first testing pass. We're excited to introduce a few critical capabilities to dbt Cloud CI that will improve productivity and collaboration in your team’s testing and integration workflow. As of this week, you can now: @@ -21,4 +21,4 @@ We're excited to introduce a few critical capabilities to dbt Cloud CI that will - **Run CI checks without blocking production runs**. CI checks will no longer consume run slots, meaning you can have as many CI checks running as you want, without impeding your production jobs. -To learn more, refer to [Continuous integration](/docs/deploy/continuous-integration) and [Slim CI jobs](/docs/deploy/slim-ci-jobs). +To learn more, refer to [Continuous integration](/docs/deploy/continuous-integration) and [CI jobs](/docs/deploy/ci-jobs). diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md index 9217736a2d8..db73597cd63 100644 --- a/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md @@ -13,11 +13,11 @@ Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/) in June: ## ☁ Cloud projects -- We clarified the nuances of [CI and Slim CI jobs](/docs/deploy/continuous-integration), updated the [Scheduler content](/docs/deploy/job-scheduler), added two new pages for the job settings and run visibility, moved the project state page to the [Syntax page](/reference/node-selection/syntax), and provided a landing page for [Deploying with Cloud](/docs/deploy/dbt-cloud-job) to help readers navigate the content better. +- We clarified the nuances of [CI and CI jobs](/docs/deploy/continuous-integration), updated the [Scheduler content](/docs/deploy/job-scheduler), added two new pages for the job settings and run visibility, moved the project state page to the [Syntax page](/reference/node-selection/syntax), and provided a landing page for [Deploying with Cloud](/docs/deploy/jobs) to help readers navigate the content better. 
- We reformatted the [Supported data platforms page](/docs/supported-data-platforms) by adding dbt Cloud to the page, splitting it into multiple pages, using cards to display verified adapters, and moving the [Warehouse setup pages](/docs/core/connect-data-platform/about-core-connections) to the Docs section. - We launched a new [Lint and format page](/docs/cloud/dbt-cloud-ide/lint-format), which highlights the awesome new dbt Cloud IDE linting/formatting function. - We enabled a connection between [dbt Cloud release notes](/docs/dbt-versions/dbt-cloud-release-notes) and the dbt Slack community. This means new dbt Cloud release notes are automatically sent to the slack community [#dbt-cloud channel](https://getdbt.slack.com/archives/CMZ2V0X8V) via RSS feed, keeping users up to date with changes that may affect them. -- We’ve added two new docs links in the dbt Cloud Job settings user interface (UI). This will provide additional guidance and help users succeed when setting up a dbt Cloud job: [job commands](/docs/deploy/job-commands) and [job triggers](/docs/deploy/job-triggers). +- We’ve added two new docs links in the dbt Cloud Job settings user interface (UI). This will provide additional guidance and help users succeed when setting up a dbt Cloud job: [job commands](/docs/deploy/job-commands) and job triggers. - We added information related to the newly created [IT license](/docs/cloud/manage-access/about-user-access#license-based-access-control), available for Team and Enterprise plans. - We added a new [Supported browser page](/docs/cloud/about-cloud/browsers), which lists the recommended browsers for dbt Cloud. - We launched a new page informing users of [new Experimental features option](/docs/dbt-versions/experimental-features) in dbt Cloud. @@ -32,4 +32,4 @@ Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/) in June: ## New 📚 Guides, ✏️ blog posts, and FAQs -- Add an Azure DevOps example to the [Customizing CI/CD guide](/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge). +- Add an Azure DevOps example in the [Customizing CI/CD with custom pipelines](/guides/custom-cicd-pipelines) guide. diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md index 5503b40576d..d85ffa154dd 100644 --- a/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md @@ -1,46 +1,46 @@ ---- -title: "May IDE updates and fixes" -id: "may-ide-updates" -description: "May 2023 release note: We've launched SQLFluff in beta, released an IDE UI page, significantly improved IDE performance, improved error messages, fixed bugs, and more." -sidebar_label: "Update and fixes: IDE" -sidebar_position: 2 -tags: [May-2023, IDE] ---- - -To continue improving your [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) development experience, the dbt Labs team continues to work on adding new features, fixing bugs, and increasing reliability ✨. - -Stay up-to-date with [IDE-related changes](/tags/ide). 
- -## New features -- Lint via SQL Fluff is now available in beta (GA over the next 2-3 weeks) -- Format markdown files with prettier -- Leverage developer experience shortcuts, including ``Ctrl + ` `` (toggle history drawer), `CMD + Option + /` (toggle block comment), `CMD + Shift + P` (open command palette), `Option + W` (close editor tab) -- Display parent folder name for files with same name in Changes section -- Navigate the new IDE features quickly using [the IDE User Interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) help page -- Use `top X` in SQL when previewing in the IDE -- Opt into the new IDE backend layer over the past month (still with dbt-rpc). Ready for beta later in June! - - -## Product refinements - -- Performance-related upgrades: - - Reduced cold start time by 60+% - - Improved render time of modals in the IDE by 98% - - Improved IDE performance with dbt Core v1.5+ (faster and snappier – highly encourage you to [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud)!) -- Upgraded sqlfmt (which powers the Format button) to 0.18.0 -- Updated Build button to change menu options based on file/model type (snapshot, macro, etc.) -- Display message to disable adblocker for file contents error -- Moved Format button to console bar -- Made many security enhancements in the IDE -## Bug fixes - -- File icon sizes no longer get wonky in small screen -- Toast notifications no longer take over command bar menu -- Hover info inside the text editor no longer gets cut off -- Transition between a file and a recently modified scratchpad no longer triggers a console error -- dbt v1.5+ now can access the IDE -- Confirm button on the Unsaved Changes modal now closes after clicking it -- Long node names no longer overflow in the parsed logs section in history drawer -- Status pill in history drawer no longer scales with longer command -- Tooltip for tab name with a long file name is no longer cut off -- Lint button should no longer available in main branch +--- +title: "May IDE updates and fixes" +id: "may-ide-updates" +description: "May 2023 release note: We've launched SQLFluff in beta, released an IDE UI page, significantly improved IDE performance, improved error messages, fixed bugs, and more." +sidebar_label: "Update and fixes: IDE" +sidebar_position: 2 +tags: [May-2023, IDE] +--- + +To continue improving your [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) development experience, the dbt Labs team continues to work on adding new features, fixing bugs, and increasing reliability ✨. + +Stay up-to-date with [IDE-related changes](/tags/ide). + +## New features +- Lint via SQL Fluff is now available in beta (GA over the next 2-3 weeks) +- Format markdown files with prettier +- Leverage developer experience shortcuts, including ``Ctrl + ` `` (toggle history drawer), `CMD + Option + /` (toggle block comment), `CMD + Shift + P` (open command palette), `Option + W` (close editor tab) +- Display parent folder name for files with same name in Changes section +- Navigate the new IDE features quickly using [the IDE User Interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) help page +- Use `top X` in SQL when previewing in the IDE +- Opt into the new IDE backend layer over the past month (still with dbt-rpc). Ready for beta later in June! 
+ + +## Product refinements + +- Performance-related upgrades: + - Reduced cold start time by 60+% + - Improved render time of modals in the IDE by 98% + - Improved IDE performance with dbt Core v1.5+ (faster and snappier – highly encourage you to [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud)!) +- Upgraded sqlfmt (which powers the Format button) to 0.18.0 +- Updated Build button to change menu options based on file/model type (snapshot, macro, etc.) +- Display message to disable adblocker for file contents error +- Moved Format button to console bar +- Made many security enhancements in the IDE +## Bug fixes + +- File icon sizes no longer get wonky in small screen +- Toast notifications no longer take over command bar menu +- Hover info inside the text editor no longer gets cut off +- Transition between a file and a recently modified scratchpad no longer triggers a console error +- dbt v1.5+ now can access the IDE +- Confirm button on the Unsaved Changes modal now closes after clicking it +- Long node names no longer overflow in the parsed logs section in history drawer +- Status pill in history drawer no longer scales with longer command +- Tooltip for tab name with a long file name is no longer cut off +- Lint button should no longer available in main branch diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md index 762a6a723f8..a692c901a80 100644 --- a/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md @@ -16,7 +16,7 @@ Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/) in May: - We made sure everyone knows that Cloud-users don’t need a [profiles.yml file](/docs/core/connect-data-platform/profiles.yml) by adding a callout on several key pages. - Fleshed out the [model jinja variable page](/reference/dbt-jinja-functions/model), which originally lacked conceptual info and didn’t link to the schema page. -- Added a new [Quickstarts landing page](/quickstarts). This new format sets up for future iterations that will include filtering! But for now, we are excited you can step through quickstarts in a focused way. +- Added a new [Quickstarts landing page](/guides). This new format sets up for future iterations that will include filtering! But for now, we are excited you can step through quickstarts in a focused way. ## ☁ Cloud projects diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md index 991fc9be1f4..3de29b605ce 100644 --- a/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md @@ -17,24 +17,24 @@ Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleo ## ☁ Cloud projects - Added Starburst/Trino adapter docs, including: - * [dbt Cloud quickstart guide](/quickstarts/starburst-galaxy),  + * [dbt Cloud quickstart guide](/guides/starburst-galaxy),  * [connection page](/docs/cloud/connect-data-platform/connect-starburst-trino),  * [set up page](/docs/core/connect-data-platform/trino-setup), and [config page](/reference/resource-configs/trino-configs). 
-- Enhanced [dbt Cloud jobs page](/docs/deploy/dbt-cloud-job) and section to include conceptual info on the queue time, improvements made around it, and about failed jobs. +- Enhanced [dbt Cloud jobs page](/docs/deploy/jobs) and section to include conceptual info on the queue time, improvements made around it, and about failed jobs. - Check out the April dbt [Cloud release notes](/docs/dbt-versions/dbt-cloud-release-notes) ## 🎯 Core projects - Clearer descriptions in the [Jinja functions page](/reference/dbt-jinja-functions), that improve content for each card.  -- [1.5 Docs](/guides/migration/versions/upgrading-to-v1.5) have been released as an RC! +- [1.5 Docs](/docs/dbt-versions/core-upgrade/upgrading-to-v1.5) have been released as an RC! - See the beautiful [work captured in Core v 1.5](https://github.com/dbt-labs/docs.getdbt.com/issues?q=is%3Aissue+label%3A%22dbt-core+v1.5%22+is%3Aclosed). ## New 📚 Guides and ✏️ blog posts -- [Use Databricks workflows to run dbt Cloud jobs](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs) -- [Refresh Tableau workbook with extracts after a job finishes](/guides/orchestration/webhooks/zapier-refresh-tableau-workbook) -- [dbt Python Snowpark workshop/tutorial](/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark) -- [How to optimize and troubleshoot dbt Models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks) -- [The missing guide to debug() in dbt](https://docs.getdbt.com/blog/guide-to-jinja-debug) -- [dbt Squared: Leveraging dbt Core and dbt Cloud together at scale](https://docs.getdbt.com/blog/dbt-squared) -- [Audit_helper in dbt: Bringing data auditing to a higher level](https://docs.getdbt.com/blog/audit-helper-for-migration) +- [Use Databricks workflows to run dbt Cloud jobs](/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs) +- [Refresh Tableau workbook with extracts after a job finishes](/guides/zapier-refresh-tableau-workbook) +- [dbt Python Snowpark workshop/tutorial](/guides/dbt-python-snowpark) +- [How to optimize and troubleshoot dbt Models on Databricks](/guides/optimize-dbt-models-on-databricks) +- [The missing guide to debug() in dbt](/blog/guide-to-jinja-debug) +- [dbt Squared: Leveraging dbt Core and dbt Cloud together at scale](/blog/dbt-squared) +- [Audit_helper in dbt: Bringing data auditing to a higher level](/blog/audit-helper-for-migration) diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md index 613a0c02432..708d51f0a44 100644 --- a/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md @@ -8,5 +8,5 @@ tags: [Apr-2023] The Starburst (Trino compatible) connection is now generally available in dbt Cloud. This means you can now use dbt Cloud to connect with Starburst Galaxy, Starburst Enterprise, and self-hosted Trino. This feature is powered by the [`dbt-trino`](https://github.com/starburstdata/dbt-trino) adapter. -To learn more, check out our Quickstart guide for [dbt Cloud and Starburst Galaxy](https://docs.getdbt.com/quickstarts/starburst-galaxy). +To learn more, check out our Quickstart guide for [dbt Cloud and Starburst Galaxy](https://docs.getdbt.com/guides/starburst-galaxy). 
diff --git a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md index b11bf702330..6b6f646e40e 100644 --- a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md +++ b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md @@ -17,5 +17,5 @@ Refer to some additional info and resources to help you upgrade your dbt version - [How to upgrade dbt without fear](https://docs.getdbt.com/blog/upgrade-dbt-without-fear) - [Upgrade Q&A on breaking changes](/docs/dbt-versions/upgrade-core-in-cloud#upgrading-legacy-versions-under-10) -- [Version migration guides](/guides/migration/versions) +- [Version migration guides](/docs/dbt-versions/core-upgrade) diff --git a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md index bf3840a8b02..06abf178b8a 100644 --- a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md +++ b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md @@ -8,7 +8,7 @@ tags: [Mar-2023] dbt Labs is introducing the newest connection option in dbt Cloud: the `dbt-trino` adapter is now available in Public Preview. This allows you to connect to Starburst Galaxy, Starburst Enterprise, and self-hosted Trino from dbt Cloud. -Check out our [Quickstart for dbt Cloud and Starburst Galaxy](/quickstarts/starburst-galaxy) to explore more. +Check out our [Quickstart for dbt Cloud and Starburst Galaxy](/guides/starburst-galaxy) to explore more. ## What’s the reason users should be excited about this? diff --git a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md index 0e26d8dc628..bdc89b4abde 100644 --- a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md +++ b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md @@ -8,7 +8,7 @@ tags: [Dec-2022] To help save compute time, new jobs will no longer be triggered to run by default. When you create a new job in dbt Cloud, you can trigger the job to run by selecting **Run on schedule** and completing the desired schedule and timing information. -For more information, refer to [Job triggers](/docs/deploy/job-triggers). +For more information, refer to [Deploy jobs](/docs/deploy/deploy-jobs). diff --git a/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md b/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md index 25d5ca5205f..012615e1e4e 100644 --- a/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md +++ b/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md @@ -8,6 +8,6 @@ tags: [Nov-2022, v1.1.66.15] dbt Cloud is the easiest and most reliable way to develop and deploy a dbt project. It helps remove complexity while also giving you more features and better performance. A simpler Databricks connection experience with support for Databricks’ Unity Catalog and better modeling defaults is now available for your use. 
-For all the Databricks customers already using dbt Cloud with the dbt-spark adapter, you can now [migrate](https://docs.getdbt.com/guides/migration/tools/migrating-from-spark-to-databricks#migration) your connection to the [dbt-databricks adapter](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to get the benefits. [Databricks](https://www.databricks.com/blog/2022/11/17/introducing-native-high-performance-integration-dbt-cloud.html) is committed to maintaining and improving the adapter, so this integrated experience will continue to provide the best of dbt and Databricks. +For all the Databricks customers already using dbt Cloud with the dbt-spark adapter, you can now [migrate](/guides/migrate-from-spark-to-databricks) your connection to the [dbt-databricks adapter](/docs/core/connect-data-platform/databricks-setup) to get the benefits. [Databricks](https://www.databricks.com/blog/2022/11/17/introducing-native-high-performance-integration-dbt-cloud.html) is committed to maintaining and improving the adapter, so this integrated experience will continue to provide the best of dbt and Databricks. Check out our [live blog post](https://www.getdbt.com/blog/dbt-cloud-databricks-experience/) to learn more. diff --git a/website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md b/website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md index b8e15b993de..a6b68cf9d51 100644 --- a/website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md +++ b/website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md @@ -197,7 +197,7 @@ initial support for a GitLab integration and self-service RBAC configuration. ## dbt Cloud v1.1.7 [September 3, 2020] This release adds a Release Candidate for [dbt -v0.18.0](/guides/migration/versions) and +v0.18.0](/docs/dbt-versions/core-upgrade) and includes bugfixes and improvements to the Cloud IDE and job scheduler. diff --git a/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md b/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md index d143aab5ef1..e46294029ec 100644 --- a/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md +++ b/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md @@ -47,7 +47,7 @@ For more on version support and future releases, see [Understanding dbt Core ver #### Need help upgrading? -If you want more advice on how to upgrade your dbt projects, check out our [migration guides](/guides/migration/versions/) and our [upgrading Q&A page](/docs/dbt-versions/upgrade-core-in-cloud#upgrading-legacy-versions-under-10). +If you want more advice on how to upgrade your dbt projects, check out our [migration guides](/docs/dbt-versions/core-upgrade/) and our [upgrading Q&A page](/docs/dbt-versions/upgrade-core-in-cloud#upgrading-legacy-versions-under-10). ## Upgrading legacy versions under 1.0 @@ -96,7 +96,7 @@ clean-targets: - Do you have custom scripts that parse dbt artifacts? - (BigQuery only) Do you use dbt's legacy capabilities around ingestion-time-partitioned tables? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions/upgrading-to-v1.0). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade/upgrading-to-v1.0).
    @@ -109,7 +109,7 @@ If you believe your project might be affected, read more details in the migratio - Do you have custom scripts that parse dbt JSON artifacts? - (Snowflake only) Do you have custom macros or materializations that depend on using transactions, such as statement blocks with `auto_begin=True`? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade).
    @@ -123,7 +123,7 @@ If you believe your project might be affected, read more details in the migratio - Does your project use `adapter.dispatch` or the `spark_utils` package? - Do you have custom scripts that parse dbt JSON artifacts? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade).
    @@ -146,7 +146,7 @@ See **Upgrading to v0.17.latest from v0.16** below for more details. - Do you have custom scripts that parse dbt JSON artifacts? - Do you have any custom materializations? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade).
    @@ -157,7 +157,7 @@ If you believe your project might be affected, read more details in the migratio - Do you directly call `adapter_macro`? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade). @@ -235,7 +235,7 @@ models: ``` -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade). @@ -247,7 +247,7 @@ If you believe your project might be affected, read more details in the migratio - Do you use the custom `generate_schema_name` macro? - Do you use `partition_by` config for BigQuery models? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade). @@ -259,7 +259,7 @@ If you believe your project might be affected, read more details in the migratio - Do you have a custom materialization? - Do you have a macro that accesses `Relations` directly? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade).
    @@ -270,7 +270,7 @@ If you believe your project might be affected, read more details in the migratio - Do you use the custom `generate_schema_name` macro? - Do you use the `—non-destructive` flag? -If you believe your project might be affected, read more details in the migration guide [here](/guides/migration/versions). +If you believe your project might be affected, read more details in the migration guide [here](/docs/dbt-versions/core-upgrade).
    diff --git a/website/docs/docs/deploy/ci-jobs.md b/website/docs/docs/deploy/ci-jobs.md new file mode 100644 index 00000000000..149a6951fdc --- /dev/null +++ b/website/docs/docs/deploy/ci-jobs.md @@ -0,0 +1,164 @@ +--- +title: "Continuous integration jobs in dbt Cloud" +sidebar_label: "CI jobs" +description: "Learn how to create and set up CI checks to test code changes before deploying to production." +--- + +You can set up [continuous integration](/docs/deploy/continuous-integration) (CI) jobs to run when someone opens a new pull request (PR) in your dbt Git repository. By running and testing only _modified_ models, dbt Cloud ensures these jobs are as efficient and resource conscientious as possible on your data platform. + + +## Set up CI jobs {#set-up-ci-jobs} + +dbt Labs recommends that you create your CI job in a dedicated dbt Cloud [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. Having a separate environment dedicated for CI will provide better isolation between your temporary CI schema builds and your production data builds. Additionally, sometimes teams need their CI jobs to be triggered when a PR is made to a branch other than main. If your team maintains a staging branch as part of your release process, having a separate environment will allow you to set a [custom branch](/faqs/environments/custom-branch-settings) and, accordingly, the CI job in that dedicated environment will be triggered only when PRs are made to the specified custom branch. To learn more, refer to [Get started with CI tests](/guides/set-up-ci). + +### Prerequisites +- You have a dbt Cloud account. +- For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your dbt Cloud account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You must be connected using dbt Cloud’s native Git integration with [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or [Azure DevOps](/docs/cloud/git/connect-azure-devops). + - With GitLab, you need a paid or self-hosted account which includes support for GitLab webhooks and [project access tokens](https://docs.gitlab.com/ee/user/project/settings/project_access_tokens.html). With GitLab Free, merge requests will invoke CI jobs but CI status updates (success or failure of the job) will not be reported back to GitLab. + - If you previously configured your dbt project by providing a generic git URL that clones using SSH, you must reconfigure the project to connect through dbt Cloud's native integration. + + +To make CI job creation easier, many options on the **CI job** page are set to default values that dbt Labs recommends that you use. If you don't want to use the defaults, you can change them. + +1. On your deployment environment page, click **Create Job** > **Continuous Integration Job** to create a new CI job. + +2. Options in the **Job Description** section: + - **Job Name** — Specify the name for this CI job. + - **Environment** — By default, it’s set to the environment you created the CI job from. + - **Triggered by pull requests** — By default, it’s enabled. Every time a developer opens up a pull request or pushes a commit to an existing pull request, this job will get triggered to run. 
+ - **Run on Draft Pull Request** — Enable this option if you want to also trigger the job to run every time a developer opens up a draft pull request or pushes a commit to that draft pull request. + +3. Options in the **Execution Settings** section: + - **Commands** — By default, it includes the `dbt build --select state:modified+` command. This informs dbt Cloud to build only new or changed models and their downstream dependents. Importantly, state comparison can only happen when there is a deferred environment selected to compare state to. Click **Add command** to add more [commands](/docs/deploy/job-commands) that you want to be invoked when this job runs. + - **Compare changes against an environment (Deferral)** — By default, it’s set to the **Production** environment if you created one. This option allows dbt Cloud to check the state of the code in the PR against the code running in the deferred environment, so as to only check the modified code, instead of building the full table or the entire DAG. + + :::info + Older versions of dbt Cloud only allow you to defer to a specific job instead of an environment. Deferral to a job compares state against the project code that was run in the deferred job's last successful run. While deferral to an environment is more efficient as dbt Cloud will compare against the project representation (which is stored in the `manifest.json`) of the last successful deploy job run that executed in the deferred environment. By considering _all_ [deploy jobs](/docs/deploy/deploy-jobs) that run in the deferred environment, dbt Cloud will get a more accurate, latest project representation state. + ::: + + - **Generate docs on run** — Enable this option if you want to [generate project docs](/docs/collaborate/build-and-view-your-docs) when this job runs. This option is disabled by default since most teams do not want to test doc generation on every CI check. + + + +4. (optional) Options in the **Advanced Settings** section: + - **Environment Variables** — Define [environment variables](/docs/build/environment-variables) to customize the behavior of your project when this CI job runs. You can specify that a CI job is running in a _Staging_ or _CI_ environment by setting an environment variable and modifying your project code to behave differently, depending on the context. It's common for teams to process only a subset of data for CI runs, using environment variables to branch logic in their dbt project code. + - **Target Name** — Define the [target name](/docs/build/custom-target-names). Similar to **Environment Variables**, this option lets you customize the behavior of the project. You can use this option to specify that a CI job is running in a _Staging_ or _CI_ environment by setting the target name and modifying your project code to behave differently, depending on the context. + - **Run Timeout** — Cancel this CI job if the run time exceeds the timeout value. You can use this option to help ensure that a CI check doesn't consume too much of your warehouse resources. + - **dbt Version** — By default, it’s set to inherit the [dbt version](/docs/dbt-versions/core) from the environment. dbt Labs strongly recommends that you don't change the default setting. This option to change the version at the job level is useful only when you upgrade a project to the next dbt version; otherwise, mismatched versions between the environment and job can lead to confusing behavior. 
+ - **Threads** — By default, it’s set to 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. + - **Run source freshness** — Enable this option to invoke the `dbt source freshness` command before running this CI job. Refer to [Source freshness](/docs/deploy/source-freshness) for more details. + + + + +## Trigger a CI job with the API + +If you're not using dbt Cloud’s native Git integration with [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or [Azure DevOps](/docs/cloud/git/connect-azure-devops), you can use the [Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) to trigger a CI job to run. However, dbt Cloud will not automatically delete the temporary schema for you. This is because automatic deletion relies on incoming webhooks from Git providers, which is only available through the native integrations. + +### Prerequisites + +- You have a dbt Cloud account. +- For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your dbt Cloud account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). + + +1. Set up a CI job with the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) API endpoint using `"job_type": ci` or from the [dbt Cloud UI](#set-up-ci-jobs). +1. Call the [Trigger Job Run](/dbt-cloud/api-v2#/operations/Trigger%20Job%20Run) API endpoint to trigger the CI job. You must include these fields to the payload: + - Provide the pull request (PR) ID with one of these fields, even if you're using a different Git provider (like Bitbucket). This can make your code less human-readable but it will _not_ affect dbt functionality. + + - `github_pull_request_id` + - `gitlab_merge_request_id` + - `azure_devops_pull_request_id`  + - Provide the `git_sha` or `git_branch` to target the correct commit or branch to run the job against. + +## Example pull requests + +The green checkmark means the dbt build and tests were successful. Clicking on the dbt Cloud section navigates you to the relevant CI run in dbt Cloud. + +### GitHub pull request example + + + +### GitLab pull request example + + + +### Azure DevOps pull request example + + + + +## Troubleshooting + +If you're experiencing any issues, review some of the common questions and answers below. + +
<details>
  <summary>Temporary schemas aren't dropping</summary>

  If your temporary schemas aren't dropping after a PR merges or closes, this typically indicates one of these issues:

  - You have overridden the `generate_schema_name` macro and it isn't using `dbt_cloud_pr_` as the prefix. To resolve this, change your macro so that the temporary PR schema name contains the required prefix. For example:
    - ✅ Temporary PR schema name contains the prefix `dbt_cloud_pr_` (like `dbt_cloud_pr_123_456_marketing`).
    - ❌ Temporary PR schema name doesn't contain the prefix `dbt_cloud_pr_` (like `marketing`).
  - A macro is creating a schema but there are no dbt models writing to that schema. dbt Cloud doesn't drop temporary schemas that weren't written to as a result of running a dbt model.

</details>
<details>
  <summary>Reconnecting your dbt project to use dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps</summary>

  If your dbt project relies on the generic git clone method that clones using SSH and deploy keys to connect to your dbt repo, you need to disconnect your repo and reconnect it using the native GitHub, GitLab, or Azure DevOps integration in order to enable dbt Cloud CI.

  First, make sure you have the native GitHub authentication, native GitLab authentication, or native Azure DevOps authentication set up depending on which git provider you use. After you have gone through those steps, go to **Account Settings**, select **Projects**, and click on the project you'd like to reconnect through native GitHub, GitLab, or Azure DevOps auth. Then click on the repository link.

  Once you're in the repository page, select **Edit** and then **Disconnect Repository** at the bottom.

  Confirm that you'd like to disconnect your repository. You should then see a new **Configure a repository** link in your old repository's place. Click through to the configuration page.

  Select the **GitHub**, **GitLab**, or **AzureDevOps** tab and reselect your repository. That should complete the setup of the project and enable you to set up a dbt Cloud CI job.

</details>
<details>
  <summary>Error messages that refer to schemas from previous PRs</summary>

  If you receive a schema-related error message referencing a previous PR, this is usually an indicator that you are not using a production job for your deferral and are instead using self. If the prior PR has already been merged, the prior PR's schema may have been dropped by the time the CI job for the current PR is kicked off.

  To fix this issue, select a production job run to defer to instead of self.

</details>
<details>
  <summary>Production job runs failing at the Clone Git Repository step</summary>

  dbt Cloud can only check out commits that belong to the original repository. dbt Cloud cannot check out commits that belong to a fork of that repository.

  If you receive the following error message at the **Clone Git Repository** step of your job run:

  ```
  Cloning into '/tmp/jobs/123456/target'...
  Successfully cloned repository.
  Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...
  Failed to checkout to specified revision.
  git checkout e845be54e6dc72342d5a8f814c8b3316ee220312
  fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312
  ```

  Double-check that your PR isn't trying to merge using a commit that belongs to a fork of the repository attached to your dbt project.

</details>
<details>
  <summary>CI job not triggering for Virtual Private dbt users</summary>

  To trigger jobs on dbt Cloud using the API, your Git provider needs to connect to your dbt Cloud account.

  If you're on a Virtual Private dbt Enterprise plan using security features like ingress PrivateLink or IP Allowlisting, registering CI hooks may not be available and can cause the job to fail silently.

</details>
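For reference, the **Trigger a CI job with the API** steps earlier on this page can be scripted. The following is a minimal, untested Python sketch: the account ID, job ID, PR number, and branch name are placeholders, and it assumes the standard multi-tenant `cloud.getdbt.com` API v2 base URL and a service token with permission to trigger jobs. Adjust these for your own account and access URL.

```python
import os

import requests

# Placeholder values; replace with your own account, CI job, and PR details.
ACCOUNT_ID = 123
JOB_ID = 456  # a job created with "job_type": "ci"
API_TOKEN = os.environ["DBT_CLOUD_API_TOKEN"]
BASE_URL = f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}"

payload = {
    "cause": "CI check for PR 789 (triggered by external automation)",
    # Use the PR ID field even if your Git provider isn't GitHub (see the note above).
    "github_pull_request_id": 789,
    # Target the correct branch (or pass "git_sha" for a specific commit).
    "git_branch": "feature/my-change",
}

response = requests.post(
    f"{BASE_URL}/jobs/{JOB_ID}/run/",
    headers={"Authorization": f"Token {API_TOKEN}"},
    json=payload,
    timeout=30,
)
response.raise_for_status()
run_id = response.json()["data"]["id"]  # run ID, assuming the standard response envelope
print(f"Queued CI run {run_id}")
```

Because this bypasses the native Git integrations, dbt Cloud won't automatically drop the temporary PR schema when the PR closes, so you'll need to clean it up yourself.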
    diff --git a/website/docs/docs/deploy/continuous-integration.md b/website/docs/docs/deploy/continuous-integration.md index fbe28173ff6..0f87965aada 100644 --- a/website/docs/docs/deploy/continuous-integration.md +++ b/website/docs/docs/deploy/continuous-integration.md @@ -1,57 +1,52 @@ --- title: "Continuous integration in dbt Cloud" sidebar_label: "Continuous integration" -description: "You can set up Slim continuous integration (CI) checks to test every single change prior to deploying the code to production just like in a software development workflow." +description: "You can set up continuous integration (CI) checks to test every single change prior to deploying the code to production just like in a software development workflow." --- -To implement a continuous integration (CI) workflow in dbt Cloud, you can set up automation that tests code changes by running [Slim CI jobs](/docs/deploy/slim-ci-jobs) before merging to production. dbt Cloud tracks the state of what’s running in your production environment so, when you run a Slim CI job, only the modified data assets in your pull request (PR) and their downstream dependencies are built and tested in a staging schema. You can also view the status of the CI checks (tests) directly from within the PR; this information is posted to your Git provider as soon as a Slim CI job completes. Additionally, you can enable settings in your Git provider that allow PRs only with successful CI checks be approved for merging. +To implement a continuous integration (CI) workflow in dbt Cloud, you can set up automation that tests code changes by running [CI jobs](/docs/deploy/ci-jobs) before merging to production. dbt Cloud tracks the state of what’s running in your production environment so, when you run a CI job, only the modified data assets in your pull request (PR) and their downstream dependencies are built and tested in a staging schema. You can also view the status of the CI checks (tests) directly from within the PR; this information is posted to your Git provider as soon as a CI job completes. Additionally, you can enable settings in your Git provider that allow PRs only with successful CI checks be approved for merging. -Using Slim CI helps: +Using CI helps: - Provide increased confidence and assurances that project changes will work as expected in production. - Reduce the time it takes to push code changes to production, through build and test automation, leading to better business outcomes. - Allow organizations to make code changes in a standardized and governed way that ensure code quality without sacrificing speed. -## How Slim CI works +## How CI works -When you [set up Slim CI jobs](/docs/deploy/slim-ci-jobs#set-up-slim-ci-jobs), dbt Cloud listens for webhooks from your Git provider indicating that a new PR has been opened or updated with new commits. When dbt Cloud receives one of these webhooks, it enqueues a new run of the Slim CI job. If you want CI checks to run on each new commit, you need to mark your PR as **Ready for review** in your Git provider — draft PRs _don't_ trigger CI jobs. +When you [set up CI jobs](/docs/deploy/ci-jobs#set-up-ci-jobs), dbt Cloud listens for webhooks from your Git provider indicating that a new PR has been opened or updated with new commits. When dbt Cloud receives one of these webhooks, it enqueues a new run of the CI job. dbt Cloud builds and tests the models affected by the code change in a temporary schema, unique to the PR. 
This process ensures that the code builds without error and that it matches the expectations as defined by the project's dbt tests. The unique schema name follows the naming convention `dbt_cloud_pr__` (for example, `dbt_cloud_pr_1862_1704`) and can be found in the run details for the given run, as shown in the following image: -When the Slim CI run completes, you can view the run status directly from within the pull request. dbt Cloud updates the pull request in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run. The status message states whether the models and tests ran successfully or not. +When the CI run completes, you can view the run status directly from within the pull request. dbt Cloud updates the pull request in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run. The status message states whether the models and tests ran successfully or not. -dbt Cloud deletes the temporary schema from your  when you close or merge the pull request. If your project has database or schema customization using the [generate_database_name](/docs/build/custom-databases#generate_database_name) or [generate_schema_name](/docs/build/custom-schemas#how-does-dbt-generate-a-models-schema-name) macros, dbt Cloud might not drop the temporary schema from your data warehouse. For more information, refer to [Temp PR schema limitations](/docs/deploy/slim-ci-jobs#temp-pr-schema-limitations). +dbt Cloud deletes the temporary schema from your  when you close or merge the pull request. If your project has schema customization using the [generate_schema_name](/docs/build/custom-schemas#how-does-dbt-generate-a-models-schema-name) macro, dbt Cloud might not drop the temporary schema from your data warehouse. For more information, refer to [Troubleshooting](/docs/deploy/ci-jobs#troubleshooting). -## Differences between Slim CI jobs and other deployment jobs +## Differences between CI jobs and other deployment jobs -The [dbt Cloud scheduler](/docs/deploy/job-scheduler) executes Slim CI jobs differently from other deployment jobs in these important ways: +The [dbt Cloud scheduler](/docs/deploy/job-scheduler) executes CI jobs differently from other deployment jobs in these important ways: -- **Concurrent CI checks** — Slim CI runs triggered by the same dbt Cloud Slim CI job execute concurrently (in parallel), when appropriate -- **Smart cancellation of stale builds** — Automatically cancels stale, in-flight Slim CI runs when there are new commits to the PR -- **Run slot treatment** — Slim CI runs don't consume a run slot +- **Concurrent CI checks** — CI runs triggered by the same dbt Cloud CI job execute concurrently (in parallel), when appropriate +- **Smart cancellation of stale builds** — Automatically cancels stale, in-flight CI runs when there are new commits to the PR +- **Run slot treatment** — CI runs don't consume a run slot ### Concurrent CI checks -When you have teammates collaborating on the same dbt project creating pull requests on the same dbt repository, the same Slim CI job will get triggered. Since each run builds into a dedicated, temporary schema that’s tied to the pull request, dbt Cloud can safely execute Slim CI runs _concurrently_ instead of _sequentially_ (differing from what is done with deployment dbt Cloud jobs). Because no one needs to wait for one Slim CI run to finish before another one can start, with concurrent CI checks, your whole team can test and integrate dbt code faster. 
+When you have teammates collaborating on the same dbt project creating pull requests on the same dbt repository, the same CI job will get triggered. Since each run builds into a dedicated, temporary schema that’s tied to the pull request, dbt Cloud can safely execute CI runs _concurrently_ instead of _sequentially_ (differing from what is done with deployment dbt Cloud jobs). Because no one needs to wait for one CI run to finish before another one can start, with concurrent CI checks, your whole team can test and integrate dbt code faster. Below describes the conditions when CI checks are run concurrently and when they’re not: -- Slim CI runs with different PR numbers execute concurrently. -- Slim CI runs with the _same_ PR number and _different_ commit SHAs execute serially because they’re building into the same schema. dbt Cloud will run the latest commit and cancel any older, stale commits. For details, refer to [Smart cancellation of stale builds](#smart-cancellation). -- Slim CI runs with the same PR number and same commit SHA, originating from different dbt Cloud projects will execute jobs concurrently. This can happen when two CI jobs are set up in different dbt Cloud projects that share the same dbt repository. +- CI runs with different PR numbers execute concurrently. +- CI runs with the _same_ PR number and _different_ commit SHAs execute serially because they’re building into the same schema. dbt Cloud will run the latest commit and cancel any older, stale commits. For details, refer to [Smart cancellation of stale builds](#smart-cancellation). +- CI runs with the same PR number and same commit SHA, originating from different dbt Cloud projects will execute jobs concurrently. This can happen when two CI jobs are set up in different dbt Cloud projects that share the same dbt repository. ### Smart cancellation of stale builds {#smart-cancellation} -When you push a new commit to a PR, dbt Cloud enqueues a new Slim CI run for the latest commit and cancels any Slim CI run that is (now) stale and still in flight. This can happen when you’re pushing new commits while a CI build is still in process and not yet done. By cancelling runs in a safe and deliberate way, dbt Cloud helps improve productivity and reduce data platform spend on wasteful CI runs. - - - -### Run slot treatment - -Your Slim CI runs don't consume run slots so a CI check will never block a production run. +When you push a new commit to a PR, dbt Cloud enqueues a new CI run for the latest commit and cancels any CI run that is (now) stale and still in flight. This can happen when you’re pushing new commits while a CI build is still in process and not yet done. By cancelling runs in a safe and deliberate way, dbt Cloud helps improve productivity and reduce data platform spend on wasteful CI runs. + diff --git a/website/docs/docs/deploy/dbt-cloud-job.md b/website/docs/docs/deploy/dbt-cloud-job.md deleted file mode 100644 index fa9eead2d3b..00000000000 --- a/website/docs/docs/deploy/dbt-cloud-job.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: "dbt Cloud jobs" -id: "dbt-cloud-job" -description: "Manage, setup, and configure your dbt Cloud job using elegant job commands and triggers." -hide_table_of_contents: true -tags: ["scheduler"] ---- - -Manage, set up, and automate your dbt jobs using robust custom job settings. You can use the job scheduler to configure when and how your jobs run, helping you keep production data fresh on a timely basis. 
- -This portion of our documentation will go over dbt Cloud's various job settings using: - -- [Job settings](/docs/deploy/job-settings) — Intuitively navigate the user interface to create new dbt jobs or edit existing ones. -- [Job commands](/docs/deploy/job-commands) — Use job commands to configure dbt commands on a schedule. -- [Job triggers](/docs/deploy/job-triggers) — You can configure when and how dbt should run your job, such as: - * Running on scheduled days or cron schedules - * Setting up continuous integration (CI) to run when someone opens a new pull request in your dbt repository - * Using the API to trigger jobs - - - - - - - - diff --git a/website/docs/docs/deploy/deploy-environments.md b/website/docs/docs/deploy/deploy-environments.md index 553a7d2fcdd..650fdb1c28a 100644 --- a/website/docs/docs/deploy/deploy-environments.md +++ b/website/docs/docs/deploy/deploy-environments.md @@ -10,10 +10,10 @@ Deployment environments in dbt Cloud are crucial for deploying dbt jobs in produ - The warehouse connection information (including the target database/schema settings) - The version of your code to execute -A dbt Cloud project can have multiple deployment environments, providing you the flexibility and customization to tailor the execution of dbt jobs. You can use deployment environments to [create and schedule jobs](/docs/deploy/job-settings#create-and-schedule-jobs), [enable continuous integration](/docs/deploy/continuous-integration), or more based on your specific needs or requirements. +A dbt Cloud project can have multiple deployment environments, providing you the flexibility and customization to tailor the execution of dbt jobs. You can use deployment environments to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs), [enable continuous integration](/docs/deploy/continuous-integration), or more based on your specific needs or requirements. :::tip Learn how to manage dbt Cloud environments -To learn different approaches to managing dbt Cloud environments and recommendations for your organization's unique needs, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview). +To learn different approaches to managing dbt Cloud environments and recommendations for your organization's unique needs, read [dbt Cloud environment best practices](/guides/set-up-ci). ::: This page reviews the different types of environments and how to configure your deployment environment in dbt Cloud. @@ -28,17 +28,15 @@ To create a new dbt Cloud development environment, navigate to **Deploy** -> **E -### Set as production environment (Beta) - -import ExpBeta from '/snippets/_explorer-beta-banner.md'; - - +### Set as production environment In dbt Cloud, each project can have one designated deployment environment, which serves as its production environment. This production environment is _essential_ for using features like dbt Explorer and cross-project references. It acts as the source of truth for the project's production state in dbt Cloud. + + ### Semantic Layer -For Semantic Layer-eligible customers, the next section of environment settings is the Semantic Layer configurations. [The Semantic Layer setup guide](/docs/use-dbt-semantic-layer/setup-dbt-semantic-layer) has the most up-to-date setup instructions! +For Semantic Layer-eligible customers, the next section of environment settings is the Semantic Layer configurations. 
[The Semantic Layer setup guide](/docs/use-dbt-semantic-layer/setup-sl) has the most up-to-date setup instructions! ### Deployment connection @@ -188,8 +186,8 @@ This section allows you to determine the credentials that should be used when co ## Related docs -- [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) -- [Deploy dbt jobs](/docs/deploy/dbt-cloud-job) -- [Deploy CI jobs](/docs/deploy/continuous-integration) +- [dbt Cloud environment best practices](/guides/set-up-ci) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [CI jobs](/docs/deploy/continuous-integration) - [Delete a job or environment in dbt Cloud](/faqs/Environments/delete-environment-job) diff --git a/website/docs/docs/deploy/deploy-jobs.md b/website/docs/docs/deploy/deploy-jobs.md new file mode 100644 index 00000000000..e43020bf66e --- /dev/null +++ b/website/docs/docs/deploy/deploy-jobs.md @@ -0,0 +1,101 @@ +--- +title: "Deploy jobs" +description: "Learn how to create and schedule deploy jobs in dbt Cloud for the scheduler to run. When you run with dbt Cloud, you get built-in observability, logging, and alerting." +tags: [scheduler] +--- + +You can use deploy jobs to build production data assets. Deploy jobs make it easy to run dbt commands against a project in your cloud data platform, triggered either by schedule or events. Each job run in dbt Cloud will have an entry in the job's run history and a detailed run overview, which provides you with: + +- Job trigger type +- Commit SHA +- Environment name +- Sources and documentation info, if applicable +- Job run details, including run timing, [model timing data](#model-timing), and [artifacts](/docs/deploy/artifacts) +- Detailed run steps with logs and their run step statuses + +You can create a deploy job and configure it to run on [scheduled days and times](#schedule-days) or enter a [custom cron schedule](#custom-cron-schedules). + + +## Prerequisites + +- You must have a dbt Cloud account and [Developer seat license](/docs/cloud/manage-access/seats-and-users). If you don't, you can [sign up](https://www.getdbt.com/signup/) for a [free account](https://www.getdbt.com/pricing/). +- You must have a dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections). +- You must have [access permission](/docs/cloud/manage-access/about-user-access) to view, create, modify, or run jobs. +- You must set up a [deployment environment](/docs/deploy/deploy-environments). + +## Create and schedule jobs {#create-and-schedule-jobs} + +1. On your deployment environment page, click **Create Job** > **Deploy Job** to create a new deploy job. +2. Options in the **Job Description** section: + - **Job Name** — Specify the name for the deploy job. For example, `Daily build`. + - **Environment** — By default, it’s set to the deployment environment you created the deploy job from. +3. Options in the **Execution Settings** section: + - **Commands** — By default, it includes the `dbt build` command. Click **Add command** to add more [commands](/docs/deploy/job-commands) that you want to be invoked when the job runs. + - **Generate docs on run** — Enable this option if you want to [generate project docs](/docs/collaborate/build-and-view-your-docs) when this deploy job runs. + - **Run source freshness** — Enable this option to invoke the `dbt source freshness` command before running the deploy job. Refer to [Source freshness](/docs/deploy/source-freshness) for more details. +4. 
Options in the **Schedule** section: + - **Run on schedule** — Enable this option to run the deploy job on a set schedule. + - **Timing** — Specify whether to [schedule](#schedule-days) the deploy job using **Frequency** that runs the job at specific times of day, **Specific Intervals** that runs the job every specified number of hours, or **Cron Schedule** that runs the job specified using [cron syntax](#custom-cron-schedule). + - **Days of the Week** — By default, it’s set to every day when **Frequency** or **Specific Intervals** is chosen for **Timing**. + + + +5. (optional) Options in the **Advanced Settings** section: + - **Environment Variables** — Define [environment variables](/docs/build/environment-variables) to customize the behavior of your project when the deploy job runs. + - **Target Name** — Define the [target name](/docs/build/custom-target-names) to customize the behavior of your project when the deploy job runs. Environment variables and target names are often used interchangeably. + - **Run Timeout** — Cancel the deploy job if the run time exceeds the timeout value. + - **Compare changes against** — By default, it’s set to **No deferral**. Select either **Environment** or **This Job** to let dbt Cloud know what it should compare the changes against. + + :::info + Older versions of dbt Cloud only allow you to defer to a specific job instead of an environment. Deferral to a job compares state against the project code that was run in the deferred job's last successful run. While deferral to an environment is more efficient as dbt Cloud will compare against the project representation (which is stored in the `manifest.json`) of the last successful deploy job run that executed in the deferred environment. By considering _all_ deploy jobs that run in the deferred environment, dbt Cloud will get a more accurate, latest project representation state. + ::: + + - **dbt Version** — By default, it’s set to inherit the [dbt version](/docs/dbt-versions/core) from the environment. dbt Labs strongly recommends that you don't change the default setting. This option to change the version at the job level is useful only when you upgrade a project to the next dbt version; otherwise, mismatched versions between the environment and job can lead to confusing behavior. + - **Threads** — By default, it’s set to 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. + + + +### Schedule days + +To set your job's schedule, use the **Schedule Days** option to choose specific days of the week, and select customized hours or intervals. + +Under **Timing**, you can either use customizable hours for jobs that need to run frequently throughout the day or exact intervals for jobs that need to run at specific times: + +- **Every n hours** — Use this option to set how often your job runs, in hours. Enter a number between 1 and 23 to represent the interval between job runs. For example, if you set it to "every 2 hours", the job will run every 2 hours from midnight UTC. This option is useful if you need to run jobs multiple times per day at regular intervals. + +- **At exact intervals** — Use this option to set specific times when your job should run. You can enter a comma-separated list of hours (in UTC) when you want the job to run. For example, if you set it to `0,12,23,` the job will run at midnight, noon, and 11 PM UTC. 
This option is useful if you want your jobs to run at specific times of day and don't need them to run more frequently than once a day. + +:::info + +dbt Cloud uses [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) (UTC) and does not account for translations to your specific timezone or take into consideration daylight savings time. For example: + +- 0 means 12am (midnight) UTC +- 12 means 12pm (afternoon) UTC +- 23 means 11pm UTC + +::: + +### Custom cron schedule + +To fully customize the scheduling of your job, choose the **Custom cron schedule** option and use the cron syntax. With this syntax, you can specify the minute, hour, day of the month, month, and day of the week, allowing you to set up complex schedules like running a job on the first Monday of each month. + + + + +Use tools such as [crontab.guru](https://crontab.guru/) to generate the correct cron syntax. This tool allows you to input cron snippets and returns their plain English translations. + +Refer to the following example snippets: + + +- `0 * * * *`: Every hour, at minute 0 +- `*/5 * * * *`: Every 5 minutes +- `5 4 * * *`: At exactly 4:05 AM UTC +- `30 */4 * * *`: At minute 30 past every 4th hour (e.g. 4:30AM, 8:30AM, 12:30PM, etc., all UTC) +- `0 0 */2 * *`: At midnight UTC every other day +- `0 0 * * 1`: At midnight UTC every Monday. + +## Related docs + +- [Artifacts](/docs/deploy/artifacts) +- [Continuous integration (CI) jobs](/docs/deploy/ci-jobs) +- [Webhooks](/docs/deploy/webhooks) diff --git a/website/docs/docs/deploy/deployment-overview.md b/website/docs/docs/deploy/deployment-overview.md index dddc252211e..29934663544 100644 --- a/website/docs/docs/deploy/deployment-overview.md +++ b/website/docs/docs/deploy/deployment-overview.md @@ -4,6 +4,8 @@ id: "deployments" sidebar: "Use dbt Cloud's capabilities to seamlessly run a dbt job in production." hide_table_of_contents: true tags: ["scheduler"] +pagination_next: "docs/deploy/job-scheduler" +pagination_prev: null --- Use dbt Cloud's capabilities to seamlessly run a dbt job in production or staging environments. Rather than run dbt commands manually from the command line, you can leverage the [dbt Cloud's in-app scheduling](/docs/deploy/job-scheduler) to automate how and when you execute dbt. @@ -29,21 +31,15 @@ Learn how to use dbt Cloud's features to help your team ship timely and quality icon="dbt-bit"/> -
-
-## dbt Cloud jobs
-

    ## Monitor jobs and alerts @@ -70,6 +60,12 @@ Learn how to use dbt Cloud's features to help your team ship timely and quality link="/docs/deploy/run-visibility" icon="dbt-bit"/> + + diff --git a/website/docs/docs/deploy/deployment-tools.md b/website/docs/docs/deploy/deployment-tools.md index 26e9e4ea317..cca2368f38a 100644 --- a/website/docs/docs/deploy/deployment-tools.md +++ b/website/docs/docs/deploy/deployment-tools.md @@ -2,9 +2,10 @@ title: "Integrate with other orchestration tools" id: "deployment-tools" sidebar_label: "Integrate with other tools" +pagination_next: null --- -Alongside [dbt Cloud](/docs/deploy/dbt-cloud-job), discover other ways to schedule and run your dbt jobs with the help of tools such as Airflow, Prefect, Dagster, automation server, Cron, and Azure Data Factory (ADF), +Alongside [dbt Cloud](/docs/deploy/jobs), discover other ways to schedule and run your dbt jobs with the help of tools such as Airflow, Prefect, Dagster, automation server, Cron, and Azure Data Factory (ADF), Build and install these tools to automate your data workflows, trigger dbt jobs (including those hosted on dbt Cloud), and enjoy a hassle-free experience, saving time and increasing efficiency. @@ -16,7 +17,7 @@ If your organization is using [Airflow](https://airflow.apache.org/), there are -Installing the [dbt Cloud Provider](https://registry.astronomer.io/providers/dbt-cloud) to orchestrate dbt Cloud jobs. This package contains multiple Hooks, Operators, and Sensors to complete various actions within dbt Cloud. +Installing the [dbt Cloud Provider](https://airflow.apache.org/docs/apache-airflow-providers-dbt-cloud/stable/index.html) to orchestrate dbt Cloud jobs. This package contains multiple Hooks, Operators, and Sensors to complete various actions within dbt Cloud. @@ -30,7 +31,7 @@ Invoking dbt Core jobs through the [BashOperator](https://registry.astronomer.io
    -For more details on both of these methods, including example implementations, check out [this guide](https://www.astronomer.io/guides/airflow-dbt). +For more details on both of these methods, including example implementations, check out [this guide](https://docs.astronomer.io/learn/airflow-dbt-cloud). ## Azure Data Factory @@ -107,7 +108,11 @@ If your organization is using [Prefect](https://www.prefect.io/), the way you wi ## Dagster -If your organization is using [Dagster](https://dagster.io/), you can use the [dagster_dbt](https://docs.dagster.io/_apidocs/libraries/dagster-dbt) library to integrate dbt commands into your pipelines. This library supports the execution of dbt through dbt Cloud, dbt CLI and the dbt RPC server. Running dbt from Dagster automatically aggregates metadata about your dbt runs. Refer to the [example pipeline](https://dagster.io/blog/dagster-dbt) for details. +If your organization is using [Dagster](https://dagster.io/), you can use the [dagster_dbt](https://docs.dagster.io/_apidocs/libraries/dagster-dbt) library to integrate dbt commands into your pipelines. This library supports the execution of dbt through dbt Cloud, dbt Core, and the dbt RPC server. Running dbt from Dagster automatically aggregates metadata about your dbt runs. Refer to the [example pipeline](https://dagster.io/blog/dagster-dbt) for details. + +## Kestra + +If your organization uses [Kestra](http://kestra.io/), you can leverage the [dbt plugin](https://kestra.io/plugins/plugin-dbt) to orchestrate dbt Cloud and dbt Core jobs. Kestra's user interface (UI) has built-in [Blueprints](https://kestra.io/docs/user-interface-guide/blueprints), providing ready-to-use workflows. Navigate to the Blueprints page in the left navigation menu and [select the dbt tag](https://demo.kestra.io/ui/blueprints/community?selectedTag=36) to find several examples of scheduling dbt Core commands and dbt Cloud jobs as part of your data pipelines. After each scheduled or ad-hoc workflow execution, the Outputs tab in the Kestra UI allows you to download and preview all dbt build artifacts. The Gantt and Topology view additionally render the metadata to visualize dependencies and runtimes of your dbt models and tests. The dbt Cloud task provides convenient links to easily navigate between Kestra and dbt Cloud UI. ## Automation servers @@ -121,14 +126,14 @@ Cron is a decent way to schedule bash commands. However, while it may seem like Use Databricks workflows to call the dbt Cloud job API, which has several benefits such as integration with other ETL processes, utilizing dbt Cloud job features, separation of concerns, and custom job triggering based on custom conditions or logic. These advantages lead to more modularity, efficient debugging, and flexibility in scheduling dbt Cloud jobs. -For more info, refer to the guide on [Databricks workflows and dbt Cloud jobs](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs). +For more info, refer to the guide on [Databricks workflows and dbt Cloud jobs](/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs). 
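To make the Airflow option described earlier on this page more concrete, here is a minimal, untested sketch using the dbt Cloud Provider's `DbtCloudRunJobOperator`. It assumes the `apache-airflow-providers-dbt-cloud` package is installed and that an Airflow connection (named `dbt_cloud_default` here) stores your dbt Cloud account ID and API token; the job ID is a placeholder.

```python
from datetime import datetime

from airflow import DAG
from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator

with DAG(
    dag_id="trigger_dbt_cloud_job",
    start_date=datetime(2023, 1, 1),
    schedule_interval="@daily",
    catchup=False,
) as dag:
    # Triggers the dbt Cloud job and polls until it finishes.
    run_dbt_cloud_job = DbtCloudRunJobOperator(
        task_id="run_dbt_cloud_job",
        dbt_cloud_conn_id="dbt_cloud_default",  # Airflow connection with account ID and token
        job_id=12345,       # placeholder, replace with your dbt Cloud job ID
        check_interval=60,  # poll the run status every 60 seconds
        timeout=3600,       # fail the task if the run exceeds one hour
    )
```

The operator simply calls the dbt Cloud job API, so scheduling, retries, and alerting stay in Airflow while dbt Cloud keeps the run history and artifacts.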
## Related docs - [dbt Cloud plans and pricing](https://www.getdbt.com/pricing/) -- [Quickstart guides](/quickstarts) +- [Quickstart guides](/guides) - [Webhooks for your jobs](/docs/deploy/webhooks) - [Orchestration guides](https://docs.getdbt.com/guides/orchestration) - [Commands for your production deployment](https://discourse.getdbt.com/t/what-are-the-dbt-commands-you-run-in-your-production-deployment-of-dbt/366) diff --git a/website/docs/docs/deploy/job-commands.md b/website/docs/docs/deploy/job-commands.md index acdc3a00228..db284c78a05 100644 --- a/website/docs/docs/deploy/job-commands.md +++ b/website/docs/docs/deploy/job-commands.md @@ -41,8 +41,7 @@ For every job, you have the option to select the [Generate docs on run](/docs/co ### Command list -You can add or remove as many [dbt commands](/reference/dbt-commands) as necessary for every job. However, you need to have at least one dbt command. -Commands under the "CLI" tab in the [dbt Command reference doc](/reference/dbt-commands) page are meant for use in the [CLI](/docs/core/about-the-cli) only and are not available in dbt Cloud. +You can add or remove as many [dbt commands](/reference/dbt-commands) as necessary for every job. However, you need to have at least one dbt command. There are few commands listed as "dbt Core" in the [dbt Command reference doc](/reference/dbt-commands) page. This means they are meant for use in [dbt Core](/docs/core/about-dbt-core) only and are not available in dbt Cloud. :::tip Using selectors @@ -77,7 +76,6 @@ Job command failures can mean different things for different commands. Some comm ## Related docs - [Job creation best practices](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) - [dbt Command reference](/reference/dbt-commands) -- [Job triggers](/docs/deploy/job-triggers) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) - [Build and view your docs](/docs/collaborate/build-and-view-your-docs) diff --git a/website/docs/docs/deploy/job-notifications.md b/website/docs/docs/deploy/job-notifications.md index c240ca12183..548e34fc2f3 100644 --- a/website/docs/docs/deploy/job-notifications.md +++ b/website/docs/docs/deploy/job-notifications.md @@ -1,25 +1,83 @@ --- title: "Job notifications" id: "job-notifications" -description: "Set up notifications in dbt Cloud to receive Email or Slack alerts for job run status." +description: "Set up notifications in dbt Cloud to receive email or Slack alerts about job run status." --- +Set up notifications in dbt Cloud to receive email or Slack alerts when a job run succeeds, fails, or is cancelled. -Setting up notifications in dbt Cloud will allow you to receive alerts via Email or a chosen Slack channel when a job run succeeds, fails, or is cancelled. +## Email notifications -### Email +You can receive email alerts about jobs by configuring the dbt Cloud email notification settings. -There are two options for setting up email notifications. As a **user**, you can set up email notifications for yourself under your Profile. As an **admin**, you can set up notifications on behalf of your team members. +### Prerequisites +- You must be either a _developer user_ or an _account admin_ to configure email notifications in dbt Cloud. For more details, refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users). + - As a developer user, you can set up email notifications for yourself. 
+ - As an account admin, you can set up notifications for yourself and other team members. -1. Click the gear in the top right and select **Notification settings**. +### Configure email notifications -2. **As a user:** Select **Edit** and select the type of Notification (Succeeds, Fails, or Is Cancelled) for each Job for which you would like to be notified, or +1. From the gear menu, choose **Notification settings**. +1. By default, dbt Cloud sends notifications to the email address that's in your **User profile** page. - **As an admin:** Select one or more users you'd like to set notifications for. If you only see your own name, then you might not have admin privileges. Select **Edit** and select the type of Notification (Succeeds, Fails, or Is Cancelled) for each Job for which they will be notified. + If you're an account admin, you can choose a different email address to receive notifications. Select the **Notification email** dropdown and choose another address from the list. The list includes **Internal Users** with access to the account and **External Emails** that have been added. + - To add an external email address, select the **Notification email** dropdown and choose **Add external email**. After you add the external email, it becomes available for selection in the **Notification email** dropdown list. External emails can be addresses that are outside of your dbt Cloud account and also for third-party integrations like [channels in Microsoft Teams](https://support.microsoft.com/en-us/office/tip-send-email-to-a-channel-2c17dbae-acdf-4209-a761-b463bdaaa4ca) and [PagerDuty email integration](https://support.pagerduty.com/docs/email-integration-guide). -3. Click **Save**. - + -### Slack +1. Select the **Environment** for the jobs you want to receive notifications about from the dropdown. - +1. Click **Edit** to configure the email notification settings. Choose one or more of the run statuses (**Succeeds**, **Fails**, **Is Canceled**) for each job you want to receive notifications about. + +1. When you're done with the settings, click **Save**. + + As an account admin, you can add more email recipients by choosing another **Notification email** from the dropdown, **Edit** the job notification settings, and **Save** the changes. + + To set up alerts on jobs from a different environment, select another **Environment** from the dropdown, **Edit** those job notification settings, and **Save** the changes. + + + +### Unsubscribe from email notifications +1. From the gear menu, choose **Notification settings**. +1. On the **Email notifications** page, click **Unsubscribe from all email notifications**. + +## Slack notifications + +You can receive Slack alerts about jobs by setting up the Slack integration, then configuring the dbt Cloud Slack notification settings. + +:::note +Any account admin can edit the Slack notifications but they'll be limited to configuring the channels that their Slack user has access to. If there has been a change in user roles or Slack permissions where you no longer have access to edit a configured Slack channel, please [contact support](mailto:support@getdbt.com) for assistance. +::: + +### Prerequisites +- You must be an administrator of the Slack workspace. +- You must be an account admin to configure Slack notifications in dbt Cloud. For more details, refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users). + +### Set up the Slack integration + +1. From the gear menu, select **Profile settings**. 
On your **User profile** page, scroll to the **Linked accounts** section. +1. In the **Linked accounts** section, find the Slack application and click **Link**. + +1. Allow dbt Labs to access the Slack workspace. If you are a member of multiple workspaces, you can select the appropriate workspace from the dropdown menu in the upper right corner. + + +### Configure Slack notifications + +1. From the gear menu, choose **Notification settings**. +1. Select **Slack notifications** in the left sidebar. +1. Select the **Notification channel** you want to receive the job run notifications from the dropdown. + +1. Select the **Environment** for the jobs you want to receive notifications about from the dropdown. +1. Click **Edit** to configure the Slack notification settings. Choose one or more of the run statuses (**Succeeds**, **Fails**, **Is Canceled**) for each job you want to receive notifications about. +1. When you're done with the settings, click **Save**. + + To send alerts to another Slack channel, select another **Notification channel** from the dropdown, **Edit** those job notification settings, and **Save** the changes. + + To set up alerts on jobs from a different environment, select another **Environment** from the dropdown, **Edit** those job notification settings, and **Save** the changes. + + + +### Disable the Slack integration + +1. From the gear menu, select **Profile settings**. On your **User profile** page, scroll to the **Linked accounts** section. +1. Find the Slack application in the **Linked accounts** section, click the trash can icon, and click **Unlink**. Channels that you configured will no longer receive Slack notifications. Channels that are configured by other account admins will continue to receive Slack notifications if they still have active Slack integrations. To migrate ownership of a Slack channel notification configuration, have another account admin edit the configuration. \ No newline at end of file diff --git a/website/docs/docs/deploy/job-scheduler.md b/website/docs/docs/deploy/job-scheduler.md index 03eeb6fb377..fba76f677a7 100644 --- a/website/docs/docs/deploy/job-scheduler.md +++ b/website/docs/docs/deploy/job-scheduler.md @@ -82,10 +82,10 @@ The scheduler prevents queue clog by canceling runs that aren't needed, ensuring -To prevent over-scheduling, users will need to take action by either refactoring the job so it runs faster or modifying its [schedule](/docs/deploy/job-triggers). +To prevent over-scheduling, users will need to take action by either refactoring the job so it runs faster or modifying its [schedule](/docs/deploy/deploy-jobs#schedule-days). ## Related docs -- [dbt Cloud architecture](/docs/cloud/about-cloud/architecture#about-dbt-cloud-architecture) +- [dbt Cloud architecture](/docs/cloud/about-cloud/architecture#dbt-cloud-features-architecture) - [Job commands](/docs/deploy/job-commands) - [Job notifications](/docs/deploy/job-notifications) - [Webhooks](/docs/deploy/webhooks) diff --git a/website/docs/docs/deploy/job-settings.md b/website/docs/docs/deploy/job-settings.md deleted file mode 100644 index 3b53880bddf..00000000000 --- a/website/docs/docs/deploy/job-settings.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Job settings" -description: "Learn how to create and schedule jobs in dbt Cloud for the scheduler to run. Jobs help you build observability into transformation workflows with the in-app scheduling, logging, and alerting." 
-tags: [scheduler] ---- - -Jobs make it easy to run dbt commands against a project in your cloud data platform, triggered either by schedule or events. Each job run in dbt Cloud will have a run history, run status, and a run overview, which provides you with: - -- Job trigger type -- Commit SHA -- Environment name -- Sources and documentation info -- Job run details, including run timing, [model timing data](#model-timing), and [artifacts](/docs/deploy/artifacts) -- Detailed run steps with logs and their statuses - -You can create a job and configure it to run on [scheduled days and times](/docs/deploy/job-triggers#schedule-days) or enter a [custom cron schedule](/docs/deploy/job-triggers#custom-cron-schedules). - -## Prerequisites - -- You must have a dbt Cloud account and [Developer seat license](/docs/cloud/manage-access/seats-and-users). If you don't, you can [sign up](https://www.getdbt.com/signup/) for a [free account](https://www.getdbt.com/pricing/). -- You must have a dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections). -- You must [create and schedule a dbt Cloud job](#create-and-schedule-jobs). -- You must have [access permission](/docs/cloud/manage-access/about-user-access) to view, create, modify, or run jobs. -- You must set up a [deployment environment](/docs/deploy/deploy-environments). - -## Create and schedule jobs {#create-and-schedule-jobs} - -1. Create a new job by clicking **Deploy** in the header, click **Jobs** and then **Create job**. -1. Provide a job name, for example "Hourly Customer Job". -1. Under **Environment**, add the following: - * **Environment** — Link to an existing deployment environment. - * **dbt Version** — Select the dbt [version](/docs/dbt-versions/core). dbt Labs recommends inheriting the version from the environment settings. - * **Target Name** — Define the [target name](/docs/build/custom-target-names) for any dbt cloud job to correspond to settings in your project. - * **Threads** — The default value is 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. - -1. Define [environment variables](/docs/build/environment-variables) if you want to customize the behavior of your project. - - - -5. Under **Execution Settings**, you can configure the fields needed to execute your job: - - * **Run Timeout** — Configure the number of seconds a run will execute before dbt Cloud cancels it. Setting this to 0 means it'll never time out runs for that job. - * **Defer to a previous run state** — Select a production job you want to defer to. This enables dbt Cloud to examine the artifacts from the most recent, successful run of that deferred job, enabling state comparison and rewiring of upstream dependencies to any model that doesn’t exist in the current run's schema.  - * **Generate docs on run** checkbox — Configure the job to automatically [generate project docs](/docs/collaborate/build-and-view-your-docs) each time this job runs. - * **Run on source freshness** checkbox — Configure [dbt source freshness](/docs/deploy/source-freshness) as the first step of the job without breaking subsequent steps. - * **Commands** — Add or remove [job commands](/docs/deploy/job-commands), which are specific tasks you set in your dbt Cloud jobs. - - - -6. Under the **Triggers** section, you can configure when and how dbt will trigger the job. Refer to [job triggers](/docs/deploy/job-triggers) for more details. 
- - * **Schedule** tab — Use the **Run on schedule** toggle to configure your job to run on [scheduled](/docs/deploy/job-triggers#schedule-days) days and time, or enter a [custom cron schedule](/docs/deploy/job-triggers#custom-cron-schedules). - * **Continuous Integration** tab — Configure [continuous integration (CI)](/docs/deploy/continuous-integration) to run when someone opens a new pull request in your dbt repository. - * **API** tab — Use the [dbt API](/docs/dbt-cloud-apis/overview) to trigger a job. - - - -7. Select **Save**, then click **Run Now** to run your job. Click the run and watch its progress under **Run history**. diff --git a/website/docs/docs/deploy/job-triggers.md b/website/docs/docs/deploy/job-triggers.md deleted file mode 100644 index cb7a1a48088..00000000000 --- a/website/docs/docs/deploy/job-triggers.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -title: "Job triggers" -id: "job-triggers" -description: "You can configure when and how dbt should run your job" ---- - -In dbt Cloud, you can use the options under **Triggers** to configure when and how dbt should [run your job](/docs/deploy/job-triggers#schedule-job): - -- **Schedule** tab — Use the **Run on schedule** toggle to configure your job to run on either [scheduled days](#schedule-days) or [custom cron-powered schedule](#custom-cron-schedule) -- **Continuous Integration (CI)** tab — Configure [continuous integration](/docs/deploy/continuous-integration) to run when someone opens a new pull request in your dbt repository -- **API** tab — Use the [API](/docs/dbt-cloud-apis/admin-cloud-api) to trigger a job or send events to other systems - - - -## Schedule jobs - -To schedule your job to run at specific days, times, and intervals: -1. Go to the specific job settings, click **Edit**, then go to the **Triggers** section -2. Go to the **Schedule** tab, and toggle **Run on schedule** -3. Use either the [scheduled days](#schedule-days) or the [custom cron-powered schedule](#custom-cron-schedule) method to customize your desired days, times, and intervals. - -### Schedule days - -To set your job's schedule, use the **Schedule Days** option to choose specific days of the week, and select customized hours or intervals. - -Under **Timing**, you can either use customizable hours for jobs that need to run frequently throughout the day or exact intervals for jobs that need to run at specific times: - -- **Every n hours** — Use this option to set how often your job runs, in hours. Enter a number between 1 and 23 to represent the interval between job runs. For example, if you set it to "every 2 hours", the job will run every 2 hours from midnight UTC. This option is useful if you need to run jobs multiple times per day at regular intervals. - -- **At exact intervals** — Use this option to set specific times when your job should run. You can enter a comma-separated list of hours (in UTC) when you want the job to run. For example, if you set it to `0,12,23,` the job will run at midnight, noon, and 11 PM UTC. This option is useful if you want your jobs to run at specific times of day and don't need them to run more frequently than once a day. - -:::info - -dbt Cloud uses [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) (UTC) and does not account for translations to your specific timezone or take into consideration daylight savings time. 
For example: - -- 0 means 12am (midnight) UTC -- 12 means 12pm (afternoon) UTC -- 23 means 11pm UTC - -::: - -### Custom cron schedule - -To fully customize the scheduling of your job, choose the **Custom cron schedule** option and use the "cron" syntax. With this syntax, you can specify the minute, hour, day of the month, month, and day of the week, allowing you to set up complex schedules like running a job on the first Monday of each month. - - - - -Use tools such as [crontab.guru](https://crontab.guru/) to generate the correct cron syntax. This tool allows you to input cron snippets and returns their plain English translations. - -Refer to the following example snippets: - - -- `0 * * * *`: Every hour, at minute 0 -- `*/5 * * * *`: Every 5 minutes -- `5 4 * * *`: At exactly 4:05 AM UTC -- `30 */4 * * *`: At minute 30 past every 4th hour (e.g. 4:30AM, 8:30AM, 12:30PM, etc., all UTC) -- `0 0 */2 * *`: At midnight UTC every other day -- `0 0 * * 1`: At midnight UTC every Monday. - - -## Related docs - -- [Artifacts](/docs/deploy/artifacts) -- [Build and view your docs with dbt Cloud](/docs/collaborate/build-and-view-your-docs) -- [Source freshness](/docs/deploy/source-freshness) -- [Job commands](/docs/deploy/job-commands) -- [Webhooks for your jobs](/docs/deploy/webhooks) \ No newline at end of file diff --git a/website/docs/docs/deploy/jobs.md b/website/docs/docs/deploy/jobs.md new file mode 100644 index 00000000000..e8871b48427 --- /dev/null +++ b/website/docs/docs/deploy/jobs.md @@ -0,0 +1,23 @@ +--- +title: "Jobs in dbt Cloud" +sidebar_label: "About Jobs" +description: "Learn about deploy jobs and continuous integration (CI) jobs in dbt Cloud and what their differences are." +tags: [scheduler] +pagination_next: "docs/deploy/deploy-jobs" +--- + +In dbt Cloud, there are two types of jobs: +- [Deploy jobs](/docs/deploy/deploy-jobs) — To create and set up triggers for building production data assets +- [Continuous integration (CI) jobs](/docs/deploy/continuous-integration) — To create and set up triggers for checking code changes + +Below is a comparison table that describes how deploy jobs and CI jobs behave: + +| | Deploy Jobs | CI Jobs | +| --- | --- | --- | +| Purpose | Builds production data assets. | Builds and tests new code before merging changes into production. | +| Trigger types | Triggered by a schedule or by API. | Triggered by a commit to a PR or by API. | +| Destination | Builds into a production database and schema. | Builds into a staging database and ephemeral schema, lived for the lifetime of the PR. | +| Execution mode | Runs execute sequentially, so as to not have collisions on the underlying DAG. | Runs execute in parallel to promote team velocity. | +| Efficiency run savings | Detects over-scheduled jobs and cancels unnecessary runs to avoid queue clog. | Cancels existing runs when a newer commit is pushed to avoid redundant work. | +| State comparison | Only sometimes needs to detect state. | Almost always needs to compare state against the production environment to build on modified code and its dependents. | +| Job run duration | Limit is 24 hours. | Limit is 24 hours. 
| \ No newline at end of file diff --git a/website/docs/docs/deploy/monitor-jobs.md b/website/docs/docs/deploy/monitor-jobs.md index c4c5fcb73a5..45156bb341c 100644 --- a/website/docs/docs/deploy/monitor-jobs.md +++ b/website/docs/docs/deploy/monitor-jobs.md @@ -3,6 +3,7 @@ title: "Monitor jobs and alerts" id: "monitor-jobs" description: "Monitor your dbt Cloud job and set up alerts to ensure seamless orchestration and optimize your data transformations" tags: ["scheduler"] +pagination_next: "docs/deploy/run-visibility" --- Monitor your dbt Cloud jobs to help identify improvement and set up alerts to proactively alert the right people or team. @@ -10,6 +11,7 @@ Monitor your dbt Cloud jobs to help identify improvement and set up alerts to pr This portion of our documentation will go over dbt Cloud's various capabilities that help you monitor your jobs and set up alerts to ensure seamless orchestration, including: - [Run visibility](/docs/deploy/run-visibility) — View your run history to help identify where improvements can be made to scheduled jobs. +- [Retry jobs](/docs/deploy/retry-jobs) — Rerun your errored jobs from start or the failure point. - [Job notifications](/docs/deploy/job-notifications) — Receive email or slack notifications when a job run succeeds, fails, or is canceled. - [Webhooks](/docs/deploy/webhooks) — Use webhooks to send events about your dbt jobs' statuses to other systems. - [Leverage artifacts](/docs/deploy/artifacts) — dbt Cloud generates and saves artifacts for your project, which it uses to power features like creating docs for your project and reporting freshness of your sources. diff --git a/website/docs/docs/deploy/retry-jobs.md b/website/docs/docs/deploy/retry-jobs.md new file mode 100644 index 00000000000..ea616121f38 --- /dev/null +++ b/website/docs/docs/deploy/retry-jobs.md @@ -0,0 +1,32 @@ +--- +title: "Retry your dbt jobs" +sidebar_label: "Retry jobs" +description: "Rerun your errored jobs from start or the failure point." +--- + +If your dbt job run completed with a status of **Error**, you can rerun it from start or from the point of failure in dbt Cloud. + +## Prerequisites + +- You have a [dbt Cloud account](https://www.getdbt.com/signup). +- You must be using [dbt version](/docs/dbt-versions/upgrade-core-in-cloud) 1.6 or newer. +- The most recent run of the job hasn't completed successfully. The latest status of the run is **Error**. + - The job command that failed in the run must be one that supports the [retry command](/reference/commands/retry). + +## Rerun an errored job + +1. Select **Deploy** from the top navigation bar and choose **Run History.** +2. Choose the job run that has errored. +3. In the **Run Summary** tab on the job’s **Run** page, expand the run step that failed. An :x: denotes the failed step. +4. Examine the error message and determine how to fix it. After you have made your changes, save and commit them to your [Git repo](/docs/collaborate/git-version-control). +5. Return to your job’s **Run** page. In the upper right corner, click **Rerun** and choose **Rerun from start** or **Rerun from failure**. + + If you chose to rerun from the failure point, a **Rerun failed steps** modal opens. The modal lists the run steps that will be invoked: the failed step and any skipped steps. To confirm these run steps, click **Rerun from failure**. The job reruns from the failed command in the previously failed run. 
A banner at the top of the **Run Summary** tab captures this with the message, "This run resumed execution from last failed step". + + + +## Related content +- [Retry a failed run for a job](/dbt-cloud/api-v2#/operations/Retry%20a%20failed%20run%20for%20a%20job) API endpoint +- [Run visibility](/docs/deploy/run-visibility) +- [Jobs](/docs/deploy/jobs) +- [Job commands](/docs/deploy/job-commands) \ No newline at end of file diff --git a/website/docs/docs/deploy/slim-ci-jobs.md b/website/docs/docs/deploy/slim-ci-jobs.md deleted file mode 100644 index 35fa3eff46c..00000000000 --- a/website/docs/docs/deploy/slim-ci-jobs.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -title: "Slim CI jobs in dbt Cloud" -sidebar_label: "Slim CI jobs" -description: "Learn how to create and set up Slim CI checks to test code changes before deploying to production." ---- - -You can set up Slim [continuous integration](/docs/deploy/continuous-integration) (CI) jobs to run when someone opens a new pull request in your dbt repository. By running and testing only _modified_ models — which is what _slim_ refers to — dbt Cloud ensures these jobs are as efficient and resource conscientious as possible on your data platform. - -## Prerequisites - -- You have a dbt Cloud account. - - For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). -- You must be connected using dbt Cloud’s native integration with [GitHub account](/docs/cloud/git/connect-github), [GitLab account](/docs/cloud/git/connect-gitlab), or [Azure DevOps account](/docs/cloud/git/connect-azure-devops). - - If you’re using GitLab, you must use a paid or self-hosted account which includes support for GitLab webhooks. - - If you previously configured your dbt project by providing a generic git URL that clones using SSH, you must reconfigure the project to connect through dbt Cloud's native integration. - -## Set up Slim CI jobs - -dbt Labs recommends that you create your Slim CI job in a dedicated dbt Cloud [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. Having a separate environment dedicated for CI will provide better isolation between your temporary CI schemas builds and your production data builds. Additionally, sometimes teams need their Slim CI jobs to be triggered when a PR is made to a branch other than main. If your team maintains a staging branch in your release process, having a separate environment will allow you to set a [custom branch](/faqs/environments/custom-branch-settings), and accordingly the CI job in that dedicated environment will be triggered only when PRs are made to the specified, custom branch. - -1. On your deployment environment page, click **Create One** to create a new CI job. -2. In the **Execution Settings** section: - - For the option **Defer to a previous run state**, choose whichever production job that's set to run often. If you don't see any jobs to select from the dropdown, you first need to run a production job successfully. Deferral tells dbt Cloud to compare the manifest of the current CI job against the project representation that was materialized the last time the deferred job was run successfully. 
By setting this option, dbt Cloud only checks the modified code and compares the changes against what’s running in production, instead of building the full table or the entire DAG. - - - - - For the option **Commands**, enter `dbt build --select state:modified+` in the field. This informs dbt Cloud to build only new or changed models and their downstream dependents. Importantly, state comparison can only happen when there is a deferred job selected to compare state to. - - -3. In the **Triggers** section, choose the **Continuous Integration** (CI) tab. Then, enable the **Run on Pull Requests** option. This configures pull requests and new commits to be a trigger for the Slim CI job. - - -## Example pull requests - -The green checkmark means the dbt build and tests were successful. Clicking on the dbt Cloud section navigates you to the relevant CI run in dbt Cloud. - -### GitHub pull request example - - - -### GitLab pull request example - - - -### Azure DevOps pull request example - - - - -## Troubleshooting - -If you're experiencing any issues, review some of the common questions and answers below. - -
-**Reconnecting your dbt project to use dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps**

-If your dbt project relies on the generic git clone method that clones using SSH and deploy keys to connect to your dbt repo, you need to disconnect your repo and reconnect it using the native GitHub, GitLab, or Azure DevOps integration in order to enable dbt Cloud Slim CI.

-First, make sure you have the native GitHub authentication, native GitLab authentication, or native Azure DevOps authentication set up depending on which git provider you use. After you have gone through those steps, go to Account Settings, select Projects and click on the project you'd like to reconnect through native GitHub, GitLab, or Azure DevOps auth. Then click on the repository link.

-Once you're in the repository page, select Edit and then Disconnect Repository at the bottom.

-Confirm that you'd like to disconnect your repository. You should then see a new Configure a repository link in your old repository's place. Click through to the configuration page:

-Select the GitHub, GitLab, or AzureDevOps tab and reselect your repository. That should complete the setup of the project and enable you to set up a dbt Cloud CI job.

-**Error messages that refer to schemas from previous PRs**

-If you receive a schema-related error message referencing a previous PR, this is usually an indicator that you are not using a production job for your deferral and are instead using self. If the prior PR has already been merged, the prior PR's schema may have been dropped by the time the Slim CI job for the current PR is kicked off.

-To fix this issue, select a production job run to defer to instead of self.

-**Production job runs failing at the Clone Git Repository step**

-dbt Cloud can only checkout commits that belong to the original repository. dbt Cloud _cannot_ checkout commits that belong to a fork of that repository.

-If you receive the following error message at the Clone Git Repository step of your job run:

-Cloning into '/tmp/jobs/123456/target'...
-Successfully cloned repository.
-Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...
-Failed to checkout to specified revision.
-git checkout e845be54e6dc72342d5a8f814c8b3316ee220312
-fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312

-Double-check that your PR isn't trying to merge using a commit that belongs to a fork of the repository attached to your dbt project.

-**CI job not triggering for Virtual Private dbt users**

-To trigger jobs on dbt Cloud using the API, your Git provider needs to connect to your dbt Cloud account.

-If you're on a Virtual Private dbt Enterprise plan using security features like ingress PrivateLink or IP Allowlisting, registering CI hooks may not be available and can cause the job to fail silently.

-### Temp PR schema limitations

-If your temporary pull request schemas aren't dropping after a merge or close of the PR, it's likely due to one of the below scenarios. Review them for recommendations on how to resolve this:

-**You used dbt Cloud environment variables in your connection settings page**

-To resolve this, remove environment variables in your connections settings.

-**You have an empty/blank default schema**

-To change this, edit and fill in your default schema.

-**You have overridden the generate_schema_name macro**

-To resolve this, change your macro so that the temporary PR schema name contains the default prefix and review the guidance below:

-• ✅ Temporary PR schema name contains the prefix dbt_cloud_pr_ (like dbt_cloud_pr_123_456_marketing)
-• ❌ Temporary PR schema name doesn't contain the prefix dbt_cloud_pr_ (like marketing).

-**You have overridden the generate_database_name macro**

-If you assume that the project's default connection is to a database named analytics, review the guidance below to resolve this:

-• ✅ Database remains the same as the connection default (like analytics)
-• ❌ Database has changed from the default connection (like dev).
    diff --git a/website/docs/docs/deploy/source-freshness.md b/website/docs/docs/deploy/source-freshness.md index 78500416c56..2f9fe6bc007 100644 --- a/website/docs/docs/deploy/source-freshness.md +++ b/website/docs/docs/deploy/source-freshness.md @@ -13,7 +13,7 @@ dbt Cloud provides a helpful interface around dbt's [source data freshness](/doc [`dbt build`](reference/commands/build) does _not_ include source freshness checks when building and testing resources in your DAG. Instead, you can use one of these common patterns for defining jobs: - Add `dbt build` to the run step to run models, tests, and so on. - Select the **Generate docs on run** checkbox to automatically [generate project docs](/docs/collaborate/build-and-view-your-docs#set-up-a-documentation-job). -- Select the **Run on source freshness** checkbox to enable [source freshness](#checkbox) as the first to step of the job. +- Select the **Run source freshness** checkbox to enable [source freshness](#checkbox) as the first step of the job. @@ -24,7 +24,7 @@ Review the following options and outcomes: | Options | Outcomes | |--------| ------- | | **Select checkbox ** | The **Run source freshness** checkbox in your **Execution Settings** will run `dbt source freshness` as the first step in your job and won't break subsequent steps if it fails. If you wanted your job dedicated *exclusively* to running freshness checks, you still need to include at least one placeholder step, such as `dbt compile`. | -| **Add as a run step** | Add the `dbt source freshness` command to a job anywhere in your list of run steps. However, if your source data is out of date — this step will "fail', and subsequent steps will not run. dbt Cloud will trigger email notifications (if configured) based on the end state of this step.

    You can create a new job to snapshot source freshness.

    If you *do not* want your models to run if your source data is out of date, then it could be a good idea to run `dbt source freshness` as the first step in your job. Otherwise, we recommend adding `dbt source freshness` as the last step in the job, or creating a separate job just for this task. | +| **Add as a run step** | Add the `dbt source freshness` command to a job anywhere in your list of run steps. However, if your source data is out of date — this step will "fail", and subsequent steps will not run. dbt Cloud will trigger email notifications (if configured) based on the end state of this step.

    You can create a new job to snapshot source freshness.

    If you *do not* want your models to run if your source data is out of date, then it could be a good idea to run `dbt source freshness` as the first step in your job. Otherwise, we recommend adding `dbt source freshness` as the last step in the job, or creating a separate job just for this task. | diff --git a/website/docs/docs/deploy/webhooks.md b/website/docs/docs/deploy/webhooks.md index 4ce089daa89..f6c766ab201 100644 --- a/website/docs/docs/deploy/webhooks.md +++ b/website/docs/docs/deploy/webhooks.md @@ -8,7 +8,7 @@ With dbt Cloud, you can create outbound webhooks to send events (notifications) A webhook is an HTTP-based callback function that allows event-driven communication between two different web applications. This allows you to get the latest information on your dbt jobs in real time. Without it, you would need to make API calls repeatedly to check if there are any updates that you need to account for (polling). Because of this, webhooks are also called _push APIs_ or _reverse APIs_ and are often used for infrastructure development. -dbt Cloud sends a JSON payload to your application's endpoint URL when your webhook is triggered. You can send a [Slack](/guides/orchestration/webhooks/zapier-slack) notification, a [Microsoft Teams](/guides/orchestration/webhooks/zapier-ms-teams) notification, [open a PagerDuty incident](/guides/orchestration/webhooks/serverless-pagerduty) when a dbt job fails, [and more](/guides/orchestration/webhooks). +dbt Cloud sends a JSON payload to your application's endpoint URL when your webhook is triggered. You can send a [Slack](/guides/zapier-slack) notification, a [Microsoft Teams](/guides/zapier-ms-teams) notification, [open a PagerDuty incident](/guides/serverless-pagerduty) when a dbt job fails. You can create webhooks for these events from the [dbt Cloud web-based UI](#create-a-webhook-subscription) and by using the [dbt Cloud API](#api-for-webhooks): @@ -18,7 +18,7 @@ You can create webhooks for these events from the [dbt Cloud web-based UI](#crea dbt Cloud retries sending each event five times. dbt Cloud keeps a log of each webhook delivery for 30 days. Every webhook has its own **Recent Deliveries** section, which lists whether a delivery was successful or failed at a glance. -A webhook in dbt Cloud has a timeout of 10 seconds. This means that if the endpoint doesn't respond within 10 seconds, the webhook processor will time out. +A webhook in dbt Cloud has a timeout of 10 seconds. This means that if the endpoint doesn't respond within 10 seconds, the webhook processor will time out. This can result in a situation where the client responds successfully after the 10 second timeout and records a success status while the dbt cloud webhooks system will interpret this as a failure. :::tip Videos If you're interested in course learning with videos, check out the [Webhooks on-demand course](https://courses.getdbt.com/courses/webhooks) from dbt Labs. @@ -31,7 +31,7 @@ You can also check out the free [dbt Fundamentals course](https://courses.getdbt - For `write` access to webhooks: - **Enterprise plan accounts** — Permission sets are the same for both API service tokens and the dbt Cloud UI. You, or the API service token, must have the [Account Admin](/docs/cloud/manage-access/enterprise-permissions#account-admin), [Admin](/docs/cloud/manage-access/enterprise-permissions#admin), or [Developer](/docs/cloud/manage-access/enterprise-permissions#developer) permission set. 
- **Team plan accounts** — For the dbt Cloud UI, you need to have a [Developer license](/docs/cloud/manage-access/self-service-permissions). For API service tokens, you must assign the service token to have the [Account Admin or Member](/docs/dbt-cloud-apis/service-tokens#team-plans-using-service-account-tokens) permission set. -- You have a multi-tenant deployment model in dbt Cloud. For more information, refer to [Tenancy](/docs/cloud/about-cloud/tenancy). +- You have a multi-tenant or an AWS single-tenant deployment model in dbt Cloud. For more information, refer to [Tenancy](/docs/cloud/about-cloud/tenancy). ## Create a webhook subscription {#create-a-webhook-subscription} @@ -167,7 +167,7 @@ An example of a webhook payload for an errored run: You can use the dbt Cloud API to create new webhooks that you want to subscribe to, get detailed information about your webhooks, and to manage the webhooks that are associated with your account. The following sections describe the API endpoints you can use for this. :::info Access URLs -dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. People on Enterprise plans can choose to have their account hosted in any one of these regions. This section uses `cloud.getdbt.com` (which is for North America) as part of the endpoint but your access URL might be different. For a complete list of available dbt Cloud access URLs, refer to [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. People on Enterprise plans can choose to have their account hosted in any one of these regions. For a complete list of available dbt Cloud access URLs, refer to [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). ::: ### List all webhook subscriptions @@ -175,12 +175,13 @@ List all webhooks that are available from a specific dbt Cloud account. #### Request ```shell -GET https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscriptions +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscriptions ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhooks are associated with. | #### Response sample @@ -265,11 +266,12 @@ Get detailed information about a specific webhook. #### Request ```shell -GET https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want detailed information on. | @@ -322,7 +324,7 @@ Create a new outbound webhook and specify the endpoint URL that will be subscrib #### Request sample ```shell -POST https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscriptions +POST https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscriptions ``` ```json @@ -344,6 +346,7 @@ POST https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription #### Path parameters | Name | Description | | --- | --- | +| `your access URL` | The login URL for your dbt Cloud account. 
| | `account_id` | The dbt Cloud account the webhook is associated with. | #### Request parameters @@ -407,7 +410,7 @@ Update the configuration details for a specific webhook. #### Request sample ```shell -PUT https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +PUT https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} ``` ```json @@ -429,6 +432,7 @@ PUT https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/ #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want to update. | @@ -491,12 +495,13 @@ Test a specific webhook. #### Request ```shell -GET https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id}/test +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id}/test ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want to test. | @@ -518,12 +523,13 @@ Delete a specific webhook. #### Request ```shell -DELETE https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +DELETE https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want to delete. | @@ -543,5 +549,5 @@ DELETE https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscripti ## Related docs - [dbt Cloud CI](/docs/deploy/continuous-integration) -- [Use dbt Cloud's webhooks with other SaaS apps](/guides/orchestration/webhooks) +- [Use dbt Cloud's webhooks with other SaaS apps](/guides) diff --git a/website/docs/docs/environments-in-dbt.md b/website/docs/docs/environments-in-dbt.md index 54eaa68f667..f0691761dd6 100644 --- a/website/docs/docs/environments-in-dbt.md +++ b/website/docs/docs/environments-in-dbt.md @@ -2,6 +2,7 @@ title: "About environments" id: "environments-in-dbt" hide_table_of_contents: true +pagination_next: null --- In software engineering, environments are used to enable engineers to develop and test code without impacting the users of their software. 
Typically, there are two types of environments in dbt: @@ -18,7 +19,7 @@ Configure environments to tell dbt Cloud or dbt Core how to build and execute yo @@ -32,7 +33,7 @@ Configure environments to tell dbt Cloud or dbt Core how to build and execute yo ## Related docs -- [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) +- [dbt Cloud environment best practices](/guides/set-up-ci) - [Deployment environments](/docs/deploy/deploy-environments) - [About dbt Core versions](/docs/dbt-versions/core) - [Set Environment variables in dbt Cloud](/docs/build/environment-variables#special-environment-variables) diff --git a/website/docs/docs/introduction.md b/website/docs/docs/introduction.md index c4cfd6e45ac..61cda6e1d3e 100644 --- a/website/docs/docs/introduction.md +++ b/website/docs/docs/introduction.md @@ -1,6 +1,8 @@ --- title: "What is dbt?" id: "introduction" +pagination_next: null +pagination_prev: null --- @@ -28,6 +30,7 @@ Read more about why we want to enable analysts to work more like software engine You can access dbt using dbt Core or dbt Cloud. dbt Cloud is built around dbt Core, but it also provides: - Web-based UI so it’s more accessible +- dbt Cloud-powered command line (CLI) to develop, test, version control dbt projects, and run dbt commands - Hosted environment so it’s faster to get up and running - Differentiated features, such as metadata, in-app job scheduler, observability, integrations with other tools, integrated development environment (IDE), and more. @@ -35,11 +38,12 @@ You can learn about plans and pricing on [www.getdbt.com](https://www.getdbt.com ### dbt Cloud -dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, and investigate data models all in one web-based UI. Learn more about [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) and try one of the [dbt Cloud quickstarts](/quickstarts). +dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, and investigate data models all in one web-based UI. It also natively supports developing using a command line with the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). +Learn more about [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) and try one of the [dbt Cloud quickstarts](/guides). ### dbt Core -dbt Core is an open-source tool that enables data teams to transform data using analytics engineering best practices. You can install and use dbt Core on the command line. Learn more with the [quickstart for dbt Core](/quickstarts/codespace). +dbt Core is an open-source tool that enables data teams to transform data using analytics engineering best practices. You can install and use dbt Core on the command line. Learn more with the [quickstart for dbt Core](/guides/codespace). ## The power of dbt @@ -58,7 +62,7 @@ As a dbt user, your main focus will be on writing models (i.e. 
select queries) t ### Related docs -- [Quickstarts for dbt](/quickstarts) -- [Best practice guides](/guides/best-practices) +- [Quickstarts for dbt](/guides) +- [Best practice guides](/best-practices) - [What is a dbt Project?](/docs/build/projects) - [dbt run](/docs/running-a-dbt-project/run-your-dbt-projects) diff --git a/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md b/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md index 9bd57e0b280..b3b6ffb3e45 100644 --- a/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md +++ b/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md @@ -1,14 +1,25 @@ --- title: "Run your dbt projects" id: "run-your-dbt-projects" +pagination_prev: null --- -You can run your dbt projects with [dbt Cloud](/docs/cloud/about-cloud/dbt-cloud-features) and [dbt Core](https://github.com/dbt-labs/dbt-core). dbt Cloud is a hosted application where you can develop directly from a web browser. dbt Core is an open source project where you can develop from the command line. +You can run your dbt projects with [dbt Cloud](/docs/cloud/about-cloud/dbt-cloud-features) or [dbt Core](https://github.com/dbt-labs/dbt-core): -Among other features, dbt Cloud provides a development environment to help you build, test, run, and [version control](/docs/collaborate/git-version-control) your project faster. It also includes an easier way to share your [dbt project's documentation](/docs/collaborate/build-and-view-your-docs) with your team. These development tasks are directly built into dbt Cloud for an _integrated development environment_ (IDE). Refer to [Develop in the Cloud](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) for more details. +- **dbt Cloud**: A hosted application where you can develop directly from a web browser using the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). It also natively supports developing using a command line interface, [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). Among other features, dbt Cloud provides: -With dbt Core, you can run your dbt projects from the command line. The command line interface (CLI) is available from your computer's terminal application such as Terminal and iTerm. When using the command line, you can run commands and do other work from the current working directory on your computer. Before running the dbt project from the command line, make sure you are working in your dbt project directory. Learning terminal commands such as `cd` (change directory), `ls` (list directory contents), and `pwd` (present working directory) can help you navigate the directory structure on your system. + - Development environment to help you build, test, run, and [version control](/docs/collaborate/git-version-control) your project faster. + - Share your [dbt project's documentation](/docs/collaborate/build-and-view-your-docs) with your team. + - Integrates with the dbt Cloud IDE, allowing you to run development tasks and environment in the dbt Cloud UI for a seamless experience. + - The dbt Cloud CLI to develop and run dbt commands against your dbt Cloud development environment from your local command line. + - For more details, refer to [Develop in the Cloud](/docs/cloud/about-cloud-develop). -When running your project from dbt Core or dbt Cloud, the commands you commonly use are: +- **dbt Core**: An open source project where you can develop from the [command line](/docs/core/about-dbt-core). 
+ +The dbt Cloud CLI and dbt Core are both command line tools that enable you to run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). + +The command line is available from your computer's terminal application such as Terminal and iTerm. With the command line, you can run commands and do other work from the current working directory on your computer. Before running the dbt project from the command line, make sure you are working in your dbt project directory. Learning terminal commands such as `cd` (change directory), `ls` (list directory contents), and `pwd` (present working directory) can help you navigate the directory structure on your system. + +In dbt Cloud or dbt Core, the commands you commonly use are: - [dbt run](/reference/commands/run) — Runs the models you defined in your project - [dbt build](/reference/commands/build) — Builds and tests your selected resources such as models, seeds, snapshots, and tests @@ -20,6 +31,7 @@ For information on all dbt commands and their arguments (flags), see the [dbt co - [How we set up our computers for working on dbt projects](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) - [Model selection syntax](/reference/node-selection/syntax) +- [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) - [Cloud IDE features](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#ide-features) - [Does dbt offer extract and load functionality?](/faqs/Project/transformation-tool) - [Why does dbt compile need a data platform connection](/faqs/Warehouse/db-connection-dbt-compile) diff --git a/website/docs/docs/running-a-dbt-project/using-threads.md b/website/docs/docs/running-a-dbt-project/using-threads.md index 519ce8aab81..5eede7abc27 100644 --- a/website/docs/docs/running-a-dbt-project/using-threads.md +++ b/website/docs/docs/running-a-dbt-project/using-threads.md @@ -3,7 +3,7 @@ title: "Using threads" id: "using-threads" sidebar_label: "Use threads" description: "Understand what threads mean and how to use them." - +pagination_next: null --- When dbt runs, it creates a directed acyclic graph (DAG) of links between models. The number of threads represents the maximum number of paths through the graph dbt may work on at once – increasing the number of threads can minimize the run time of your project. @@ -18,7 +18,7 @@ Generally the optimal number of threads depends on your data warehouse and its c You can use a different number of threads than the value defined in your target by using the `--threads` option when executing a dbt command. -You will define the number of threads in your `profiles.yml` file (for CLI-users only), dbt Cloud job definition, and dbt Cloud development credentials under your profile. +You will define the number of threads in your `profiles.yml` file (for dbt Core users only), dbt Cloud job definition, and dbt Cloud development credentials under your profile. 
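As a quick sketch of the override described above, a single invocation can pass `--threads` to take precedence over the value configured in your target; the thread count below is only an illustrative value.

```shell
# Use the threads value defined in profiles.yml (dbt Core) or your dbt Cloud job/credentials.
dbt run

# Override the configured value for this invocation only; 8 is a placeholder.
dbt run --threads 8
```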
## Related docs diff --git a/website/docs/docs/supported-data-platforms.md index a8ae33a7e0c..c0c9a30db36 100644 --- a/website/docs/docs/supported-data-platforms.md +++ b/website/docs/docs/supported-data-platforms.md @@ -4,75 +4,43 @@ id: "supported-data-platforms" sidebar_label: "Supported data platforms" description: "Connect dbt to any data platform in dbt Cloud or dbt Core, using a dedicated adapter plugin" hide_table_of_contents: true +pagination_next: "docs/connect-adapters" +pagination_prev: null --- -dbt connects to and runs SQL against your database, warehouse, lake, or query engine. These SQL-speaking platforms are collectively referred to as _data platforms_. dbt connects with data platforms by using a dedicated adapter plugin for each. Plugins are built as Python modules that dbt Core discovers if they are installed on your system. Read [What are Adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) for more info. +dbt connects to and runs SQL against your database, warehouse, lake, or query engine. These SQL-speaking platforms are collectively referred to as _data platforms_. dbt connects with data platforms by using a dedicated adapter plugin for each. Plugins are built as Python modules that dbt Core discovers if they are installed on your system. Refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide for more info. -You can [connect](/docs/connect-adapters) to adapters and data platforms either directly in the dbt Cloud user interface (UI) or install them manually using the command line (CLI). There are two types of adapters available and to evaluate quality and maintenance, we recommend you consider their verification status. You can also [further configure](/reference/resource-configs/postgres-configs) your specific data platform to optimize performance. +You can [connect](/docs/connect-adapters) to adapters and data platforms natively in dbt Cloud or install them manually using dbt Core. -- **Verified** — dbt Labs' strict [adapter program](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter) assures users of trustworthy, tested, and regularly updated adapters for production use. Verified adapters earn a "Verified" status, providing users with trust and confidence. -- **Community** — [Community adapters](/docs/community-adapters) are open-source and maintained by community members. +You can also further customize how dbt works with your specific data platform via configuration: see [Configuring Postgres](/reference/resource-configs/postgres-configs) for an example. + +import MSCallout from '/snippets/_microsoft-adapters-soon.md'; + + + +## Types of Adapters + +There are three types of adapters available today: + +- **Verified** — [Verified adapters](verified-adapters) are those that have completed a rigorous verification process in collaboration with dbt Labs. +- **Trusted** — [Trusted adapters](trusted-adapters) are those where the adapter maintainers have agreed to meet a higher standard of quality. +- **Community** — [Community adapters](community-adapters) are open-source and maintained by community members. ### Verified adapters The following are **Verified adapters** ✓ you can connect to either in dbt Cloud or dbt Core:
    -* Install these adapters using the CLI as they're not currently supported in dbt Cloud.
    +import AdaptersVerified from '/snippets/_adapters-verified.md'; + + + +### Trusted adapters + +The following are **Trusted adapters** ✓ you can connect to in dbt Core: + +import AdaptersTrusted from '/snippets/_adapters-trusted.md'; + + + +
    * Install these adapters using dbt Core as they're not currently supported in dbt Cloud.
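As a hedged sketch of what installing an adapter with dbt Core typically looks like: adapters are Python packages installed alongside `dbt-core`, commonly with `pip`. The `dbt-postgres` package below is only an example; substitute the adapter for your data platform.

```shell
# Install dbt Core and one adapter plugin from PyPI (dbt-postgres is an example).
python -m pip install dbt-core dbt-postgres

# Verify that dbt can see the installed adapter plugin.
dbt --version
```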
+ diff --git a/website/docs/docs/trusted-adapters.md new file mode 100644 index 00000000000..20d61f69575 --- /dev/null +++ b/website/docs/docs/trusted-adapters.md @@ -0,0 +1,41 @@ +--- +title: "Trusted adapters" +id: "trusted-adapters" +hide_table_of_contents: true +--- + +Trusted adapters are adapters not maintained by dbt Labs that we feel comfortable recommending to users for use in production. + +Free and open-source tools for the data professional are increasingly abundant. This is by and large a *good thing*; however, it requires due diligence that wasn't required in a paid-license, closed-source software world. As a user, there are important questions to answer before taking a dependency on an open-source project. The trusted adapter designation is meant to streamline this process for end users. +
Considerations for depending on an open-source project + +1. Does it work? +2. Does anyone "own" the code, or is anyone liable for ensuring it works? +3. Do bugs get fixed quickly? +4. Does it stay up-to-date with new Core features? +5. Is the usage substantial enough to self-sustain? +6. What risks do I take on by taking a dependency on this library? +
    + +### Trusted adapter specifications + +Refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide for more information, particularly if you are an adapter maintainer considering having your adapter be added to the trusted list. + +### Trusted vs Verified + +The Verification program exists to highlight adapters that meets both of the following criteria: + +- the guidelines given in the Trusted program, +- formal agreements required for integration with dbt Cloud + +For more information on the Verified Adapter program, reach out the [dbt Labs partnerships team](mailto:partnerships@dbtlabs.com) + +### Trusted adapters + +The following are **Trusted adapters** ✓ you can connect to in dbt Core: + +import AdaptersTrusted from '/snippets/_adapters-trusted.md'; + + diff --git a/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md b/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md index 8c004d865bb..4f4621fa860 100644 --- a/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md +++ b/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md @@ -1,17 +1,46 @@ --- title: "Available integrations" id: avail-sl-integrations -description: "Review a wide range of partners you can integrate and query with the dbt Semantic Layer." +description: "Discover the diverse range of partners that seamlessly integrate with the powerful dbt Semantic Layer, allowing you to query and unlock valuable insights from your data ecosystem." +tags: [Semantic Layer] sidebar_label: "Available integrations" +hide_table_of_contents: true +meta: + api_name: dbt Semantic Layer APIs --- -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. + -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. -::: +There are a number of data applications that seamlessly integrate with the dbt Semantic Layer, powered by MetricFlow, from business intelligence tools to notebooks, spreadsheets, data catalogs, and more. These integrations allow you to query and unlock valuable insights from your data ecosystem. + +Use the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) to simplify metric queries, optimize your development workflow, and reduce coding. This approach also ensures data governance and consistency for data consumers. + +import AvailIntegrations from '/snippets/_sl-partner-links.md'; + + + +### Custom integration + +- You can create custom integrations using different languages and tools. We support connecting with JDBC, ADBC, and GraphQL APIs. For more info, check out [our examples on GitHub](https://github.com/dbt-labs/example-semantic-layer-clients/). +- You can also connect to tools that allow you to write SQL. These tools must meet one of the two criteria: + + - Supports a generic JDBC driver option (such as DataGrip) or + - Uses Arrow Flight SQL JDBC driver version 12.0.0 or higher. 
+ +## Related docs + +- {frontMatter.meta.api_name} to learn how to integrate and query your metrics in downstream tools. +- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) +- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. + + + + + +import DeprecationNotice from '/snippets/_sl-deprecation-notice.md'; + + A wide variety of data applications across the modern data stack natively integrate with the dbt Semantic Layer and dbt metrics — from Business Intelligence tools to notebooks, data catalogs, and more. @@ -19,13 +48,10 @@ The dbt Semantic Layer integrations are capable of querying dbt metrics, importi For information on the partner integrations, their documentation, and more — refer to the [dbt Semantic Layer integrations](https://www.getdbt.com/product/semantic-layer-integrations) page. - - + ## Related docs -- [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer) to learn more about the dbt Semantic Layer. -- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more information on plan availability. -- [Public Preview information](/docs/use-dbt-semantic-layer/quickstart-semantic-layer#public-preview) to understand what Public Preview for the dbt Semantic Layer means. -- [dbt Semantic Layer partner integration guide](/guides/dbt-ecosystem/sl-partner-integration-guide) for information about partner integration guidelines, product roadmap, and API connectivity. -- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) to understand best practices for designing and structuring metrics in your dbt project. +- [About the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) + + diff --git a/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md deleted file mode 100644 index 95962610f8d..00000000000 --- a/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md +++ /dev/null @@ -1,146 +0,0 @@ ---- -title: "dbt Semantic Layer" -id: dbt-semantic-layer -description: "Introducing the dbt Semantic Layer" -sidebar_label: "dbt Semantic Layer" ---- - -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. - -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. - -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. -::: - -The dbt Semantic Layer allows data teams to centrally define essential business metrics like `revenue`, `customer`, and `churn` in the modeling layer (your dbt project) for consistent self-service within downstream data tools like BI and metadata management solutions. The dbt Semantic Layer provides the flexibility to define metrics on top of your existing models and then query those metrics and models in your analysis tools of choice. - -The result? 
You have less duplicate coding for data teams and more consistency for data consumers. - -The dbt Semantic Layer has four main parts: - -- Define your metrics in version-controlled dbt project code using MetricFlow -- Import your metric definitions via the [Discovery API](/docs/dbt-cloud-apis/discovery-api) -- Query your metric data via the dbt Proxy Server -- Explore and analyze dbt metrics in downstream tools - - - - -### What makes the dbt Semantic Layer different? - -The dbt Semantic Layer reduces code duplication and inconsistency regarding your business metrics. By moving metric definitions out of the BI layer and into the modeling layer, data teams can feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. You can also use the dbt Semantic Layer to query models and use macros. - - -## Prerequisites -To use the dbt Semantic Layer, you’ll need to meet the following: - - - - - -## Public Preview - -The dbt Semantic Layer is currently available for Public Preview, which means: - -— **Who?** The dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise) during Public Preview. Review [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more info on plan availability. - -- Team and Enterprise accounts will be able to set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated -partner tool to import metric definition. -- Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse dbt metrics in external tools, which requires access to the Discovery API. - -— **What?** Public Previews provide early access to new features. The Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality that is not backwards compatible. dbt Labs provides support, and relevant service level objectives (SLOs) apply. We will introduce pricing for the dbt Semantic Layer alongside the General Available (GA) release (future GA date to be announced). - -— **When?** Public Preview will end once the dbt Semantic Layer is available for GA. After GA, the dbt Semantic Layer will only be available to dbt Cloud **Team** and **Enterprise** plans. - -— **Where?** Public Preview is enabled at the account level so you don’t need to worry about enabling it per user. - -## Product architecture - -The dbt Semantic Layer product architecture includes four primary components: - -| Components | Information | Developer plans | Team plans | Enterprise plans | License | -| --- | --- | :---: | :---: | :---: | --- | -| **[dbt project](/docs/build/metrics)** | Define models and metrics in dbt Core. | ✅ | ✅ | ✅ | Open source, Core | -| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | -| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅

    _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | -| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise | - - - -dbt Semantic Layer integrations will: - -- Leverage the Discovery API to fetch a list of objects and their attributes, like metrics -- Generate a dbt-SQL statement -- Then query the SQL proxy to evaluate the results of this statement - - -## Manage metrics - -:::info 📌 - -New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. - -::: - -If you're not sure whether to define a metric in dbt or not, ask yourself the following: - -> *Is this something our teams consistently need to report on?* - -An important business metric should be: - -- Well-defined (the definition is agreed upon throughout the entire organization) -- Time-bound (able to be compared across time) - -A great example of this is **revenue** — it can be aggregated on multiple levels (weekly, monthly, etc) and is key for the broader business to understand. - -- ✅ `Monthly recurring revenue` or `Weekly active users` or `Average order value` -- ❌ `1-off experimental metric` - - -### Design and define metrics - -**Design metrics** -To read about best practices on structuring and organizing your metrics, review our [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog post first. - -**Define metrics** -You can define your metrics in `.yml` files nested under a metrics key and to design or define your own metrics in your dbt project, review the following documents:
    - -- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog to understand best practices for designing and structuring metrics in your dbt project -- [dbt metrics](docs/build/metrics) for in-depth detail on attributes, filters, how to define and query your metrics and [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics) -- [dbt Semantic Layer quickstart](/docs/use-dbt-semantic-layer/quickstart-semantic-layer) to get started -- [Understanding the components of the dbt Semantic Layer](https://docs.getdbt.com/blog/understanding-the-components-of-the-dbt-semantic-layer) blog post to see further examples - -Review our helpful metrics video below, which explains what metrics are, why they're important and how you can get started: - - - -## Related questions - -
    - How are you storing my data? -
    -
    The dbt Semantic Layer does not store, or cache, or log your data. On each query to the Semantic Layer, the resulting data passes through dbt Cloud servers where it is never stored, cached, or logged. The data from your data platform gets routed through dbt Cloud servers, to your connecting data tool.
    -
    -
    -
    - Is the dbt Semantic Layer open source? -
    -
    Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    - -During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise).



    - -
      -
    • dbt Core users can define metrics in their dbt Core projects and calculate them using macros from the metrics package. To use the dbt Semantic Layer integrations, users will need to have a dbt Cloud account.


    • -
    • Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API.


    • -
    • Team and Enterprise accounts will be able to set up the Semantic Layer and Discovery API in the integrated partner tool to import metric definition.
    • -
    -
    -
    - Is there a dbt Semantic Layer discussion hub? -
    -
    Yes absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer slack channel for all things related to the dbt Semantic Layer. -
    -
    -
    -

    diff --git a/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md b/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md new file mode 100644 index 00000000000..8387e934d84 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md @@ -0,0 +1,160 @@ +--- +title: "dbt Semantic Layer" +id: dbt-sl +description: "Learn how the dbt Semantic Layer enables data teams to centrally define and query metrics." +sidebar_label: "About the dbt Semantic Layer" +tags: [Semantic Layer] +hide_table_of_contents: true +pagination_next: "docs/use-dbt-semantic-layer/quickstart-sl" +pagination_prev: null +--- + + + + + +The dbt Semantic Layer, powered by [MetricFlow](/docs/build/about-metricflow), simplifies the process of defining and using critical business metrics, like `revenue` in the modeling layer (your dbt project). By centralizing metric definitions, data teams can ensure consistent self-service access to these metrics in downstream data tools and applications. The dbt Semantic Layer eliminates duplicate coding by allowing data teams to define metrics on top of existing models and automatically handles data joins. + +Moving metric definitions out of the BI layer and into the modeling layer allows data teams to feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. + +Refer to the [Why we need a universal semantic layer](https://www.getdbt.com/blog/universal-semantic-layer/) blog post to learn more. + +## Explore the dbt Semantic Layer + + +import Features from '/snippets/_sl-plan-info.md' + + + +
    + + + + + + + + + + + +
    + +
    + + + +import DeprecationNotice from '/snippets/_sl-deprecation-notice.md'; + + + +The dbt Semantic Layer allows your data teams to centrally define essential business metrics like `revenue`, `customer`, and `churn` in the modeling layer (your dbt project) for consistent self-service within downstream data tools like BI and metadata management solutions. The dbt Semantic Layer provides the flexibility to define metrics on top of your existing models and then query those metrics and models in your analysis tools of choice. + +Resulting in less duplicate coding for data teams and more consistency for data consumers. + +The dbt Semantic Layer has these main parts: + +- Define your metrics in version-controlled dbt project code using [MetricFlow](/docs/build/about-metricflow) + * dbt_metrics is now deprecated +- Import your metric definitions using the [Discovery API](/docs/dbt-cloud-apis/discovery-api) +- Query your metric data with the dbt Proxy Server +- Explore and analyze dbt metrics in downstream tools + +### What makes the dbt Semantic Layer different? + +The dbt Semantic Layer reduces code duplication and inconsistency regarding your business metrics. By moving metric definitions out of the BI layer and into the modeling layer, your data teams can feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. You can also use the dbt Semantic Layer to query models and use macros. + + +## Prerequisites + + + + + + +## Manage metrics + +:::info 📌 + +New to dbt or metrics? Check out our [quickstart guide](/guides) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. + +::: + +If you're not sure whether to define a metric in dbt or not, ask yourself the following: + +> *Is this something our teams consistently need to report on?* + +An important business metric should be: + +- Well-defined (the definition is agreed upon throughout the entire organization) +- Time-bound (able to be compared across time) + +A great example of this is **revenue**. It can be aggregated on multiple levels (weekly, monthly, and so on) and is key for the broader business to understand. + +- ✅ `Monthly recurring revenue` or `Weekly active users` or `Average order value` +- ❌ `1-off experimental metric` + + +### Design and define metrics + +You can design and define your metrics in `.yml` files nested under a metrics key in your dbt project. For more information, refer to these docs:
+ +- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, filters, how to define and query your metrics, and the [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics) +- [dbt Semantic Layer quickstart](/docs/use-dbt-semantic-layer/quickstart-semantic-layer) to get started + +## Related questions + +<details>
    + How do I migrate from the legacy Semantic Layer to the new one? +
    +
    If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
    +
    +
    + +
    + How are you storing my data? +
    +
    The dbt Semantic Layer doesn't store, cache, or log your data. On each query to the Semantic Layer, the resulting data passes through dbt Cloud servers where it's never stored, cached, or logged. The data from your data platform gets routed through dbt Cloud servers to your connecting data tool.
    +
    +
    +
    + Is the dbt Semantic Layer open source? +
    +
    Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL-licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    + +During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers — Developer, Team, and Enterprise.



    + +
    +
    +
    + Is there a dbt Semantic Layer discussion hub? +
    +
Yes, absolutely! Join the dbt Slack community and the #dbt-cloud-semantic-layer Slack channel for all things related to the dbt Semantic Layer. 
    +
    +
    +

    +
diff --git a/website/docs/docs/use-dbt-semantic-layer/gsheets.md b/website/docs/docs/use-dbt-semantic-layer/gsheets.md new file mode 100644 index 00000000000..cb9f4014803 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/gsheets.md @@ -0,0 +1,63 @@ +--- +title: "Google Sheets (beta)" +description: "Integrate with Google Sheets to query your metrics in a spreadsheet." +tags: [Semantic Layer] +sidebar_label: "Google Sheets (beta)" +--- + +:::info Beta functionality +Google Sheets integration with the dbt Semantic Layer is a [beta](/docs/dbt-versions/product-lifecycles#dbt-cloud) feature. +::: + +The dbt Semantic Layer offers a seamless integration with Google Sheets through a custom menu. This add-on allows you to build dbt Semantic Layer queries and return data on your metrics directly within Google Sheets. + +## Prerequisites + +- You have [configured the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) and are using dbt v1.6 or higher. +- You have a Google account with access to Google Sheets. +- You can install Google add-ons. +- You have a dbt Cloud Environment ID and a [service token](/docs/dbt-cloud-apis/service-tokens) to authenticate with from a dbt Cloud account. + +## Installing the add-on + +1. Navigate to the [dbt Semantic Layer for Sheets App](https://gsuite.google.com/marketplace/app/foo/392263010968) to install the add-on. + + - You can also find it in Google Sheets by going to [**Extensions -> Add-on -> Get add-ons**](https://support.google.com/docs/answer/2942256?hl=en&co=GENIE.Platform%3DDesktop&oco=0#zippy=%2Cinstall-add-ons%2Cinstall-an-add-on) and searching for it there. +2. After installing, open the Add-On menu and select the "dbt Semantic Layer for Sheets". This will open a custom menu to the right-hand side of your screen. +3. Authenticate with your Host, dbt Cloud Environment ID, and Service Token. +4. Start querying your metrics using the **Query Builder**. For more info on the menu functions, refer to [Custom menu functions](#custom-menu-functions). + +When querying your data with Google Sheets: + +- It returns the data to the cell you have clicked on. +- The custom menu operation has a timeout limit of six (6) minutes. +- If you're using this extension, make sure you're signed into Chrome with the same Google profile you used to set up the Add-On. Log in with one Google profile at a time as using multiple Google profiles at once might cause issues. + + +## Custom menu functions + +The custom menu provides the following capabilities: + +| Menu items | Description | +|---------------|-------------------------------------------------------| +| Metrics | Search and select metrics. | +| Group By | Search and select dimensions to group by. Dimensions are grouped by the entity of the semantic model they come from. | +| Granularity | Modify the granularity of the primary time dimension. | +| Where | Filter your data. This includes categorical and time filters. | +| Order By | Return your data ordered. | +| Limit | Set a limit for the rows of your output. | + + +## Filtering data + +To use the filter functionality, choose the [dimension](/docs/build/dimensions) you want to filter by and select the operation you want to filter on. + - For categorical dimensions, type in the dimension value you want to filter by (no quotes needed) and press enter. + - Continue adding additional filters as needed with AND and OR. If it's a time dimension, choose the operator and select from the calendar. 
+ + + +**Limited Use Policy Disclosure** + +The dbt Semantic Layer for Sheet's use and transfer to any other app of information received from Google APIs will adhere to [Google API Services User Data Policy](https://developers.google.com/terms/api-services-user-data-policy), including the Limited Use requirements. + + diff --git a/website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md similarity index 61% rename from website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md rename to website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md index aa490511004..84e3227b4e7 100644 --- a/website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md +++ b/website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md @@ -1,45 +1,133 @@ --- -title: "Quickstart" -id: quickstart-semantic-layer -description: "Define metrics and set up the dbt Semantic Layer" -sidebar_label: "Quickstart" +title: "Get started with the dbt Semantic Layer" +id: quickstart-sl +description: "Use this guide to build and define metrics, set up the dbt Semantic Layer, and query them using the Semantic Layer APIs." +sidebar_label: "Get started with the dbt Semantic Layer" +tags: [Semantic Layer] +meta: + api_name: dbt Semantic Layer APIs --- -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. + -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new framework for defining metrics in dbt. -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. +import CreateModel from '/snippets/_sl-create-semanticmodel.md'; +import DefineMetrics from '/snippets/_sl-define-metrics.md'; +import ConfigMetric from '/snippets/_sl-configure-metricflow.md'; +import TestQuery from '/snippets/_sl-test-and-query-metrics.md'; +import ConnectQueryAPI from '/snippets/_sl-connect-and-query-api.md'; +import RunProdJob from '/snippets/_sl-run-prod-job.md'; + + +The dbt Semantic Layer, powered by [MetricFlow](/docs/build/about-metricflow), simplifies defining and using critical business metrics. It centralizes metric definitions, eliminates duplicate coding, and ensures consistent self-service access to metrics in downstream tools. + +MetricFlow, a powerful component of the dbt Semantic Layer, simplifies the creation and management of company metrics. It offers flexible abstractions, SQL query generation, and enables fast retrieval of metric datasets from a data platform. + +Use this guide to fully experience the power of the universal dbt Semantic Layer. 
Here are the steps you'll take: + +- [Create a semantic model](#create-a-semantic-model) in dbt Cloud using MetricFlow +- [Define metrics](#define-metrics) in dbt Cloud using MetricFlow +- [Test and query metrics](#test-and-query-metrics) with MetricFlow +- [Run a production job](#run-a-production-job) in dbt Cloud +- [Set up dbt Semantic Layer](#setup) in dbt Cloud +- [Connect and query API](#connect-and-query-api) with dbt Cloud + +MetricFlow allows you to define metrics in your dbt project and query them, whether in dbt Cloud or dbt Core, with [MetricFlow commands](/docs/build/metricflow-commands). + +However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. + +## Prerequisites + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +:::tip +New to dbt or metrics? Try our [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) to help you get started! ::: + +## Create a semantic model + + + +## Define metrics + + + +## Test and query metrics + + + +## Run a production job + + + + + +<details>
+ +What’s happening internally? +- Merging the code into your main branch allows dbt Cloud to pull those changes and build the definition in the manifest produced by the run.
    +- Re-running the job in the deployment environment helps materialize the models, which the metrics depend on, in the data platform. It also makes sure that the manifest is up to date.
+- The Semantic Layer APIs pull in the most recent manifest and allow your integration to extract metadata from it. +
    + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + -— **Who?** The dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise) during Public Preview. Review [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more info on plan availability. +## Connect and query API -- Team and Enterprise accounts will be able to set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated -partner tool to import metric definition. -- Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse dbt metrics in external tools, which requires access to the Discovery API. + -— **What?** Public Previews provide early access to new features. The Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality that is not backwards compatible. dbt Labs provides support, and relevant service level objectives (SLOs) apply. We will introduce pricing for the dbt Semantic Layer alongside the General Available (GA) release (future GA date to be announced). + +## FAQs + +If you're encountering some issues when defining your metrics or setting up the dbt Semantic Layer, check out a list of answers to some of the questions or problems you may be experiencing. -— **When?** Public Preview will end once the dbt Semantic Layer is available for GA. After GA, the dbt Semantic Layer will only be available to dbt Cloud **Team** and **Enterprise** plans. +
    + How do I migrate from the legacy Semantic Layer to the new one? +
    +
    If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
    +
    +
    +
    +How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
    +
    + Is the dbt Semantic Layer open source? + The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, such as dbt-core and MetricFlow.

    dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE, dbt Cloud CLI, or dbt Core CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.

    Refer to Billing for more information. +
+ + +## Next steps +- [Set up dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) +- Demo on [how to define and query metrics with MetricFlow](https://www.loom.com/share/60a76f6034b0441788d73638808e92ac?sid=861a94ac-25eb-4fd8-a310-58e159950f5a) +- [Billing](/docs/cloud/billing) + +</VersionBlock>
    -— **Where?** Public Preview is enabled at the account level so you don’t need to worry about enabling it per user. + +import DeprecationNotice from '/snippets/_sl-deprecation-notice.md'; -## Introduction + To try out the features of the dbt Semantic Layer, you first need to have a dbt project set up. This quickstart guide will lay out the following steps, and recommends a workflow that demonstrates some of its essential features: -- Install dbt metrics package +- Install dbt metrics package + * Note: this package will be deprecated very soon and we highly recommend you to use the new [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl?version=1.6), available in dbt v 1.6 or higher. - Define metrics - Query, and run metrics - Configure the dbt Semantic Layer ## Prerequisites + To use the dbt Semantic Layer, you’ll need to meet the following: @@ -49,13 +137,13 @@ To use the dbt Semantic Layer, you’ll need to meet the following: :::info 📌 -New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. +New to dbt or metrics? Check out our [quickstart guide](/guides) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. ::: ## Installing dbt metrics package -The dbt Semantic Layer supports the calculation of metrics by using the [dbt metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/). You can install the dbt metrics package in your dbt project by copying the below code blocks. +The dbt Semantic Layer supports the calculation of metrics by using the [dbt metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/). You can install the dbt metrics package in your dbt project by copying the below code blocks. @@ -77,16 +165,6 @@ packages: - - -```yml -packages: - - package: dbt-labs/metrics - version: [">=0.2.0", "<0.3.0"] -``` - - - 1. Paste the dbt metrics package code in your `packages.yml` file. 2. Run the [`dbt deps` command](/reference/commands/deps) to install the package. @@ -101,11 +179,6 @@ Review our helpful metrics video below, which explains what metrics are, why the -### Design metrics - -To read about best practices on structuring and organizing your metrics, review our [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog post first. - -### Define metrics Now that you've organized your metrics folder and files, you can define your metrics in `.yml` files nested under a `metrics` key. 1. Add the metric definitions found in the [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example to your dbt project. For example, to add an expenses metric, reference the following metrics you can define directly in your metrics folder: @@ -176,9 +249,7 @@ metrics: 2. Commit and merge the code changes that contain the metric definitions. 3. If you'd like to further design and define your own metrics, review the following documentation: - - [dbt metrics](/docs/build/metrics) will povide you in-depth detail on attributes, properties, filters, and how to define and query metrics. 
- - - Review [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog to understand best practices for designing and structuring metrics in your dbt project. + - [dbt metrics](/docs/build/metrics) will provide you in-depth detail on attributes, properties, filters, and how to define and query metrics. ## Develop and query metrics @@ -226,7 +297,7 @@ If you're encountering some issues when defining your metrics or setting up the
    Is the dbt Semantic Layer open source?
    -
    Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    +
Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL-licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise).



      @@ -295,7 +366,7 @@ The reason you're experiencing this error is because we changed the type diff --git a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md deleted file mode 100644 index 9d0c1eee752..00000000000 --- a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "Set up the dbt Semantic Layer" -id: setup-dbt-semantic-layer -description: "You can set up the dbt Semantic Layer in dbt Cloud." -sidebar_label: "Set up the dbt Semantic Layer" ---- - -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. - -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. - -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. -::: - -With the dbt Semantic Layer, you'll be able to centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. - -## Prerequisites - -Before you set up the dbt Semantic Layer, make sure you meet the following: - - - - - - - -## Set up dbt Semantic Layer - - - -
      - - -## Related docs - -- [Integrated partner tools](https://www.getdbt.com/product/semantic-layer-integrations) for info on the different integration partners and their documentation -- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) page for more information on plan availability -- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, properties, filters, and how to define and query metrics -- [dbt Server repo](https://github.com/dbt-labs/dbt-server), which is a persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations diff --git a/website/docs/docs/use-dbt-semantic-layer/setup-sl.md b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md new file mode 100644 index 00000000000..33f1f43f614 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md @@ -0,0 +1,99 @@ +--- +title: "Set up the dbt Semantic Layer" +id: setup-sl +description: "Seamlessly set up the dbt Semantic Layer in dbt Cloud using intuitive navigation." +sidebar_label: "Set up your Semantic Layer" +tags: [Semantic Layer] +--- + + + + +With the dbt Semantic Layer, you can centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. + +## Prerequisites + + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + + + + + + + +import DeprecationNotice from '/snippets/_sl-deprecation-notice.md'; + + + +With the dbt Semantic Layer, you can define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. + +## Prerequisites + + + + +## Set up dbt Semantic Layer + +:::tip +If you're using the legacy Semantic Layer, dbt Labs strongly recommends that you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the latest dbt Semantic Layer. Refer to the dedicated [migration guide](/guides/sl-migration) for more info. + +::: + + * Team and Enterprise accounts can set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated partner tool to import metric definitions. + * Developer accounts can query the Proxy Server using SQL but won't be able to browse dbt metrics in external tools, which requires access to the Discovery API. + + +1. Log in to your dbt Cloud account. +2. Go to **Account Settings**, and then **Service Tokens** to create a new [service account API token](/docs/dbt-cloud-apis/service-tokens). Save your token somewhere safe. +3. Assign permissions to service account tokens depending on the integration tool you choose. Refer to the [integration partner documentation](https://www.getdbt.com/product/semantic-layer-integrations) to determine the permission sets you need to assign. +4. Go to **Deploy** > **Environments**, and select your **Deployment** environment. +5. Click **Settings** on the top right side of the page. +6. Click **Edit** on the top right side of the page. +7. Select dbt version 1.2 or higher. +8. Toggle the Semantic Layer **On**. +9. Copy the full proxy server URL (like `https://eagle-hqya7.proxy.cloud.getdbt.com`) to connect to your [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations). +10. 
Use the URL in the data source configuration of the integrated partner tool. +11. Use the data platform login credentials that make sense for how the data is consumed. + +:::info📌 + +It is _not_ recommended that you use your dbt Cloud credentials due to elevated permissions. Instead, you can use your specific integration tool permissions. + +::: + +12. Set up the [Discovery API](/docs/dbt-cloud-apis/discovery-api) (Team and Enterprise accounts only) in the integrated partner tool to import the metric definitions. The [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations) will treat the dbt Server as another data source (like a data platform). This requires: + +- The account ID, environment ID, and job ID (which is visible in the job URL) +- An [API service token](/docs/dbt-cloud-apis/service-tokens) with job admin and metadata permissions +- Add the items above to the relevant fields in your integration tool + + +
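As a sanity check after completing these steps, you can point a SQL client at the proxy server URL (with the same credentials) and run a dbt-SQL query against one of your metrics. This is a minimal sketch only, assuming the `metrics.calculate` macro from the dbt_metrics package; `revenue` and `customer_status` are hypothetical metric and dimension names, and the exact macro signature depends on your dbt_metrics package version.

```sql
-- dbt-SQL sent to the proxy server; the proxy compiles the Jinja into pure SQL
-- and runs it against your data platform. `revenue` and `customer_status` are
-- placeholder names for illustration.
select *
from {{ metrics.calculate(
    metric('revenue'),
    grain='month',
    dimensions=['customer_status']
) }}
```

If the query returns aggregated rows, the proxy server, your credentials, and the metric definitions are all wired up correctly.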
      + +
      + +## Related docs + +- [Build your metrics](/docs/build/build-metrics-intro) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) +- [Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) +- [Migrate your legacy Semantic Layer](/guides/sl-migration) +- [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) diff --git a/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md new file mode 100644 index 00000000000..75a853fcbe8 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md @@ -0,0 +1,82 @@ +--- +title: "dbt Semantic Layer architecture" +id: sl-architecture +description: "dbt Semantic Layer product architecture and related questions." +sidebar_label: "Architecture" +tags: [Semantic Layer] +pagination_next: null +--- + + + + +The dbt Semantic Layer allows you to define metrics and use various interfaces to query them. The Semantic Layer does the heavy lifting to find where the queried data exists in your data platform and generates the SQL to make the request (including performing joins). + + + +## dbt Semantic Layer components + +The dbt Semantic Layer includes the following components: + + +| Components | Information | dbt Core users | Developer plans | Team plans | Enterprise plans | License | +| --- | --- | :---: | :---: | :---: | --- | +| **[MetricFlow](/docs/build/about-metricflow)** | MetricFlow in dbt allows users to centrally define their semantic models and metrics with YAML specifications. | ✅ | ✅ | ✅ | ✅ | BSL package (code is source available) | +| **MetricFlow Server**| A proprietary server that takes metric requests and generates optimized SQL for the specific data platform. | ❌ | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)| +| **Semantic Layer Gateway** | A service that passes queries to the MetricFlow server and executes the SQL generated by MetricFlow against the data platform|

      ❌ | ❌ |✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | +| **Semantic Layer APIs** | The interfaces allow users to submit metric queries using GraphQL and JDBC APIs. They also serve as the foundation for building first-class integrations with various tools. | ❌ | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)| + + +## Related questions + +
      + How do I migrate from the legacy Semantic Layer to the new one? +
      +
      If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
      +
      +
      + +
      +How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
      +
      + Is the dbt Semantic Layer open source? +The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, such as dbt-core and MetricFlow.

      dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE, dbt Cloud CLI, or dbt Core CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.

      Refer to Billing for more information. +
      +
      + Is there a dbt Semantic Layer discussion hub? +
      +
Yes, absolutely! Join the dbt Slack community and the #dbt-cloud-semantic-layer Slack channel for all things related to the dbt Semantic Layer. 
      +
      +
      + +
      + + + +import DeprecationNotice from '/snippets/_sl-deprecation-notice.md'; + + + +## Product architecture + +The dbt Semantic Layer product architecture includes four primary components: + +| Components | Information | Developer plans | Team plans | Enterprise plans | License | +| --- | --- | :---: | :---: | :---: | --- | +| **[dbt project](/docs/build/metrics)** | Define models and metrics in dbt Core.
      *Note, we will deprecate and no longer support the dbt_metrics package. | ✅ | ✅ | ✅ | Open source, Core | +| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | +| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅

      _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | +| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | + + + +dbt Semantic Layer integrations will: + +- Leverage the Discovery API to fetch a list of objects and their attributes, like metrics +- Generate a dbt-SQL statement +- Then query the SQL proxy to evaluate the results of this statement + +
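For illustration, the dbt-SQL statement an integration generates doesn't have to be a metrics query; because the proxy accepts SQL + Jinja, it can also reference models and macros directly. This is a hedged sketch only: `fct_orders`, `order_date`, and `status` are hypothetical names, and the proxy resolves `ref()` and other Jinja before executing pure SQL on the data platform.

```sql
-- A dbt-SQL statement an integration (or a SQL client) might send to the SQL proxy.
-- `fct_orders`, `order_date`, and `status` are placeholder names for illustration.
select
    date_trunc('month', order_date) as order_month,
    count(*) as order_count
from {{ ref('fct_orders') }}
where status = 'completed'
group by 1
```

The query that reaches the warehouse contains only the resolved relation name, which is how the proxy keeps the dbt modeling layer and the integration's SQL in sync.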
diff --git a/website/docs/docs/use-dbt-semantic-layer/tableau.md b/website/docs/docs/use-dbt-semantic-layer/tableau.md new file mode 100644 index 00000000000..1d283023dda --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/tableau.md @@ -0,0 +1,82 @@ +--- +title: "Tableau (beta)" +description: "Use Tableau worksheets to query the dbt Semantic Layer and produce dashboards with trusted data." +tags: [Semantic Layer] +sidebar_label: "Tableau (beta)" +--- + +:::info Beta functionality +The Tableau integration with the dbt Semantic Layer is a [beta feature](/docs/dbt-versions/product-lifecycles#dbt-cloud). +::: + + +The Tableau integration allows you to use worksheets to query the Semantic Layer directly and produce your dashboards with trusted data. + +This integration provides a live connection to the dbt Semantic Layer through Tableau Desktop or Tableau Server. + +## Prerequisites + +- You have [configured the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) and are using dbt v1.6 or higher. +- You must have [Tableau Desktop](https://www.tableau.com/en-gb/products/desktop) version 2021.1 or greater, or Tableau Server. + - Note that Tableau Online does not currently support custom connectors natively. If you use Tableau Online, you will only be able to access the connector in Tableau Desktop. +- You must be able to log in to Tableau Desktop (with Online or Server credentials) or have a license for Tableau Server. +- You need your dbt Cloud host, [Environment ID](/docs/use-dbt-semantic-layer/setup-sl#set-up-dbt-semantic-layer) and [service token](/docs/dbt-cloud-apis/service-tokens) to log in. This account should be set up with the dbt Semantic Layer. +- You must have a dbt Cloud Team or Enterprise [account](https://www.getdbt.com/pricing) and multi-tenant [deployment](/docs/cloud/about-cloud/regions-ip-addresses). (Single-Tenant coming soon) + + +## Installing the Connector + +1. Download the GitHub [connector file](https://github.com/dbt-labs/semantic-layer-tableau-connector/releases/download/v1.0.2/dbt_semantic_layer.taco) locally and add it to your default folder: + +| Operating system | Tableau Desktop | Tableau Server | +| ---------------- | -------------- | -------------- | +| Windows | `C:\Users\\[Windows User]\Documents\My Tableau Repository\Connectors` | `C:\Program Files\Tableau\Connectors` | +| Mac | `/Users/[user]/Documents/My Tableau Repository/Connectors` | Not applicable | +| Linux | `/opt/tableau/connectors` | `/opt/tableau/connectors` | + +2. Install the [JDBC driver](/docs/dbt-cloud-apis/sl-jdbc) to the folder based on your operating system: + - Windows: `C:\Program Files\Tableau\Drivers` + - Mac: `~/Library/Tableau/Drivers` + - Linux: `/opt/tableau/tableau_driver/jdbc` +3. Open Tableau Desktop or Tableau Server and find the **dbt Semantic Layer by dbt Labs** connector on the left-hand side. You may need to restart these applications for the connector to be available. +4. Connect with your Host, Environment ID, and Service Token information dbt Cloud provides during [Semantic Layer configuration](/docs/use-dbt-semantic-layer/setup-sl#:~:text=After%20saving%20it%2C%20you%27ll%20be%20provided%20with%20the%20connection%20information%20that%20allows%20you%20to%20connect%20to%20downstream%20tools). + - In Tableau Server, the authentication screen may show "User" & "Password" instead, in which case the User is the Environment ID and the password is the Service Token. + + +## Using the integration + +1. 
**Authentication** — Once you authenticate, the system will direct you to the data source page with all the metrics and dimensions configured in your dbt Semantic Layer. +2. **Access worksheet** — From there, go directly to a worksheet in the bottom left-hand corner. +3. **Access metrics and dimensions** — Then, you'll find all the metrics and dimensions that are available to query on the left side of your window. + +Visit the [Tableau documentation](https://help.tableau.com/current/pro/desktop/en-us/gettingstarted_overview.htm) to learn more about how to use Tableau worksheets and dashboards. + +### Publish from Tableau Desktop to Tableau Server + +- **From Desktop to Server** — Like any Tableau workflow, you can publish your workbook from Tableau Desktop to Tableau Server. For step-by-step instructions, visit Tableau's [publishing guide](https://help.tableau.com/current/pro/desktop/en-us/publish_workbooks_share.htm). + + +## Things to note + +- All metrics use the "SUM" aggregation type, and this can't be altered. The dbt Semantic Layer controls the aggregation type and it is intentionally fixed. Keep in mind that the underlying aggregation in the dbt Semantic Layer might not be "SUM" (even though "SUM" is Tableau's default). +- Tableau surfaces all metrics and dimensions from the dbt Semantic Layer on the left-hand side. Note, that not all metrics and dimensions can be combined with one another. You will receive an error message if a particular dimension cannot be sliced with a metric (or vice versa). + - To display available metrics and dimensions, dbt Semantic Layer returns metadata for a fake table with the dimensions and metrics as 'columns' on this table. Because of this, you can't actually query this table for previews or extracts. + - Since this is treated as a table, dbt Semantic Layer can't dynamically change what is available. This means we display _all_ available metrics and dimensions even if a particular metric and dimension combination isn't available. + +- Certain Table calculations like "Totals" and "Percent Of" may not be accurate when using metrics aggregated in a non-additive way (such as count distinct) +- In any of our Semantic Layer interfaces (not only Tableau), you must include a [time dimension](/docs/build/cumulative#limitations) when working with any cumulative metric that has a time window or granularity. + +## Unsupported functionality + +The following Tableau features aren't supported at this time, however, the dbt Semantic Layer may support some of this functionality in a future release: + +- Updating the data source page +- Using "Extract" mode to view your data +- Unioning Tables +- Writing Custom SQL / Initial SQL +- Table Extensions +- Cross-Database Joins +- All functions in Analysis --> Create Calculated Field +- Filtering on a Date Part time dimension for a Cumulative metric type +- Changing your date dimension to use "Week Number" + diff --git a/website/docs/docs/verified-adapters.md b/website/docs/docs/verified-adapters.md index 9604d05391c..75c7529c247 100644 --- a/website/docs/docs/verified-adapters.md +++ b/website/docs/docs/verified-adapters.md @@ -1,30 +1,24 @@ --- title: "Verified adapters" id: "verified-adapters" +hide_table_of_contents: true --- -The dbt Labs has a rigorous verified adapter program which provides reassurance to users about which adapters can be trusted to use in production, has been tested, and is actively maintained and updated. The process covers aspects of development, documentation, user experience, and maintenance. 
+The dbt Labs has a rigorous verified adapter program that provides reassurance to users about which adapters can be trusted to use in production, has been tested, and is actively maintained and updated. The process covers development, documentation, user experience, and maintenance aspects. These adapters then earn a "Verified" status so that users can have a certain level of trust and expectation when they use them. The adapters also have maintainers and we recommend using the adapter's verification status to determine its quality and health. -Here's the list of the verified data platforms that can connect to dbt and its latest version. +The verification process serves as the on-ramp to integration with dbt Cloud. As such, we restrict applicants to data platform vendors with whom we are already engaged. -| dbt Cloud setup | CLI installation | latest verified version | -| ---------------- | ----------------------------------------- | ------------------------ | -| [Setup AlloyDB](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install AlloyDB](/docs/core/connect-data-platform/alloydb-setup) | (same as `dbt-postgres`) | -| Not supported | [Install Azure Synapse](/docs/core/connect-data-platform/azuresynapse-setup) | 1.3 :construction: | -| [Set up BigQuery](/docs/cloud/connect-data-platform/connect-bigquery) | [Install BigQuery](/docs/core/connect-data-platform/bigquery-setup) | 1.4 | -| [Set up Databricks ](/docs/cloud/connect-data-platform/connect-databricks)| [ Install Databricks](/docs/core/connect-data-platform/databricks-setup) | 1.4 | -| Not supported | [Install Dremio](/docs/core/connect-data-platform/dremio-setup) | 1.4 :construction: | -| [Set up Postgres](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install Postgres](/docs/core/connect-data-platform/postgres-setup) | 1.4 | -| [Set up Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install Redshift](/docs/core/connect-data-platform/redshift-setup) | 1.4 | -| [Set up Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) | [ Install Snowflake](/docs/core/connect-data-platform/snowflake-setup) | 1.4 | -| [Set up Spark](/docs/cloud/connect-data-platform/connect-apache-spark) | [Install Spark](/docs/core/connect-data-platform/spark-setup) | 1.4 | -| [Set up Starburst & Trino](/docs/cloud/connect-data-platform/connect-starburst-trino)| [Installl Starburst & Trino](/docs/core/connect-data-platform/trino-setup) | 1.4 | +To learn more, refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide. -:construction:: Verification in progress +import MSCallout from '/snippets/_microsoft-adapters-soon.md'; -To learn more, see [Verifying a new adapter](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter). + +Here are the verified data platforms that connect to dbt and its latest version. +import AdaptersVerified from '/snippets/_adapters-verified.md'; + + diff --git a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md index 76d03870478..f8daf393f9b 100644 --- a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md +++ b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md @@ -38,7 +38,7 @@ To unlock your account and select a plan, review the following guidance per plan 2. To unlock your account and continue using the Team plan, you need to enter your payment details. 3. Go to **Payment Information** and click **Edit** on the right. 
4. Enter your payment details and click **Save**. -5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉 +5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉 @@ -59,7 +59,7 @@ For commonly asked billings questions, refer to the dbt Cloud [pricing page](htt
      How does billing work?
      -
      Team plans are billed monthly on the credit card used to sign up, based on developer seat count. You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings -> Billing page.



      +
      Team plans are billed monthly on the credit card used to sign up, based on [developer seat count and usage](/docs/cloud/billing). You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings > Billing page.



      Enterprise plan customers are billed annually based on the number of developer seats, as well as any additional services + features in your chosen plan.
      @@ -75,7 +75,7 @@ For commonly asked billings questions, refer to the dbt Cloud [pricing page](htt
      Can I pay by invoice?
      -
      At present, dbt Cloud Team plan payments must be made via credit card, and by default they will be billed monthly based on the number of developer seats.



      +
      Currently, dbt Cloud Team plan payments must be made with a credit card, and by default they will be billed monthly based on the number of [developer seats and usage](/docs/cloud/billing).



      We don’t have any plans to do invoicing for Team plan accounts in the near future, but we do currently support invoices for companies on the dbt Cloud Enterprise plan. Feel free to contact us to build your Enterprise pricing plan.
      diff --git a/website/docs/faqs/Accounts/payment-accepted.md b/website/docs/faqs/Accounts/payment-accepted.md index 2e26063c684..c0e949833a2 100644 --- a/website/docs/faqs/Accounts/payment-accepted.md +++ b/website/docs/faqs/Accounts/payment-accepted.md @@ -5,6 +5,6 @@ sidebar_label: 'Can I pay invoice' id: payment-accepted --- -Presently for Team plans, self-service dbt Cloud payments must be made via credit card and by default, they will be billed monthly based on the number of active developer seats. +Currently for Team plans, self-service dbt Cloud payments must be made with a credit card and by default, they will be billed monthly based on the number of [active developer seats and usage](/docs/cloud/billing). We don't have any plans to do invoicing for self-service teams in the near future, but we *do* currently support invoices for companies on the **dbt Cloud Enterprise plan.** Feel free to [contact us](https://www.getdbt.com/contact) to build your Enterprise pricing. diff --git a/website/docs/faqs/Accounts/slack.md b/website/docs/faqs/Accounts/slack.md deleted file mode 100644 index 4faa60fb09a..00000000000 --- a/website/docs/faqs/Accounts/slack.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: How do I set up Slack notifications? -description: "Instructions on how to set up slack notifications" -sidebar_label: 'How to set up Slack' -id: slack ---- - - diff --git a/website/docs/faqs/Core/install-python-compatibility.md b/website/docs/faqs/Core/install-python-compatibility.md index d24466f4990..5c536101f0c 100644 --- a/website/docs/faqs/Core/install-python-compatibility.md +++ b/website/docs/faqs/Core/install-python-compatibility.md @@ -17,18 +17,12 @@ The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3. - + The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.9, and 3.10 - - -As of v1.0, `dbt-core` is compatible with Python versions 3.7, 3.8, and 3.9. - - - Adapter plugins and their dependencies are not always compatible with the latest version of Python. For example, dbt-snowflake v0.19 is not compatible with Python 3.9, but dbt-snowflake versions 0.20+ are. New dbt minor versions will add support for new Python3 minor versions as soon as all dependencies can support it. In turn, dbt minor versions will drop support for old Python3 minor versions right before they reach [end of life](https://endoflife.date/python). diff --git a/website/docs/faqs/Docs/documenting-macros.md b/website/docs/faqs/Docs/documenting-macros.md index cbc12b988c6..9a2036cd6bf 100644 --- a/website/docs/faqs/Docs/documenting-macros.md +++ b/website/docs/faqs/Docs/documenting-macros.md @@ -5,8 +5,6 @@ sidebar_label: 'Document macros' id: documenting-macros --- -The `macros:` key is new in 0.16.0. - To document macros, use a [schema file](/reference/macro-properties) and nest the configurations under a `macros:` key ## Example diff --git a/website/docs/faqs/Environments/beta-release.md b/website/docs/faqs/Environments/beta-release.md deleted file mode 100644 index 5eef07d3510..00000000000 --- a/website/docs/faqs/Environments/beta-release.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: What is a beta release? -description: "How to try out beta features" -sidebar_label: 'What is a beta release?' -id: beta-release ---- -This is a chance to try out brand-new functionality. You get to start planning for use cases that the next minor version will unlock. 
We get to hear from you about unexpected behavior and nasty bugs, so that the release candidate has more polish and fewer surprises. diff --git a/website/docs/faqs/Environments/custom-branch-settings.md b/website/docs/faqs/Environments/custom-branch-settings.md index 95929d2d393..4bc4b85be02 100644 --- a/website/docs/faqs/Environments/custom-branch-settings.md +++ b/website/docs/faqs/Environments/custom-branch-settings.md @@ -1,7 +1,7 @@ --- -title: How do I use the `Custom Branch` settings in a dbt Cloud Environment? +title: How do I use the 'Custom Branch' settings in a dbt Cloud Environment? description: "Use custom code from your repository" -sidebar_label: 'Custom Branch settings' +sidebar_label: 'Custom branch settings' id: custom-branch-settings --- @@ -15,12 +15,21 @@ To specify a custom branch: ## Development -In a development environment, the default branch (commonly the `main` branch) is a read-only branch found in the IDE's connected repositories, which you can use to create development branches. Identifying a custom branch overrides this default behavior. Instead, your custom branch becomes read-only and can be used to create development branches. You will no longer be able to make commits to the custom branch from within the dbt Cloud IDE. +In a development environment, the default branch (usually named `main`) is a read-only branch in your connected repositories, which allows you to create new branches for development from it. -For example, you can use the `develop` branch of a connected repository. Edit an environment, select **Only run on a custom branch** in **General settings** , enter **develop** as the name of your custom branch. +Specifying a **Custom branch** overrides the default behavior. It makes the custom branch 'read-only' and enables you to create new development branches from it. This also means you can't edit this custom branch directly. - +Only one branch can be read-only, which means when you set up a custom branch, your `main` branch (usually read-only) becomes editable. If you want to protect the `main` branch and prevent any commits on it, you need to set up branch protection rules in your git provider settings. This ensures your `main` branch remains secure and no new commits can be made to it. + +For example, if you want to use the `develop` branch of a connected repository: + +- Go to an environment and select **Settings** to edit it +- Select **Only run on a custom branch** in **General settings** +- Enter **develop** as the name of your custom branch +- Click **Save** + + ## Deployment -When running jobs in a deployment environment, dbt will clone your project from your connected repository before executing your models. By default, dbt uses the default branch of your repository (commonly the `main` branch). To specify a different version of your project for dbt to execute during job runs in a particular environment, you can edit the Custom Branch setting as shown in the previous steps. \ No newline at end of file +When running jobs in a deployment environment, dbt will clone your project from your connected repository before executing your models. By default, dbt uses the default branch of your repository (commonly the `main` branch). To specify a different version of your project for dbt to execute during job runs in a particular environment, you can edit the Custom Branch setting as shown in the previous steps. 
diff --git a/website/docs/faqs/Environments/delete-environment-job.md b/website/docs/faqs/Environments/delete-environment-job.md index b649769f070..eb9ac511a7c 100644 --- a/website/docs/faqs/Environments/delete-environment-job.md +++ b/website/docs/faqs/Environments/delete-environment-job.md @@ -8,16 +8,7 @@ id: delete-environment-job To delete an environment or job in dbt Cloud, you must have a `developer` [license](/docs/cloud/manage-access/seats-and-users) and have the necessary [access permissions](/docs/cloud/manage-access/about-user-access). -:::info 📌 Delete a job first before deleting environment - -Deleting an environment doesn't automatically delete its associated job(s). If you delete an environment first without deleting the job, you won't be able to delete the job since it's without an environment. - -To completely delete your environment, you _must_: -1. First delete all jobs associated with that environment, -2. Then, delete the environment. -::: - -**Delete a job** +## Delete a job To delete a job or multiple jobs in dbt Cloud: @@ -33,11 +24,11 @@ To delete a job or multiple jobs in dbt Cloud: 5. Confirm your action in the **Confirm Delete** pop-up by clicking **Confirm Delete** in the bottom right to delete the job immediately. This action cannot be undone. However, you can create a new job with the same information if the deletion was made in error. -Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each individual job. +Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each job. -**Delete an environment** +## Delete an environment -To delete an environment in dbt Cloud: +Deleting an environment automatically deletes its associated job(s). If you want to keep those jobs, move them to a different environment first. To delete an environment in dbt Cloud: 1. Click **Deploy** on the navigation header and then click **Environments** 2. Select the Environment you want to delete. @@ -54,4 +45,4 @@ To delete an environment in dbt Cloud: Refresh your page, and the deleted environment should now be gone. If you want to delete multiple environments, you'll need to perform these steps to delete each one. -If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help. \ No newline at end of file +If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help. diff --git a/website/docs/faqs/Git/git-migration.md b/website/docs/faqs/Git/git-migration.md new file mode 100644 index 00000000000..775ae3679e3 --- /dev/null +++ b/website/docs/faqs/Git/git-migration.md @@ -0,0 +1,26 @@ +--- +title: "How to migrate git providers" +sidebar_label: "How to migrate git providers" +id: "git-migration" +hide_table_of_contents: true +description: "Learn how to migrate git providers in dbt Cloud with minimal disruption." +tags: [Git] +--- + +To migrate from one git provider to another, refer to the following steps to avoid minimal disruption: + +1. Outside of dbt Cloud, you'll need to import your existing repository into your new provider. + + As an example, if you're migrating from GitHub to Azure DevOps, you'll need to import your existing repository (GitHub) into your new git provider (Azure DevOps). 
For detailed steps on how to do this, refer to your git provider's documentation (Such as [GitHub](https://docs.github.com/en/migrations/importing-source-code/using-github-importer/importing-a-repository-with-github-importer), [GitLab](https://docs.gitlab.com/ee/user/project/import/repo_by_url.html), [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/repos/git/import-git-repository?view=azure-devops)) + +2. Go back to dbt Cloud and set up your [integration for the new git provider](/docs/cloud/git/connect-github), if needed. +3. Disconnect the old repository in dbt Cloud by going to **Account Settings** and then **Projects**. Click on the **Repository** link, then click **Edit** and **Disconnect**. + + + +4. On the same page, connect to the new git provider repository by clicking **Configure Repository** + - If you're using the native integration, you may need to OAuth to it. + +5. That's it, you should now be connected to the new git provider! 🎉 + +Note — As a tip, we recommend you refresh your page and dbt Cloud IDE before performing any actions. diff --git a/website/docs/faqs/Jinja/jinja-whitespace.md b/website/docs/faqs/Jinja/jinja-whitespace.md index 49ced7183b7..5e1ec3dc7ac 100644 --- a/website/docs/faqs/Jinja/jinja-whitespace.md +++ b/website/docs/faqs/Jinja/jinja-whitespace.md @@ -7,6 +7,6 @@ id: jinja-whitespace This is known as "whitespace control". -Use a minus sign (`-`, e.g. `{{- ... -}}`, `{%- ... %}`, `{#- ... -#}`) at the start or end of a block to strip whitespace before or after the block (more docs [here](https://jinja.palletsprojects.com/page/templates/#whitespace-control)). Check out the [tutorial on using Jinja](/guides/advanced/using-jinja#use-whitespace-control-to-tidy-up-compiled-code) for an example. +Use a minus sign (`-`, e.g. `{{- ... -}}`, `{%- ... %}`, `{#- ... -#}`) at the start or end of a block to strip whitespace before or after the block (more docs [here](https://jinja.palletsprojects.com/page/templates/#whitespace-control)). Check out the [tutorial on using Jinja](/guides/using-jinja#use-whitespace-control-to-tidy-up-compiled-code) for an example. Take caution: it's easy to fall down a rabbit hole when it comes to whitespace control! diff --git a/website/docs/faqs/Models/available-materializations.md b/website/docs/faqs/Models/available-materializations.md index 25ba745a2b2..bf11c92b595 100644 --- a/website/docs/faqs/Models/available-materializations.md +++ b/website/docs/faqs/Models/available-materializations.md @@ -5,6 +5,7 @@ sidebar_label: 'Materializations available' id: available-materializations --- -dbt ships with four materializations: `view`, `table`, `incremental` and `ephemeral`. Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options. +dbt ships with five materializations: `view`, `table`, `incremental`, `ephemeral` and `materialized_view`. +Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options. -You can also create your own [custom materializations](/guides/advanced/creating-new-materializations), if required however this is an advanced feature of dbt. +You can also create your own [custom materializations](/guides/create-new-materializations), if required however this is an advanced feature of dbt. 
diff --git a/website/docs/faqs/Models/configurable-model-path.md b/website/docs/faqs/Models/configurable-model-path.md index 6e8861a0693..c34112a5fe1 100644 --- a/website/docs/faqs/Models/configurable-model-path.md +++ b/website/docs/faqs/Models/configurable-model-path.md @@ -6,12 +6,6 @@ id: configurable-model-path --- - - -- **v1.0.0:** The config 'source-path' has been deprecated in favor of [`model-paths`](/reference/project-configs/model-paths). - - - By default, dbt expects the files defining your models to be located in the `models` subdirectory of your project. To change this, update the [model-paths](reference/project-configs/model-paths.md) configuration in your `dbt_project.yml` diff --git a/website/docs/faqs/Models/create-dependencies.md b/website/docs/faqs/Models/create-dependencies.md index 6a01aa18dca..e902d93b018 100644 --- a/website/docs/faqs/Models/create-dependencies.md +++ b/website/docs/faqs/Models/create-dependencies.md @@ -44,4 +44,4 @@ Found 2 models, 28 tests, 0 snapshots, 0 analyses, 130 macros, 0 operations, 0 s Done. PASS=2 WARN=0 ERROR=0 SKIP=0 TOTAL=2 ``` -To learn more about building a dbt project, we recommend you complete the [quickstart guide](/quickstarts). +To learn more about building a dbt project, we recommend you complete the [quickstart guide](/guides). diff --git a/website/docs/faqs/Models/reference-models-in-another-project.md b/website/docs/faqs/Models/reference-models-in-another-project.md deleted file mode 100644 index 19f3f52da31..00000000000 --- a/website/docs/faqs/Models/reference-models-in-another-project.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: How can I reference models or macros in another project? -description: "Use packages to add another project to your dbt project" -sidebar_label: 'Reference models or macros in another project' -id: reference-models-in-another-project - ---- - -You can use [packages](/docs/build/packages) to add another project to your dbt -project, including other projects you've created. Check out the [docs](/docs/build/packages) -for more information! diff --git a/website/docs/faqs/Project/docs-for-multiple-projects.md b/website/docs/faqs/Project/docs-for-multiple-projects.md deleted file mode 100644 index b7aa1452b39..00000000000 --- a/website/docs/faqs/Project/docs-for-multiple-projects.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Can I render docs for multiple projects? -description: "Using packages to render docs for multiple projects" -sidebar_label: 'Render docs for multiple projects' -id: docs-for-multiple-projects - ---- - -Yes! To do this, you'll need to create a "super project" that lists each project as a dependent [package](/docs/build/packages) in a `packages.yml` file. Then run `dbt deps` to install the projects as packages, prior to running `dbt docs generate`. - -If you are going down the route of multiple projects, be sure to check out our advice [1](https://discourse.getdbt.com/t/should-i-have-an-organisation-wide-project-a-monorepo-or-should-each-work-flow-have-their-own/666) [2](https://discourse.getdbt.com/t/how-to-configure-your-dbt-repository-one-or-many/2121) on the topic. diff --git a/website/docs/faqs/Project/example-projects.md b/website/docs/faqs/Project/example-projects.md index f59d6e56e78..cd58c8832e2 100644 --- a/website/docs/faqs/Project/example-projects.md +++ b/website/docs/faqs/Project/example-projects.md @@ -8,7 +8,7 @@ id: example-projects Yes! 
-* **Quickstart Tutorial:** You can build your own example dbt project in the [quickstart guide](/quickstarts) +* **Quickstart Tutorial:** You can build your own example dbt project in the [quickstart guide](/guides) * **Jaffle Shop:** A demonstration project (closely related to the tutorial) for a fictional ecommerce store ([source code](https://github.com/dbt-labs/jaffle_shop)) * **MRR Playbook:** A demonstration project that models subscription revenue ([source code](https://github.com/dbt-labs/mrr-playbook), [docs](https://www.getdbt.com/mrr-playbook/#!/overview)) * **Attribution Playbook:** A demonstration project that models marketing attribution ([source code](https://github.com/dbt-labs/attribution-playbook), [docs](https://www.getdbt.com/attribution-playbook/#!/overview)) diff --git a/website/docs/faqs/Project/multiple-resource-yml-files.md b/website/docs/faqs/Project/multiple-resource-yml-files.md index 422b7beb702..04e1702a162 100644 --- a/website/docs/faqs/Project/multiple-resource-yml-files.md +++ b/website/docs/faqs/Project/multiple-resource-yml-files.md @@ -9,4 +9,4 @@ It's up to you: - Some folks find it useful to have one file per model (or source / snapshot / seed etc) - Some find it useful to have one per directory, documenting and testing multiple models in one file -Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). +Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](/best-practices/how-we-structure/1-guide-overview). diff --git a/website/docs/faqs/Project/resource-yml-name.md b/website/docs/faqs/Project/resource-yml-name.md index 8a6ebe96134..c26cff26474 100644 --- a/website/docs/faqs/Project/resource-yml-name.md +++ b/website/docs/faqs/Project/resource-yml-name.md @@ -10,4 +10,4 @@ It's up to you! Here's a few options: - Use the same name as your directory (assuming you're using sensible names for your directories) - If you test and document one model (or seed, snapshot, macro etc.) per file, you can give it the same name as the model (or seed, snapshot, macro etc.) -Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). +Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](/best-practices/how-we-structure/1-guide-overview). diff --git a/website/docs/faqs/Project/structure-a-project.md b/website/docs/faqs/Project/structure-a-project.md index 5d73f9f25ba..a9ef53f5c8f 100644 --- a/website/docs/faqs/Project/structure-a-project.md +++ b/website/docs/faqs/Project/structure-a-project.md @@ -8,4 +8,4 @@ id: structure-a-project There's no one best way to structure a project! Every organization is unique. -If you're just getting started, check out how we (dbt Labs) [structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). +If you're just getting started, check out how we (dbt Labs) [structure our dbt projects](/best-practices/how-we-structure/1-guide-overview). 
diff --git a/website/docs/faqs/Project/which-schema.md b/website/docs/faqs/Project/which-schema.md index f0634ac8c85..2c21cba3c6a 100644 --- a/website/docs/faqs/Project/which-schema.md +++ b/website/docs/faqs/Project/which-schema.md @@ -7,7 +7,7 @@ id: which-schema --- By default, dbt builds models in your target schema. To change your target schema: * If you're developing in **dbt Cloud**, these are set for each user when you first use a development environment. -* If you're developing with the **dbt CLI**, this is the `schema:` parameter in your `profiles.yml` file. +* If you're developing with **dbt Core**, this is the `schema:` parameter in your `profiles.yml` file. If you wish to split your models across multiple schemas, check out the docs on [using custom schemas](/docs/build/custom-schemas). diff --git a/website/docs/faqs/Project/why-not-write-dml.md b/website/docs/faqs/Project/why-not-write-dml.md index fd2cea7d3ad..210ef4a916d 100644 --- a/website/docs/faqs/Project/why-not-write-dml.md +++ b/website/docs/faqs/Project/why-not-write-dml.md @@ -30,4 +30,4 @@ You can test your models, generate documentation, create snapshots, and more! SQL dialects tend to diverge the most in DML and DDL (rather than in `select` statements) — check out the example [here](/faqs/models/sql-dialect). By writing less SQL, it can make a migration to a new database technology easier. -If you do need to write custom DML, there are ways to do this in dbt using [custom materializations](/guides/advanced/creating-new-materializations). +If you do need to write custom DML, there are ways to do this in dbt using [custom materializations](/guides/create-new-materializations). diff --git a/website/docs/faqs/Runs/checking-logs.md b/website/docs/faqs/Runs/checking-logs.md index dbfdb6806a1..ff5e6f5cf04 100644 --- a/website/docs/faqs/Runs/checking-logs.md +++ b/website/docs/faqs/Runs/checking-logs.md @@ -10,7 +10,7 @@ To check out the SQL that dbt is running, you can look in: * dbt Cloud: * Within the run output, click on a model name, and then select "Details" -* dbt CLI: +* dbt Core: * The `target/compiled/` directory for compiled `select` statements * The `target/run/` directory for compiled `create` statements * The `logs/dbt.log` file for verbose logging. diff --git a/website/docs/faqs/Runs/failed-tests.md b/website/docs/faqs/Runs/failed-tests.md index bfee565ef61..d19023d035d 100644 --- a/website/docs/faqs/Runs/failed-tests.md +++ b/website/docs/faqs/Runs/failed-tests.md @@ -10,7 +10,7 @@ To debug a failing test, find the SQL that dbt ran by: * dbt Cloud: * Within the test output, click on the failed test, and then select "Details" -* dbt CLI: +* dbt Core: * Open the file path returned as part of the error message. * Navigate to the `target/compiled/schema_tests` directory for all compiled test queries diff --git a/website/docs/faqs/Tests/configurable-data-path.md b/website/docs/faqs/Tests/configurable-data-path.md index 7c4e92f7226..7663d2d3f11 100644 --- a/website/docs/faqs/Tests/configurable-data-path.md +++ b/website/docs/faqs/Tests/configurable-data-path.md @@ -6,12 +6,6 @@ id: configurable-data-path --- - - -- **v1.0.0:** The config 'data-paths' has been deprecated in favor of [`seed-paths`](/reference/project-configs/seed-paths). - - - By default, dbt expects your seed files to be located in the `seeds` subdirectory of your project. 
diff --git a/website/docs/faqs/Tests/custom-test-thresholds.md b/website/docs/faqs/Tests/custom-test-thresholds.md index 7155b39d25e..34d2eec7494 100644 --- a/website/docs/faqs/Tests/custom-test-thresholds.md +++ b/website/docs/faqs/Tests/custom-test-thresholds.md @@ -11,4 +11,4 @@ As of `v0.20.0`, you can use the `error_if` and `warn_if` configs to set custom For dbt `v0.19.0` and earlier, you could try these possible solutions: * Setting the [severity](/reference/resource-properties/tests#severity) to `warn`, or: -* Writing a [custom generic test](/guides/best-practices/writing-custom-generic-tests) that accepts a threshold argument ([example](https://discourse.getdbt.com/t/creating-an-error-threshold-for-schema-tests/966)) +* Writing a [custom generic test](/best-practices/writing-custom-generic-tests) that accepts a threshold argument ([example](https://discourse.getdbt.com/t/creating-an-error-threshold-for-schema-tests/966)) diff --git a/website/docs/faqs/Tests/testing-seeds.md b/website/docs/faqs/Tests/testing-seeds.md index 93afcab2fa4..3b1b3e0df56 100644 --- a/website/docs/faqs/Tests/testing-seeds.md +++ b/website/docs/faqs/Tests/testing-seeds.md @@ -6,8 +6,6 @@ id: testing-seeds --- -The `seeds:` key is new in 0.16.0. Prior to this, use a `models:` key instead. - To test and document seeds, use a [schema file](/reference/configs-and-properties) and nest the configurations under a `seeds:` key ## Example diff --git a/website/docs/faqs/Warehouse/bq-oauth-drive-scope.md b/website/docs/faqs/Warehouse/bq-oauth-drive-scope.md new file mode 100644 index 00000000000..ae6da82c47a --- /dev/null +++ b/website/docs/faqs/Warehouse/bq-oauth-drive-scope.md @@ -0,0 +1,8 @@ +--- +title: Why does the BigQuery OAuth application require scopes to Google Drive? +description: "Learn more about Google Drive scopes in the BigQuery OAuth application" +sidebar_label: "BigQuery OAuth Drive Scopes" +id: bq-oauth-drive-scope +--- + +BigQuery supports external tables over both personal Google Drive files and shared files. For more information, refer to [Create Google Drive external tables](https://cloud.google.com/bigquery/docs/external-data-drive). diff --git a/website/docs/faqs/Warehouse/database-privileges.md b/website/docs/faqs/Warehouse/database-privileges.md index 73e0549f130..3761b81fe67 100644 --- a/website/docs/faqs/Warehouse/database-privileges.md +++ b/website/docs/faqs/Warehouse/database-privileges.md @@ -12,8 +12,8 @@ schema¹ * read system views to generate documentation (i.e. views in `information_schema`) -On Postgres, Redshift, and Snowflake, use a series of `grants` to ensure that -your user has the correct privileges. +On Postgres, Redshift, Databricks, and Snowflake, use a series of `grants` to ensure that +your user has the correct privileges. Check out [example permissions](/reference/database-permissions/about-database-permissions) for these warehouses. On BigQuery, use the "BigQuery User" role to assign these privileges. diff --git a/website/docs/faqs/Warehouse/db-connection-dbt-compile.md b/website/docs/faqs/Warehouse/db-connection-dbt-compile.md index d8e58155b10..8017da4545b 100644 --- a/website/docs/faqs/Warehouse/db-connection-dbt-compile.md +++ b/website/docs/faqs/Warehouse/db-connection-dbt-compile.md @@ -22,7 +22,7 @@ To generate the compiled SQL for many models, dbt needs to run introspective que These introspective queries include: -- Populating the [relation cache](/guides/advanced/creating-new-materializations#update-the-relation-cache). 
Caching speeds up the metadata checks, including whether an [incremental model](/docs/build/incremental-models) already exists in the data platform. +- Populating the relation cache. For more information, refer to the [Create new materializations](/guides/create-new-materializations) guide. Caching speeds up the metadata checks, including whether an [incremental model](/docs/build/incremental-models) already exists in the data platform. - Resolving [macros](/docs/build/jinja-macros#macros), such as `run_query` or `dbt_utils.get_column_values` that you're using to template out your SQL. This is because dbt needs to run those queries during model SQL compilation. Without a data platform connection, dbt can't perform these introspective queries and won't be able to generate the compiled SQL needed for the next steps in the dbt workflow. You can [`parse`](/reference/commands/parse) a project and use the [`list`](/reference/commands/list) resources in the project, without an internet or data platform connection. Parsing a project is enough to produce a [manifest](/reference/artifacts/manifest-json), however, keep in mind that the written-out manifest won't include compiled SQL. diff --git a/website/docs/guides/adapter-creation.md b/website/docs/guides/adapter-creation.md new file mode 100644 index 00000000000..8a9145f0258 --- /dev/null +++ b/website/docs/guides/adapter-creation.md @@ -0,0 +1,1352 @@ +--- +title: Build, test, document, and promote adapters +id: adapter-creation +description: "Create an adapter that connects dbt to you platform, and learn how to maintain and version that adapter." +hoverSnippet: "Learn how to build, test, document, and promote adapters as well as maintaining and versioning an adapter." +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Adapter creation'] +level: 'Advanced' +recently_updated: true +--- + +## Introduction + +Adapters are an essential component of dbt. At their most basic level, they are how dbt connects with the various supported data platforms. At a higher-level, dbt Core adapters strive to give analytics engineers more transferrable skills as well as standardize how analytics projects are structured. Gone are the days where you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt Core. + + Navigating and developing around the nuances of different databases can be daunting, but you are not alone. Visit [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel for additional help beyond the documentation. + +### All databases are not the same + +There's a tremendous amount of work that goes into creating a database. Here is a high-level list of typical database layers (from the outermost layer moving inwards): +- SQL API +- Client Library / Driver +- Server Connection Manager +- Query parser +- Query optimizer +- Runtime +- Storage Access Layer +- Storage + +There's a lot more there than just SQL as a language. Databases (and data warehouses) are so popular because you can abstract away a great deal of the complexity from your brain to the database itself. This enables you to focus more on the data. 
+ +dbt allows for further abstraction and standardization of the outermost layers of a database (SQL API, client library, connection manager) into a framework that both: + - Opens database technology to less technical users (a large swath of a DBA's role has been automated, similar to how the vast majority of folks with websites today no longer have to be "[webmasters](https://en.wikipedia.org/wiki/Webmaster)"). + - Enables more meaningful conversations about how data warehousing should be done. + +This is where dbt adapters become critical. + +### What needs to be adapted? + +dbt adapters are responsible for _adapting_ dbt's standard functionality to a particular database. Our prototypical database and adapter are PostgreSQL and dbt-postgres, and most of our adapters are somewhat based on the functionality described in dbt-postgres. + +Connecting dbt to a new database will require a new adapter to be built or an existing adapter to be extended. + +The outermost layers of a database map roughly to the areas in which the dbt adapter framework encapsulates inter-database differences. + +### SQL API + +Even amongst ANSI-compliant databases, there are differences in the SQL grammar. +Here are some categories and examples of SQL statements that can be constructed differently: + + +| Category | Area of differences | Examples | +|----------------------------------------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Statement syntax | The use of `IF EXISTS` |
• `IF EXISTS, DROP TABLE` • `DROP <TABLE> IF EXISTS` | +| Workflow definition & semantics | Incremental updates |
• `MERGE` • `DELETE; INSERT` | +| Relation and column attributes/configuration | Database-specific materialization configs |
• `DIST = ROUND_ROBIN` (Synapse) • `DIST = EVEN` (Redshift) | +| Permissioning | Grant statements that can only take one grantee at a time vs those that accept lists of grantees |
• `grant SELECT on table dinner.corn to corn_kid, everyone` • `grant SELECT on table dinner.corn to corn_kid; grant SELECT on table dinner.corn to everyone` | + +### Python Client Library & Connection Manager + +The other big category of inter-database differences comes with how the client connects to the database and executes queries against the connection. To integrate with dbt, a data platform must have a pre-existing Python client library or support ODBC, using a generic Python library like pyodbc. + +| Category | Area of differences | Examples | +|------------------------------|-------------------------------------------|----------| +| Credentials & authentication | Authentication |
• Username & password • MFA with `boto3` or Okta token | +| Connection opening/closing | Create a new connection to db |
• `psycopg2.connect(connection_string)` • `google.cloud.bigquery.Client(...)` | +| Inserting local data | Load seed `.csv` files into Python memory |
• `google.cloud.bigquery.Client.load_table_from_file(...)` (BigQuery) • `INSERT ... INTO VALUES ...` prepared statement (most other databases)
    • | + + +### How dbt encapsulates and abstracts these differences + +Differences between databases are encoded into discrete areas: + +| Components | Code Path | Function | +|------------------|---------------------------------------------------|-------------------------------------------------------------------------------| +| Python Classes | `adapters/` | Configuration (See above [Python classes](##python classes) | +| Macros | `include//macros/adapters/` | SQL API & statement syntax (for example, how to create schema or how to get table info) | +| Materializations | `include//macros/materializations/` | Table/view/snapshot/ workflow definitions | + + +#### Python Classes + +These classes implement all the methods responsible for: +- Connecting to a database and issuing queries. +- Providing dbt with database-specific configuration information. + +| Class | Description | +|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| AdapterClass | High-level configuration type conversion and any database-specific python methods needed | +| AdapterCredentials | Typed dictionary of possible profiles and associated methods | +| AdapterConnectionManager | All the methods responsible for connecting to a database and issuing queries | +| AdapterRelation | How relation names should be rendered, printed, and quoted. Do relation names use all three parts? `catalog.model_name` (two-part name) or `database.schema.model_name` (three-part name) | +| AdapterColumn | How names should be rendered, and database-specific properties | + +#### Macros + +A set of *macros* responsible for generating SQL that is compliant with the target database. + +#### Materializations + +A set of *materializations* and their corresponding helper macros defined in dbt using jinja and SQL. They codify for dbt how model files should be persisted into the database. + +### Adapter Architecture + + +Below is a diagram of how dbt-postgres, the adapter at the center of dbt-core, works. + + + +## Prerequisites + +It is very important that you have the right skills, and understand the level of difficulty required to make an adapter for your data platform. + +The more you can answer Yes to the below questions, the easier your adapter development (and user-) experience will be. See the [New Adapter Information Sheet wiki](https://github.com/dbt-labs/dbt-core/wiki/New-Adapter-Information-Sheet) for even more specific questions. + +### Training + +- the developer (and any product managers) ideally will have substantial experience as an end-user of dbt. If not, it is highly advised that you at least take the [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) and [Advanced Materializations](https://courses.getdbt.com/courses/advanced-materializations) course. + +### Database + +- Does the database complete transactions fast enough for interactive development? +- Can you execute SQL against the data platform? +- Is there a concept of schemas? +- Does the data platform support ANSI SQL, or at least a subset? + +### Driver / Connection Library + +- Is there a Python-based driver for interacting with the database that is db API 2.0 compliant (e.g. Psycopg2 for Postgres, pyodbc for SQL Server) +- Does it support: prepared statements, multiple statements, or single sign on token authorization to the data platform? 
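To make the database and driver questions above concrete, here is a minimal smoke-test sketch you could run before committing to an adapter. It assumes a hypothetical DB API 2.0 module named `myadapter_driver` and placeholder credentials; substitute your platform's actual client library and connection arguments.

```python
import time

import myadapter_driver as driver  # hypothetical DB API 2.0 client library


def smoke_test(creds: dict) -> None:
    """Quick checks: can we connect, execute SQL, use schemas, and do it fast?"""
    start = time.monotonic()
    conn = driver.connect(**creds)  # does opening a connection work (and work quickly)?
    try:
        cur = conn.cursor()  # DB API 2.0 cursor support
        cur.execute("select 1")  # can you execute plain SQL?
        assert cur.fetchone()[0] == 1
        cur.execute("create schema dbt_smoke_test")  # is there a concept of schemas?
        cur.execute("drop schema dbt_smoke_test")
    finally:
        conn.close()
    elapsed = time.monotonic() - start
    print(f"Round trip took {elapsed:.2f}s")  # fast enough for interactive development?


if __name__ == "__main__":
    smoke_test({"host": "localhost", "user": "dbt", "password": "dbt"})
```

If any of these calls turns out to be awkward or slow on your platform, that is a useful early signal about how much work the adapter (and its users) will face.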
+ +### Open source software + +- Does your organization have an established process for publishing open source software? + +It is easiest to build an adapter for dbt when the data platform in question has: + +- a conventional ANSI-SQL interface (or as close to it as possible), +- a mature connection library/SDK that uses ODBC or the Python DB API 2.0, and +- a way to enable developers to iterate rapidly with both quick reads and writes + +### Maintaining your new adapter + +When your adapter becomes more popular, and people start using it, you may quickly become the maintainer of an increasingly popular open source project. With this new role come some unexpected responsibilities that not only include code maintenance, but also working with a community of users and contributors. To help people understand what to expect of your project, you should communicate your intentions early and often in your adapter documentation or README. Answer questions like: Is this experimental work that people should use at their own risk? Or is this production-grade code that you're committed to maintaining into the future? + +#### Keeping the code compatible with dbt Core + +New minor version releases of `dbt-core` may include changes to the Python interface for adapter plugins, as well as new or updated test cases. The maintainers of `dbt-core` will clearly communicate these changes in documentation and release notes, and they will aim for backwards compatibility whenever possible. + +Patch releases of `dbt-core` will _not_ include breaking changes to adapter-facing code. For more details, see ["About dbt Core versions"](/docs/dbt-versions/core). + +#### Versioning and releasing your adapter + +We strongly encourage you to adopt the following approach when versioning and releasing your plugin: + +- The minor version of your plugin should match the minor version in `dbt-core` (e.g. 1.1.x). +- Aim to release a new version of your plugin for each new minor version of `dbt-core` (once every three months). +- While your plugin is new, and you're iterating on features, aim to offer backwards compatibility and deprecation notices for at least one minor version. As your plugin matures, aim to leave backwards compatibility and deprecation notices in place until the next major version (dbt Core v2). +- Release patch versions of your plugins whenever needed. These patch releases should contain fixes _only_. + +## Build a new adapter + +This step will walk you through creating the necessary adapter classes and macros, and provide some resources to help you validate that your new adapter is working correctly. Make sure you've familiarized yourself with the previous steps in this guide. + +Once the adapter is passing most of the functional tests in the "Test your adapter" step, please let the community know that it is available to use by adding the adapter to the ["Supported Data Platforms"](/docs/supported-data-platforms) page by following the steps given in "Documenting your adapter". + +For any questions you may have, don't hesitate to ask in the [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel. The community is very helpful and likely has experienced a similar issue to yours. + +### Scaffolding a new adapter + + To create a new adapter plugin from scratch, you can use the [dbt-database-adapter-scaffold](https://github.com/dbt-labs/dbt-database-adapter-scaffold) to trigger an interactive session which will generate a scaffolding for you to build upon.
+ + Example usage: + + ``` + $ cookiecutter gh:dbt-labs/dbt-database-adapter-scaffold + ``` + +The generated boilerplate starting project will include a basic adapter plugin file structure, examples of macros, high level method descriptions, etc. + +One of the most important choices you will make during the cookiecutter generation will revolve around the field for `is_sql_adapter` which is a boolean used to correctly apply imports for either a `SQLAdapter` or `BaseAdapter`. Knowing which you will need requires a deeper knowledge of your selected database but a few good guides for the choice are. + +- Does your database have a complete SQL API? Can it perform tasks using SQL such as creating schemas, dropping schemas, querying an `information_schema` for metadata calls? If so, it is more likely to be a SQLAdapter where you set `is_sql_adapter` to `True`. +- Most adapters do fall under SQL adapters which is why we chose it as the default `True` value. +- It is very possible to build out a fully functional `BaseAdapter`. This will require a little more ground work as it doesn't come with some prebuilt methods the `SQLAdapter` class provides. See `dbt-bigquery` as a good guide. + +### Implementation Details + +Regardless if you decide to use the cookiecutter template or manually create the plugin, this section will go over each method that is required to be implemented. The table below provides a high-level overview of the classes, methods, and macros you may have to define for your data platform. + +| file | component | purpose | +|---------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `./setup.py` | `setup()` function | adapter meta-data (package name, version, author, homepage, etc) | +| `myadapter/dbt/adapters/myadapter/__init__.py` | `AdapterPlugin` | bundle all the information below into a dbt plugin | +| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterCredentials` class | parameters to connect to and configure the database, via a the chosen Python driver | +| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterConnectionManager` class | telling dbt how to interact with the database w.r.t opening/closing connections, executing queries, and fetching data. Effectively a wrapper around the db API or driver. | +| `myadapter/dbt/include/bigquery/` | a dbt project of macro "overrides" in the format of "myadapter__" | any differences in SQL syntax for regular db operations will be modified here from the global_project (e.g. "Create Table As Select", "Get all relations in the current schema", etc) | +| `myadapter/dbt/adapters/myadapter/impl.py` | `MyAdapterConfig` | database- and relation-level configs and | +| `myadapter/dbt/adapters/myadapter/impl.py` | `MyAdapterAdapter` | for changing _how_ dbt performs operations like macros and other needed Python functionality | +| `myadapter/dbt/adapters/myadapter/column.py` | `MyAdapterColumn` | for defining database-specific column such as datatype mappings | + +### Editing `setup.py` + +Edit the file at `myadapter/setup.py` and fill in the missing information. + +You can skip this step if you passed the arguments for `email`, `url`, `author`, and `dependencies` to the cookiecutter template script. 
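For reference, a bare-bones `setup.py` for a hypothetical `dbt-myadapter` plugin might look like the sketch below. The package name, author details, and version pins are placeholders rather than requirements, and the cookiecutter output already includes a fuller version of this file.

```python
#!/usr/bin/env python
from setuptools import find_namespace_packages, setup

setup(
    name="dbt-myadapter",  # placeholder package name
    version="1.6.0",  # keep the minor version in step with dbt-core
    description="The MyAdapter plugin for dbt",
    author="Your Name",
    author_email="you@example.com",
    url="https://github.com/example/dbt-myadapter",
    packages=find_namespace_packages(include=["dbt", "dbt.*"]),
    include_package_data=True,  # ship the macros and profile_template.yml under dbt/include/myadapter/
    install_requires=[
        "dbt-core~=1.6.0",  # illustrative pin
        "myadapter-driver~=2.0",  # hypothetical client library
    ],
)
```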
If you plan on having nested macro folder structures, you may need to add entries to `package_data` so your macro source files get installed. + +### Editing the connection manager + +Edit the connection manager at `myadapter/dbt/adapters/myadapter/connections.py`. This file is defined in the sections below. + +#### The Credentials class + +The credentials class defines all of the database-specific credentials (e.g. `username` and `password`) that users will need in the [connection profile](/docs/supported-data-platforms) for your new adapter. Each credentials contract should subclass dbt.adapters.base.Credentials, and be implemented as a python dataclass. + +Note that the base class includes required database and schema fields, as dbt uses those values internally. + +For example, if your adapter requires a host, integer port, username string, and password string, but host is the only required field, you'd add definitions for those new properties to the class as types, like this: + + + +```python + +from dataclasses import dataclass +from typing import Optional + +from dbt.adapters.base import Credentials + + +@dataclass +class MyAdapterCredentials(Credentials): + host: str + port: int = 1337 + username: Optional[str] = None + password: Optional[str] = None + + @property + def type(self): + return 'myadapter' + + @property + def unique_field(self): + """ + Hashed and included in anonymous telemetry to track adapter adoption. + Pick a field that can uniquely identify one team/organization building with this adapter + """ + return self.host + + def _connection_keys(self): + """ + List of keys to display in the `dbt debug` output. + """ + return ('host', 'port', 'database', 'username') +``` + + + +There are a few things you can do to make it easier for users when connecting to your database: + +- Be sure to implement the Credentials' `_connection_keys` method shown above. This method will return the keys that should be displayed in the output of the `dbt debug` command. As a general rule, it's good to return all the arguments used in connecting to the actual database except the password (even optional arguments). +- Create a `profile_template.yml` to enable configuration prompts for a brand-new user setting up a connection profile via the [`dbt init` command](/reference/commands/init). You will find more details in the following steps. +- You may also want to define an `ALIASES` mapping on your Credentials class to include any config names you want users to be able to use in place of 'database' or 'schema'. For example if everyone using the MyAdapter database calls their databases "collections", you might do: + + + +```python +@dataclass +class MyAdapterCredentials(Credentials): + host: str + port: int = 1337 + username: Optional[str] = None + password: Optional[str] = None + + ALIASES = { + 'collection': 'database', + } +``` + + + +Then users can use `collection` OR `database` in their `profiles.yml`, `dbt_project.yml`, or `config()` calls to set the database. + +#### `ConnectionManager` class methods + +Once credentials are configured, you'll need to implement some connection-oriented methods. They are enumerated in the SQLConnectionManager docstring, but an overview will also be provided here. 
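Before walking through the individual methods, it can help to see the shape of the class itself. For a SQL-style adapter, the connection manager is typically a small subclass that declares its `TYPE` and then fills in the methods listed next; this is a minimal sketch assuming the `SQLConnectionManager` base class and the `MyAdapterCredentials` shown above.

```python
from dbt.adapters.sql import SQLConnectionManager


class MyAdapterConnectionManager(SQLConnectionManager):
    TYPE = "myadapter"  # should match the `type` property on MyAdapterCredentials

    # Implementations of open(), get_response(), cancel(), exception_handler(),
    # and standardize_grants_dict() go here -- see the sections that follow.
```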
+ +**Methods to implement:** + +- `open` +- `get_response` +- `cancel` +- `exception_handler` +- `standardize_grants_dict` + +##### `open(cls, connection)` + +`open()` is a classmethod that gets a connection object (which could be in any state, but will have a `Credentials` object with the attributes you defined above) and moves it to the 'open' state. + +Generally this means doing the following: + - if the connection is open already, log and return it. + - If a database needed changes to the underlying connection before re-use, that would happen here + - create a connection handle using the underlying database library using the credentials + - on success: + - set connection.state to `'open'` + - set connection.handle to the handle object + - this is what must have a `cursor()` method that returns a cursor! + - on error: + - set connection.state to `'fail'` + - set connection.handle to `None` + - raise a `dbt.exceptions.FailedToConnectException` with the error and any other relevant information + +For example: + + + +```python + @classmethod + def open(cls, connection): + if connection.state == 'open': + logger.debug('Connection is already open, skipping open.') + return connection + + credentials = connection.credentials + + try: + handle = myadapter_library.connect( + host=credentials.host, + port=credentials.port, + username=credentials.username, + password=credentials.password, + catalog=credentials.database + ) + connection.state = 'open' + connection.handle = handle + return connection +``` + + + +##### `get_response(cls, cursor)` + +`get_response` is a classmethod that gets a cursor object and returns adapter-specific information about the last executed command. The return value should be an `AdapterResponse` object that includes items such as `code`, `rows_affected`, `bytes_processed`, and a summary `_message` for logging to stdout. + + + +```python + @classmethod + def get_response(cls, cursor) -> AdapterResponse: + code = cursor.sqlstate or "OK" + rows = cursor.rowcount + status_message = f"{code} {rows}" + return AdapterResponse( + _message=status_message, + code=code, + rows_affected=rows + ) +``` + + + +##### `cancel(self, connection)` + +`cancel` is an instance method that gets a connection object and attempts to cancel any ongoing queries, which is database dependent. Some databases don't support the concept of cancellation, they can simply implement it via 'pass' and their adapter classes should implement an `is_cancelable` that returns False - On ctrl+c connections may remain running. This method must be implemented carefully, as the affected connection will likely be in use in a different thread. + + + +```python + def cancel(self, connection): + tid = connection.handle.transaction_id() + sql = 'select cancel_transaction({})'.format(tid) + logger.debug("Cancelling query '{}' ({})".format(connection_name, pid)) + _, cursor = self.add_query(sql, 'master') + res = cursor.fetchone() + logger.debug("Canceled query '{}': {}".format(connection_name, res)) +``` + + + +##### `exception_handler(self, sql, connection_name='master')` + +`exception_handler` is an instance method that returns a context manager that will handle exceptions raised by running queries, catch them, log appropriately, and then raise exceptions dbt knows how to handle. 
+ +If you use the (highly recommended) `@contextmanager` decorator, you only have to wrap a `yield` inside a `try` block, like so: + + + +```python + @contextmanager + def exception_handler(self, sql: str): + try: + yield + except myadapter_library.DatabaseError as exc: + self.release(connection_name) + + logger.debug('myadapter error: {}'.format(str(e))) + raise dbt.exceptions.DatabaseException(str(exc)) + except Exception as exc: + logger.debug("Error running SQL: {}".format(sql)) + logger.debug("Rolling back transaction.") + self.release(connection_name) + raise dbt.exceptions.RuntimeException(str(exc)) +``` + + + +##### `standardize_grants_dict(self, grants_table: agate.Table) -> dict` + +`standardize_grants_dict` is an method that returns the dbt-standardized grants dictionary that matches how users configure grants now in dbt. The input is the result of `SHOW GRANTS ON {{model}}` call loaded into an agate table. + +If there's any massaging of agate table containing the results, of `SHOW GRANTS ON {{model}}`, that can't easily be accomplished in SQL, it can be done here. For example, the SQL to show grants _should_ filter OUT any grants TO the current user/role (e.g. OWNERSHIP). If that's not possible in SQL, it can be done in this method instead. + + + +```python + @available + def standardize_grants_dict(self, grants_table: agate.Table) -> dict: + """ + :param grants_table: An agate table containing the query result of + the SQL returned by get_show_grant_sql + :return: A standardized dictionary matching the `grants` config + :rtype: dict + """ + grants_dict: Dict[str, List[str]] = {} + for row in grants_table: + grantee = row["grantee"] + privilege = row["privilege_type"] + if privilege in grants_dict.keys(): + grants_dict[privilege].append(grantee) + else: + grants_dict.update({privilege: [grantee]}) + return grants_dict +``` + + + +### Editing the adapter implementation + +Edit the connection manager at `myadapter/dbt/adapters/myadapter/impl.py` + +Very little is required to implement the adapter itself. On some adapters, you will not need to override anything. On others, you'll likely need to override some of the ``convert_*`` classmethods, or override the `is_cancelable` classmethod on others to return `False`. + +#### `datenow()` + +This classmethod provides the adapter's canonical date function. This is not used but is required– anyway on all adapters. + + + +```python + @classmethod + def date_function(cls): + return 'datenow()' +``` + + + +### Editing SQL logic + +dbt implements specific SQL operations using jinja macros. While reasonable defaults are provided for many such operations (like `create_schema`, `drop_schema`, `create_table`, etc), you may need to override one or more of macros when building a new adapter. + +#### Required macros + +The following macros must be implemented, but you can override their behavior for your adapter using the "dispatch" pattern described below. Macros marked (required) do not have a valid default implementation, and are required for dbt to operate. 
+ +- `alter_column_type` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L37-L55)) +- `check_schema_exists` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L43-L55)) +- `create_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L1-L9)) +- `drop_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L34-L42)) +- `drop_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L12-L20)) +- `get_columns_in_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L1-L8)) (required) +- `list_relations_without_caching` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L58-L65)) (required) +- `list_schemas` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L29-L40)) +- `rename_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L56-L65)) +- `truncate_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L45-L53)) +- `current_timestamp` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/freshness.sql#L1-L8)) (required) +- `copy_grants` + +#### Adapter dispatch + +Most modern databases support a majority of the standard SQL spec. There are some databases that _do not_ support critical aspects of the SQL spec however, or they provide their own nonstandard mechanisms for implementing the same functionality. To account for these variations in SQL support, dbt provides a mechanism called [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch) for macros. With this feature, macros can be overridden for specific adapters. This makes it possible to implement high-level methods (like "create ") in a database-specific way. + + + +```jinja2 + +{# dbt will call this macro by name, providing any arguments #} +{% macro create_table_as(temporary, relation, sql) -%} + + {# dbt will dispatch the macro call to the relevant macro #} + {{ return( + adapter.dispatch('create_table_as')(temporary, relation, sql) + ) }} +{%- endmacro %} + + + +{# If no macro matches the specified adapter, "default" will be used #} +{% macro default__create_table_as(temporary, relation, sql) -%} + ... +{%- endmacro %} + + + +{# Example which defines special logic for Redshift #} +{% macro redshift__create_table_as(temporary, relation, sql) -%} + ... +{%- endmacro %} + + + +{# Example which defines special logic for BigQuery #} +{% macro bigquery__create_table_as(temporary, relation, sql) -%} + ... 
+{%- endmacro %} +``` + + + +The `adapter.dispatch()` macro takes a second argument, `packages`, which represents a set of "search namespaces" in which to find potential implementations of a dispatched macro. This allows users of community-supported adapters to extend or "shim" dispatched macros from common packages, such as `dbt-utils`, with adapter-specific versions in their own project or other installed packages. See: + +- "Shim" package examples: [`spark-utils`](https://github.com/dbt-labs/spark-utils), [`tsql-utils`](https://github.com/dbt-msft/tsql-utils) +- [`adapter.dispatch` docs](/reference/dbt-jinja-functions/dispatch) + +#### Overriding adapter methods + +While much of dbt's adapter-specific functionality can be modified in adapter macros, it can also make sense to override adapter methods directly. In this example, assume that a database does not support a `cascade` parameter to `drop schema`. Instead, we can implement an approximation where we drop each relation and then drop the schema. + + + +```python + def drop_schema(self, relation: BaseRelation): + relations = self.list_relations( + database=relation.database, + schema=relation.schema + ) + for relation in relations: + self.drop_relation(relation) + super().drop_schema(relation) +``` + + + +#### Grants Macros + +See [this GitHub discussion](https://github.com/dbt-labs/dbt-core/discussions/5468) for information on the macros required for `GRANT` statements: + +### Other files + +#### `profile_template.yml` + +In order to enable the [`dbt init` command](/reference/commands/init) to prompt users when setting up a new project and connection profile, you should include a **profile template**. The filepath needs to be `dbt/include//profile_template.yml`. It's possible to provide hints, default values, and conditional prompts based on connection methods that require different supporting attributes. Users will also be able to include custom versions of this file in their own projects, with fixed values specific to their organization, to support their colleagues when using your dbt adapter for the first time. + +See examples: + +- [dbt-postgres](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml) +- [dbt-redshift](https://github.com/dbt-labs/dbt-redshift/blob/main/dbt/include/redshift/profile_template.yml) +- [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/include/snowflake/profile_template.yml) +- [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery/blob/main/dbt/include/bigquery/profile_template.yml) + +#### `__version__.py` + +To assure that `dbt --version` provides the latest dbt core version the adapter supports, be sure include a `__version__.py` file. The filepath will be `dbt/adapters//__version__.py`. We recommend using the latest dbt core version and as the adapter is made compatible with later versions, this file will need to be updated. For a sample file, check out this [example](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/adapters/snowflake/__version__.py). + +It should be noted that both of these files are included in the bootstrapped output of the `dbt-database-adapter-scaffold` so when using the scaffolding, these files will be included. + +## Test your adapter + +:::info + +Previously, we offered a packaged suite of tests for dbt adapter functionality: [`pytest-dbt-adapter`](https://github.com/dbt-labs/dbt-adapter-tests). We are deprecating that suite, in favor of the newer testing framework outlined in this document. 
+ +::: + +This document has two sections: + +1. Refer to "About the testing framework" for a description of the standard framework that we maintain for using pytest together with dbt. It includes an example that shows the anatomy of a simple test case. +2. Refer to "Testing your adapter" for a step-by-step guide for using our out-of-the-box suite of "basic" tests, which will validate that your adapter meets a baseline of dbt functionality. + +### Testing prerequisites + +- Your adapter must be compatible with dbt-core **v1.1** or newer +- You should be familiar with **pytest**: + +### About the testing framework + +dbt-core offers a standard framework for running pre-built functional tests, and for defining your own tests. The core testing framework is built using `pytest`, a mature and standard library for testing Python projects. + +The **[`tests` module](https://github.com/dbt-labs/dbt-core/tree/HEAD/core/dbt/tests)** within `dbt-core` includes basic utilities for setting up pytest + dbt. These are used by all "pre-built" functional tests, and make it possible to quickly write your own tests. + +Those utilities allow you to do three basic things: + +1. **Quickly set up a dbt "project."** Define project resources via methods such as `models()` and `seeds()`. Use `project_config_update()` to pass configurations into `dbt_project.yml`. +2. **Define a sequence of dbt commands.** The most important utility is `run_dbt()`, which returns the [results](/reference/dbt-classes#result-objects) of each dbt command. It takes a list of CLI specifiers (subcommand + flags), as well as an optional second argument, `expect_pass=False`, for cases where you expect the command to fail. +3. **Validate the results of those dbt commands.** For example, `check_relations_equal()` asserts that two database objects have the same structure and content. You can also write your own `assert` statements, by inspecting the results of a dbt command, or querying arbitrary database objects with `project.run_sql()`. + +You can see the full suite of utilities, with arguments and annotations, in [`util.py`](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/tests/util.py). You'll also see them crop up across a number of test cases. While all utilities are intended to be reusable, you won't need all of them for every test. In the example below, we'll show a simple test case that uses only a few utilities. + +#### Example: a simple test case + +This example will show you the anatomy of a test case using dbt + pytest. We will create reusable components, combine them to form a dbt "project", and define a sequence of dbt commands. Then, we'll use Python `assert` statements to ensure those commands succeed (or fail) as we expect. + +In ["Getting started running basic tests,"](#getting-started-running-basic-tests) we'll offer step-by-step instructions for installing and configuring `pytest`, so that you can run it on your own machine. For now, it's more important to see how the pieces of a test case fit together. + +This example includes a seed, a model, and two tests—one of which will fail. + +1. Define Python strings that will represent the file contents in your dbt project. Defining these in a separate file enables you to reuse the same components across different test cases. The pytest name for this type of reusable component is "fixture." 
+ + + +```python +# seeds/my_seed.csv +my_seed_csv = """ +id,name,some_date +1,Easton,1981-05-20T06:46:51 +2,Lillian,1978-09-03T18:10:33 +3,Jeremiah,1982-03-11T03:59:51 +4,Nolan,1976-05-06T20:21:35 +""".lstrip() + +# models/my_model.sql +my_model_sql = """ +select * from {{ ref('my_seed') }} +union all +select null as id, null as name, null as some_date +""" + +# models/my_model.yml +my_model_yml = """ +version: 2 +models: + - name: my_model + columns: + - name: id + tests: + - unique + - not_null # this test will fail +""" +``` + + + +2. Use the "fixtures" to define the project for your test case. These fixtures are always scoped to the **class**, where the class represents one test case—that is, one dbt project or scenario. (The same test case can be used for one or more actual tests, which we'll see in step 3.) Following the default pytest configurations, the file name must begin with `test_`, and the class name must begin with `Test`. + + + +```python +import pytest +from dbt.tests.util import run_dbt + +# our file contents +from tests.functional.example.fixtures import ( + my_seed_csv, + my_model_sql, + my_model_yml, +) + +# class must begin with 'Test' +class TestExample: + """ + Methods in this class will be of two types: + 1. Fixtures defining the dbt "project" for this test case. + These are scoped to the class, and reused for all tests in the class. + 2. Actual tests, whose names begin with 'test_'. + These define sequences of dbt commands and 'assert' statements. + """ + + # configuration in dbt_project.yml + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "name": "example", + "models": {"+materialized": "view"} + } + + # everything that goes in the "seeds" directory + @pytest.fixture(scope="class") + def seeds(self): + return { + "my_seed.csv": my_seed_csv, + } + + # everything that goes in the "models" directory + @pytest.fixture(scope="class") + def models(self): + return { + "my_model.sql": my_model_sql, + "my_model.yml": my_model_yml, + } + + # continues below +``` + + + +3. Now that we've set up our project, it's time to define a sequence of dbt commands and assertions. We define one or more methods in the same file, on the same class (`TestExampleFailingTest`), whose names begin with `test_`. These methods share the same setup (project scenario) from above, but they can be run independently by pytest—so they shouldn't depend on each other in any way. + + + +```python + # continued from above + + # The actual sequence of dbt commands and assertions + # pytest will take care of all "setup" + "teardown" + def test_run_seed_test(self, project): + """ + Seed, then run, then test. We expect one of the tests to fail + An alternative pattern is to use pytest "xfail" (see below) + """ + # seed seeds + results = run_dbt(["seed"]) + assert len(results) == 1 + # run models + results = run_dbt(["run"]) + assert len(results) == 1 + # test tests + results = run_dbt(["test"], expect_pass = False) # expect failing test + assert len(results) == 2 + # validate that the results include one pass and one failure + result_statuses = sorted(r.status for r in results) + assert result_statuses == ["fail", "pass"] + + @pytest.mark.xfail + def test_build(self, project): + """Expect a failing test""" + # do it all + results = run_dbt(["build"]) +``` + + + +3. Our test is ready to run! The last step is to invoke `pytest` from your command line. We'll walk through the actual setup and configuration of `pytest` in the next section. 
+ + + +```sh +$ python3 -m pytest tests/functional/test_example.py +=========================== test session starts ============================ +platform ... -- Python ..., pytest-..., pluggy-... +rootdir: ... +plugins: ... + +tests/functional/test_example.py .X [100%] + +======================= 1 passed, 1 xpassed in 1.38s ======================= +``` + + + +You can find more ways to run tests, along with a full command reference, in the [pytest usage docs](https://docs.pytest.org/how-to/usage.html). + +We've found the `-s` flag (or `--capture=no`) helpful to print logs from the underlying dbt invocations, and to step into an interactive debugger if you've added one. You can also use environment variables to set [global dbt configs](/reference/global-configs/about-global-configs), such as `DBT_DEBUG` (to show debug-level logs). + +### Testing this adapter + +Anyone who installs `dbt-core`, and wishes to define their own test cases, can use the framework presented in the first section. The framework is especially useful for testing standard dbt behavior across different databases. + +To that end, we have built and made available a [package of reusable adapter test cases](https://github.com/dbt-labs/dbt-core/tree/HEAD/tests/adapter), for creators and maintainers of adapter plugins. These test cases cover basic expected functionality, as well as functionality that frequently requires different implementations across databases. + +For the time being, this package is also located within the `dbt-core` repository, but separate from the `dbt-core` Python package. + +### Categories of tests + +In the course of creating and maintaining your adapter, it's likely that you will end up implementing tests that fall into three broad categories: + +1. **Basic tests** that every adapter plugin is expected to pass. These are defined in `tests.adapter.basic`. Given differences across data platforms, these may require slight modification or reimplementation. Significantly overriding or disabling these tests should be with good reason, since each represents basic functionality expected by dbt users. For example, if your adapter does not support incremental models, you should disable the test, [by marking it with `skip` or `xfail`](https://docs.pytest.org/en/latest/how-to/skipping.html), as well as noting that limitation in any documentation, READMEs, and usage guides that accompany your adapter. + +2. **Optional tests**, for second-order functionality that is common across plugins, but not required for basic use. Your plugin can opt into these test cases by inheriting existing ones, or reimplementing them with adjustments. For now, this category includes all tests located outside the `basic` subdirectory. More tests will be added as we convert older tests defined on dbt-core and mature plugins to use the standard framework. + +3. **Custom tests**, for behavior that is specific to your adapter / data platform. Each has its own specialties and idiosyncracies. We encourage you to use the same `pytest`-based framework, utilities, and fixtures to write your own custom tests for functionality that is unique to your adapter. + +If you run into an issue with the core framework, or the basic/optional test cases—or if you've written a custom test that you believe would be relevant and useful for other adapter plugin developers—please open an issue or PR in the `dbt-core` repository on GitHub. 
+ +### Getting started running basic tests + +In this section, we'll walk through the three steps to start running our basic test cases on your adapter plugin: + +1. Install dependencies +2. Set up and configure pytest +3. Define test cases + +### Install dependencies + +You should already have a virtual environment with `dbt-core` and your adapter plugin installed. You'll also need to install: + +- [`pytest`](https://pypi.org/project/pytest/) +- [`dbt-tests-adapter`](https://pypi.org/project/dbt-tests-adapter/), the set of common test cases +- (optional) [`pytest` plugins](https://docs.pytest.org/en/7.0.x/reference/plugin_list.html)--we'll use `pytest-dotenv` below + +Or specify all dependencies in a requirements file like: + + +```txt +pytest +pytest-dotenv +dbt-tests-adapter +``` + + + +```sh +pip install -r dev_requirements.txt +``` + +### Set up and configure pytest + +First, set yourself up to run `pytest` by creating a file named `pytest.ini` at the root of your repository: + + + +```python +[pytest] +filterwarnings = + ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning + ignore:unclosed file .*:ResourceWarning +env_files = + test.env # uses pytest-dotenv plugin + # this allows you to store env vars for database connection in a file named test.env + # rather than passing them in every CLI command, or setting in `PYTEST_ADDOPTS` + # be sure to add "test.env" to .gitignore as well! +testpaths = + tests/functional # name per convention +``` + + + +Then, create a configuration file within your tests directory. In it, you'll want to define all necessary profile configuration for connecting to your data platform in local development and continuous integration. We recommend setting these values with environment variables, since this file will be checked into version control. + + + +```python +import pytest +import os + +# Import the standard functional fixtures as a plugin +# Note: fixtures with session scope need to be local +pytest_plugins = ["dbt.tests.fixtures.project"] + +# The profile dictionary, used to write out profiles.yml +# dbt will supply a unique schema per test, so we do not specify 'schema' here +@pytest.fixture(scope="class") +def dbt_profile_target(): + return { + 'type': '', + 'threads': 1, + 'host': os.getenv('HOST_ENV_VAR_NAME'), + 'user': os.getenv('USER_ENV_VAR_NAME'), + ... + } +``` + + + +### Define test cases + +As in the example above, each test case is defined as a class, and has its own "project" setup. To get started, you can import all basic test cases and try running them without changes. 
+ + + +```python +import pytest + +from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations +from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests +from dbt.tests.adapter.basic.test_singular_tests_ephemeral import BaseSingularTestsEphemeral +from dbt.tests.adapter.basic.test_empty import BaseEmpty +from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral +from dbt.tests.adapter.basic.test_incremental import BaseIncremental +from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests +from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols +from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp +from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod + +class TestSimpleMaterializationsMyAdapter(BaseSimpleMaterializations): + pass + + +class TestSingularTestsMyAdapter(BaseSingularTests): + pass + + +class TestSingularTestsEphemeralMyAdapter(BaseSingularTestsEphemeral): + pass + + +class TestEmptyMyAdapter(BaseEmpty): + pass + + +class TestEphemeralMyAdapter(BaseEphemeral): + pass + + +class TestIncrementalMyAdapter(BaseIncremental): + pass + + +class TestGenericTestsMyAdapter(BaseGenericTests): + pass + + +class TestSnapshotCheckColsMyAdapter(BaseSnapshotCheckCols): + pass + + +class TestSnapshotTimestampMyAdapter(BaseSnapshotTimestamp): + pass + + +class TestBaseAdapterMethod(BaseAdapterMethod): + pass +``` + + + +Finally, run pytest: + +```sh +python3 -m pytest tests/functional +``` + +### Modifying test cases + +You may need to make slight modifications in a specific test case to get it passing on your adapter. The mechanism to do this is simple: rather than simply inheriting the "base" test with `pass`, you can redefine any of its fixtures or test methods. + +For instance, on Redshift, we need to explicitly cast a column in the fixture input seed to use data type `varchar(64)`: + + + +```python +import pytest +from dbt.tests.adapter.basic.files import seeds_base_csv, seeds_added_csv, seeds_newcolumns_csv +from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols + +# set the datatype of the name column in the 'added' seed so it +# can hold the '_update' that's added +schema_seed_added_yml = """ +version: 2 +seeds: + - name: added + config: + column_types: + name: varchar(64) +""" + +class TestSnapshotCheckColsRedshift(BaseSnapshotCheckCols): + # Redshift defines the 'name' column such that it's not big enough + # to hold the '_update' added in the test. + @pytest.fixture(scope="class") + def models(self): + return { + "base.csv": seeds_base_csv, + "added.csv": seeds_added_csv, + "seeds.yml": schema_seed_added_yml, + } +``` + + + +As another example, the `dbt-bigquery` adapter asks users to "authorize" replacing a with a by supplying the `--full-refresh` flag. The reason: In the table logic, a view by the same name must first be dropped; if the table query fails, the model will be missing. + +Knowing this possibility, the "base" test case offers a `require_full_refresh` switch on the `test_config` fixture class. 
For BigQuery, we'll switch it on: + + + +```python +import pytest +from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations + +class TestSimpleMaterializationsBigQuery(BaseSimpleMaterializations): + @pytest.fixture(scope="class") + def test_config(self): + # effect: add '--full-refresh' flag in requisite 'dbt run' step + return {"require_full_refresh": True} +``` + + + +It's always worth asking whether the required modifications represent gaps in perceived or expected dbt functionality. Are these simple implementation details, which any user of this database would understand? Are they limitations worth documenting? + +If, on the other hand, they represent poor assumptions in the "basic" test cases, which fail to account for a common pattern in other types of databases-—please open an issue or PR in the `dbt-core` repository on GitHub. + +### Running with multiple profiles + +Some databases support multiple connection methods, which map to actually different functionality behind the scenes. For instance, the `dbt-spark` adapter supports connections to Apache Spark clusters _and_ Databricks runtimes, which supports additional functionality out of the box, enabled by the Delta file format. + + + +```python +def pytest_addoption(parser): + parser.addoption("--profile", action="store", default="apache_spark", type=str) + + +# Using @pytest.mark.skip_profile('apache_spark') uses the 'skip_by_profile_type' +# autouse fixture below +def pytest_configure(config): + config.addinivalue_line( + "markers", + "skip_profile(profile): skip test for the given profile", + ) + +@pytest.fixture(scope="session") +def dbt_profile_target(request): + profile_type = request.config.getoption("--profile") + elif profile_type == "databricks_sql_endpoint": + target = databricks_sql_endpoint_target() + elif profile_type == "apache_spark": + target = apache_spark_target() + else: + raise ValueError(f"Invalid profile type '{profile_type}'") + return target + +def apache_spark_target(): + return { + "type": "spark", + "host": "localhost", + ... + } + +def databricks_sql_endpoint_target(): + return { + "type": "spark", + "host": os.getenv("DBT_DATABRICKS_HOST_NAME"), + ... + } + +@pytest.fixture(autouse=True) +def skip_by_profile_type(request): + profile_type = request.config.getoption("--profile") + if request.node.get_closest_marker("skip_profile"): + for skip_profile_type in request.node.get_closest_marker("skip_profile").args: + if skip_profile_type == profile_type: + pytest.skip("skipped on '{profile_type}' profile") +``` + + + +If there are tests that _shouldn't_ run for a given profile: + + + +```python +# Snapshots require access to the Delta file format, available on our Databricks connection, +# so let's skip on Apache Spark +@pytest.mark.skip_profile('apache_spark') +class TestSnapshotCheckColsSpark(BaseSnapshotCheckCols): + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "seeds": { + "+file_format": "delta", + }, + "snapshots": { + "+file_format": "delta", + } + } +``` + + + +Finally: + +```sh +python3 -m pytest tests/functional --profile apache_spark +python3 -m pytest tests/functional --profile databricks_sql_endpoint +``` + +## Document a new adapter + +If you've already built, and tested your adapter, it's time to document it so the dbt community will know that it exists and how to use it. + +### Making your adapter available + +Many community members maintain their adapter plugins under open source licenses. 
If you're interested in doing this, we recommend: + +- Hosting on a public git provider (for example, GitHub or Gitlab) +- Publishing to [PyPI](https://pypi.org/) +- Adding to the list of ["Supported Data Platforms"](/docs/supported-data-platforms#community-supported) (more info below) + +### General Guidelines + +To best inform the dbt community of the new adapter, you should contribute to the dbt's open-source documentation site, which uses the [Docusaurus project](https://docusaurus.io/). This is the site you're currently on! + +### Conventions + +Each `.md` file you create needs a header as shown below. The document id will also need to be added to the config file: `website/sidebars.js`. + +```md +--- +title: "Documenting a new adapter" +id: "documenting-a-new-adapter" +--- +``` + +### Single Source of Truth + +We ask our adapter maintainers to use the [docs.getdbt.com repo](https://github.com/dbt-labs/docs.getdbt.com) (i.e. this site) as the single-source-of-truth for documentation rather than having to maintain the same set of information in three different places. The adapter repo's `README.md` and the data platform's documentation pages should simply link to the corresponding page on this docs site. Keep reading for more information on what should and shouldn't be included on the dbt docs site. + +### Assumed Knowledge + +To simplify things, assume the reader of this documentation already knows how both dbt and your data platform works. There's already great material for how to learn dbt and the data platform out there. The documentation we're asking you to add should be what a user who is already profiecient in both dbt and your data platform would need to know in order to use both. Effectively that boils down to two things: how to connect, and how to configure. + +### Topics and Pages to Cover + +The following subjects need to be addressed across three pages of this docs site to have your data platform be listed on our documentation. After the corresponding pull request is merged, we ask that you link to these pages from your adapter repo's `REAMDE` as well as from your product documentation. + + To contribute, all you will have to do make the changes listed in the table below. + +| How To... | File to change within `/website/docs/` | Action | Info to Include | +|----------------------|--------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Connect | `/docs/core/connect-data-platform/{MY-DATA-PLATFORM}-setup.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. | +| Configure | `reference/resource-configs/{MY-DATA-PLATFORM}-configs.md` | Create | What options and configuration specific to your data platform do users need to know? e.g. table distribution and indexing options, column_quoting policy, which incremental strategies are supported | +| Discover and Install | `docs/supported-data-platforms.md` | Modify | Is it a vendor- or community- supported adapter? How to install Python adapter package? 
Ideally with pip and PyPI hosted package, but can also use `git+` link to GitHub Repo | +| Add link to sidebar | `website/sidebars.js` | Modify | Add the document id to the correct location in the sidebar menu | + +For example say I want to document my new adapter: `dbt-ders`. For the "Connect" page, I will make a new Markdown file, `ders-setup.md` and add it to the `/website/docs/core/connect-data-platform/` directory. + +### Example PRs to add new adapter documentation + +Below are some recent pull requests made by partners to document their data platform's adapter: + +- [TiDB](https://github.com/dbt-labs/docs.getdbt.com/pull/1309) +- [SingleStore](https://github.com/dbt-labs/docs.getdbt.com/pull/1044) +- [Firebolt](https://github.com/dbt-labs/docs.getdbt.com/pull/941) + +## Promote a new adapter + +The most important thing here is recognizing that people are successful in the community when they join, first and foremost, to engage authentically. + +What does authentic engagement look like? It’s challenging to define explicit rules. One good rule of thumb is to treat people with dignity and respect. + +Contributors to the community should think of contribution _as the end itself,_ not a means toward other business KPIs (leads, community members, etc.). [We are a mission-driven company.](https://www.getdbt.com/dbt-labs/values/) Some ways to know if you’re authentically engaging: + +- Is an engagement’s _primary_ purpose of sharing knowledge and resources or building brand engagement? +- Imagine you didn’t work at the org you do — can you imagine yourself still writing this? +- Is it written in formal / marketing language, or does it sound like you, the human? + +### Who should join the dbt community slack? + +- People who have insight into what it means to do hands-on [analytics engineering](https://www.getdbt.com/analytics-engineering/) work + The dbt Community Slack workspace is fundamentally a place for analytics practitioners to interact with each other — the closer the users are in the community to actual data/analytics engineering work, the more natural their engagement will be (leading to better outcomes for partners and the community). + +- DevRel practitioners with strong focus + DevRel practitioners often have a strong analytics background and a good understanding of the community. It’s essential to be sure they are focused on _contributing,_ not on driving community metrics for partner org (such as signing people up for their slack or events). The metrics will rise naturally through authentic engagement. + +- Founder and executives who are interested in directly engaging with the community + This is either incredibly successful or not at all depending on the profile of the founder. Typically, this works best when the founder has a practitioner-level of technical understanding and is interested in joining not to promote, but to learn and hear from users. + +- Software Engineers at partner products that are building and supporting integrations with either dbt Core or dbt Cloud + This is successful when the engineers are familiar with dbt as a product or at least have taken our training course. The Slack is often a place where end-user questions and feedback is initially shared, so it is recommended that someone technical from the team be present. There are also a handful of channels aimed at those building integrations, which tend to be a font of knowledge. + +### Who might struggle in the dbt community + +- People in marketing roles + dbt Slack is not a marketing channel. 
Attempts to use it as such invariably fall flat and can even lead to people having a negative view of a product. This doesn’t mean that dbt can’t serve marketing objectives, but a long-term commitment to engagement is the only proven method to do this sustainably. + +- People in product roles + The dbt Community can be an invaluable source of feedback on a product. There are two primary ways this can happen — organically (community members proactively suggesting a new feature) and via direct calls for feedback and user research. Immediate calls for engagement must be done in your dedicated #tools channel. Direct calls should be used sparingly, as they can overwhelm more organic discussions and feedback. + +### Who is the audience for an adapter release? + + A new adapter is likely to drive huge community interest from several groups of people: + - People who are currently using the database that the adapter is supporting + - People who may be adopting the database in the near future. + - People who are interested in dbt development in general. + +The database users will be your primary audience and the most helpful in achieving success. Engage them directly in the adapter’s dedicated Slack channel. If one does not exist already, reach out in #channel-requests, and we will get one made for you and include it in an announcement about new channels. + +The final group is where non-slack community engagement becomes important. Twitter and LinkedIn are both great places to interact with a broad audience. A well-orchestrated adapter release can generate impactful and authentic engagement. + +### How to message the initial rollout and follow-up content + +Tell a story that engages dbt users and the community. Highlight new use cases and functionality unlocked by the adapter in a way that will resonate with each segment. + +- Existing users of your technology who are new to dbt + - Provide a general overview of the value dbt will deliver to your users. This can lean on dbt's messaging and talking points which are laid out in the [dbt viewpoint.](/community/resources/viewpoint) + - Give examples of a rollout that speaks to the overall value of dbt and your product. + +- Users who are already familiar with dbt and the community + - Consider unique use cases or advantages your adapter provide over existing adapters. Who will be excited for this? + - Contribute to the dbt Community and ensure that dbt users on your adapter are well supported (tutorial content, packages, documentation, etc). + - Example of a rollout that is compelling for those familiar with dbt: [Firebolt](https://www.linkedin.com/feed/update/urn:li:activity:6879090752459182080/) + +### Tactically manage distribution of content about new or existing adapters + +There are tactical pieces on how and where to share that help ensure success. + +- On slack: + - #i-made-this channel — this channel has a policy against “marketing” and “content marketing” posts, but it should be successful if you write your content with the above guidelines in mind. Even with that, it’s important to post here sparingly. + - Your own database / tool channel — this is where the people who have opted in to receive communications from you and always a great place to share things that are relevant to them. + +- On social media: + - Twitter + - LinkedIn + - Social media posts _from the author_ or an individual connected to the project tend to have better engagement than posts from a company or organization account. 
+ - Ask your partner representative about: + - Retweets and shares from the official dbt Labs accounts. + - Flagging posts internally at dbt Labs to get individual employees to share. + +#### Measuring engagement + +You don’t need 1000 people in a channel to succeed, but you need at least a few active participants who can make it feel lived in. If you’re comfortable working in public, this could be members of your team, or it can be a few people who you know that are highly engaged and would be interested in participating. Having even 2 or 3 regulars hanging out in a channel is all that’s needed for a successful start and is, in fact, much more impactful than 250 people that never post. + +### How to announce a new adapter + +We’d recommend _against_ boilerplate announcements and encourage finding a unique voice. That being said, there are a couple of things that we’d want to include: + +- A summary of the value prop of your database / technology for users who aren’t familiar. +- The personas that might be interested in this news. +- A description of what the adapter _is_. For example: + > With the release of our new dbt adapter, you’ll be able to to use dbt to model and transform your data in [name-of-your-org] +- Particular or unique use cases or functionality unlocked by the adapter. +- Plans for future / ongoing support / development. +- The link to the documentation for using the adapter on the dbt Labs docs site. +- An announcement blog. + +#### Announcing new release versions of existing adapters + +This can vary substantially depending on the nature of the release but a good baseline is the types of release messages that [we put out in the #dbt-releases](https://getdbt.slack.com/archives/C37J8BQEL/p1651242161526509) channel. + +![Full Release Post](/img/adapter-guide/0-full-release-notes.png) + +Breaking this down: + +- Visually distinctive announcement - make it clear this is a release + +- Short written description of what is in the release + +- Links to additional resources + +- Implementation instructions: + +- Future plans + +- Contributor recognition (if applicable) + + + +## Verify a new adapter + +The very first data platform dbt supported was Redshift followed quickly by Postgres (([dbt-core#174](https://github.com/dbt-labs/dbt-core/pull/174)). In 2017, back when dbt Labs (née Fishtown Analytics) was still a data consultancy, we added support for Snowflake and BigQuery. We also turned dbt's database support into an adapter framework ([dbt-core#259](https://github.com/dbt-labs/dbt-core/pull/259/)), and a plugin system a few years later. For years, dbt Labs specialized in those four data platforms and became experts in them. However, the surface area of all possible databases, their respective nuances, and keeping them up-to-date and bug-free is a Herculean and/or Sisyphean task that couldn't be done by a single person or even a single team! Enter the dbt community which enables dbt Core to work on more than 30 different databases (32 as of Sep '22)! + +Free and open-source tools for the data professional are increasingly abundant. This is by-and-large a _good thing_, however it requires due dilligence that wasn't required in a paid-license, closed-source software world. Before taking a dependency on an open-source projet is is important to determine the answer to the following questions: + +1. Does it work? +2. Does it meet my team's specific use case? +3. Does anyone "own" the code, or is anyone liable for ensuring it works? +4. Do bugs get fixed quickly? +5. 
Does it stay up-to-date with new Core features? +6. Is the usage substantial enough to self-sustain? +7. What risks do I take on by taking a dependency on this library? + +These are valid, important questions to answer—especially given that `dbt-core` itself only put out its first stable release (major version v1.0) in December 2021! Indeed, up until now, the majority of new user questions in database-specific channels are some form of: + +- "How mature is `dbt-`? Any gotchas I should be aware of before I start exploring?" +- "has anyone here used `dbt-` for production models?" +- "I've been playing with `dbt-` -- I was able to install and run my initial experiments. I noticed that there are certain features mentioned on the documentation that are marked as 'not ok' or 'not tested'. What are the risks? +I'd love to make a statement on my team to adopt DBT [sic], but I'm pretty sure questions will be asked around the possible limitations of the adapter or if there are other companies out there using dbt [sic] with Oracle DB in production, etc." + +There has been a tendency to trust the dbt Labs-maintained adapters over community- and vendor-supported adapters, but repo ownership is only one among many indicators of software quality. We aim to help our users feel well-informed as to the caliber of an adapter with a new program. + +### Verified by dbt Labs + +The adapter verification program aims to quickly indicate to users which adapters can be trusted to use in production. Previously, doing so was uncharted territory for new users and complicated making the business case to their leadership team. We plan to give quality assurances by: + +1. appointing a key stakeholder for the adapter repository, +2. ensuring that the chosen stakeholder fixes bugs and cuts new releases in a timely manner. Refer to the "Maintaining your new adapter" step for more information. +3. demonstrating that it passes our adapter pytest suite tests, +4. assuring that it works for us internally and ideally an existing team using the adapter in production . + +Every major & minor version of a adapter will be verified internally and given an official :white_check_mark: (custom emoji coming soon), on the ["Supported Data Platforms"](/docs/supported-data-platforms) page. + +### How to get an adapter verified? + +We envision that data platform vendors will be most interested in having their adapter versions verified, however we are open to community adapter verification. If interested, please reach out either to the `partnerships` at `dbtlabs.com` or post in the [#adapter-ecosystem Slack channel](https://getdbt.slack.com/archives/C030A0UF5LM). + +## Build a trusted adapter + +The Trusted adapter program exists to allow adapter maintainers to demonstrate to the dbt community that your adapter is trusted to be used in production. + +### What it means to be trusted + +By opting into the below, you agree to this, and we take you at your word. dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met. + +### Feature Completeness + +To be considered for the Trusted Adapter program, the adapter must cover the essential functionality of dbt Core given below, with best effort given to support the entire feature set. + +Essential functionality includes (but is not limited to the following features): + +- table, view, and seed materializations +- dbt tests + +The adapter should have the required documentation for connecting and configuring the adapter. 
The dbt docs site should be the single source of truth for this information. These docs should be kept up-to-date. + +Proceed to the "Document a new adapter" step for more information. + +### Release Cadence + +Keeping an adapter up-to-date with dbt Core is an integral part of being a trusted adapter. Therefore, we ask that adapter maintainers: + +- Release of new minor versions of the adapter with all tests passing within four weeks of dbt Core's release cut. +- Release of new major versions of the adapter with all tests passing within eight weeks of dbt Core's release cut. + +### Community Responsiveness + +On a best effort basis, active participation and engagement with the dbt Community across the following forums: + +- Being responsive to feedback and supporting user enablement in dbt Community’s Slack workspace +- Responding with comments to issues raised in public dbt adapter code repository +- Merging in code contributions from community members as deemed appropriate + +### Security Practices + +Trusted adapters will not do any of the following: + +- Output to logs or file either access credentials information to or data from the underlying data platform itself. +- Make API calls other than those expressly required for using dbt features (adapters may not add additional logging) +- Obfuscate code and/or functionality so as to avoid detection + +Additionally, to avoid supply-chain attacks: + +- Use an automated service to keep Python dependencies up-to-date (such as Dependabot or similar), +- Publish directly to PyPI from the dbt adapter code repository by using trusted CI/CD process (such as GitHub actions) +- Restrict admin access to both the respective code (GitHub) and package (PyPI) repositories +- Identify and mitigate security vulnerabilities by use of a static code analyzing tool (such as Snyk) as part of a CI/CD process + +### Other considerations + +The adapter repository is: + +- open-souce licensed, +- published to PyPI, and +- automatically tests the codebase against dbt Lab's provided adapter test suite + +### How to get an adapter verified + +Open an issue on the [docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com) using the "Add adapter to Trusted list" template. In addition to contact information, it will ask confirm that you agree to the following. + +1. my adapter meet the guidelines given above +2. I will make best reasonable effort that this continues to be so +3. checkbox: I acknowledge that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the above guidelines not be met. + +The approval workflow is as follows: + +1. create and populate the template-created issue +2. dbt Labs will respond as quickly as possible (maximally four weeks, though likely faster) +3. If approved, dbt Labs will create and merge a Pull request to formally add the adapter to the list. + +### Getting help for my trusted adapter + +Ask your question in #adapter-ecosystem channel of the dbt community Slack. 
diff --git a/website/docs/guides/airflow-and-dbt-cloud.md b/website/docs/guides/airflow-and-dbt-cloud.md new file mode 100644 index 00000000000..a3ff59af14e --- /dev/null +++ b/website/docs/guides/airflow-and-dbt-cloud.md @@ -0,0 +1,296 @@ +--- +title: Airflow and dbt Cloud +id: airflow-and-dbt-cloud +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['dbt Cloud', 'Orchestration'] +level: 'Intermediate' +recently_updated: true +--- + +## Introduction + +In some cases, [Airflow](https://airflow.apache.org/) may be the preferred orchestrator for your organization over working fully within dbt Cloud. There are a few reasons your team might be considering using Airflow to orchestrate your dbt jobs: + +- Your team is already using Airflow to orchestrate other processes +- Your team needs to ensure that a [dbt job](https://docs.getdbt.com/docs/dbt-cloud/cloud-overview#schedule-and-run-dbt-jobs-in-production) kicks off before or after another process outside of dbt Cloud +- Your team needs flexibility to manage more complex scheduling, such as kicking off one dbt job only after another has completed +- Your team wants to own their own orchestration solution +- You need code to work right now without starting from scratch + +### Prerequisites + +- [dbt Cloud Teams or Enterprise account](https://www.getdbt.com/pricing/) (with [admin access](https://docs.getdbt.com/docs/cloud/manage-access/enterprise-permissions)) in order to create a service token. Permissions for service tokens can be found [here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens). +- A [free Docker account](https://hub.docker.com/signup) in order to sign in to Docker Desktop, which will be installed in the initial setup. +- A local digital scratchpad for temporarily copy-pasting API keys and URLs + +### Airflow + dbt Core + +There are [so many great examples](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py) from GitLab through their open source data engineering work. This is especially appropriate if you are well-versed in Kubernetes, CI/CD, and docker task management when building your airflow pipelines. If this is you and your team, you’re in good hands reading through more details [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/). + +### Airflow + dbt Cloud API w/Custom Scripts + +This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available [here](https://registry.astronomer.io/providers/dbt%20Cloud/versions/latest). 
+ +There are many different permutations of this over time: + +- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an airflow DAG based on [custom python API utilities](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py) +- [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want +- For more options, check out the [official dbt Docs](/docs/deploy/deployments#airflow) on the various ways teams are running dbt in airflow + +These solutions are great, but can be difficult to trust as your team grows and management for things like: testing, job definitions, secrets, and pipelines increase past your team’s capacity. Roles become blurry (or were never clearly defined at the start!). Both data and analytics engineers start digging through custom logging within each other’s workflows to make heads or tails of where and what the issue really is. Not to mention that when the issue is found, it can be even harder to decide on the best path forward for safely implementing fixes. This complex workflow and unclear delineation on process management results in a lot of misunderstandings and wasted time just trying to get the process to work smoothly! + + +In this guide, you'll learn how to: + +1. Creating a working local Airflow environment +2. Invoking a dbt Cloud job with Airflow (with proof!) +3. Reusing tested and trusted Airflow code for your specific use cases + +You’ll also gain a better understanding of how this will: + +- Reduce the cognitive load when building and maintaining pipelines +- Avoid dependency hell (think: `pip install` conflicts) +- Implement better recoveries from failures +- Define clearer workflows so that data and analytics engineers work better, together ♥️ + + +🙌 Let’s get started! 🙌 + +## Install the Astro CLI + +Astro is a managed software service that includes key features for teams working with Airflow. In order to use Astro, we’ll install the Astro CLI, which will give us access to useful commands for working with Airflow locally. You can read more about Astro [here](https://docs.astronomer.io/astro/). + +In this example, we’re using Homebrew to install Astro CLI. Follow the instructions to install the Astro CLI for your own operating system [here](https://docs.astronomer.io/astro/install-cli). + +```bash +brew install astro +``` + + + +## Install and start Docker Desktop + +Docker allows us to spin up an environment with all the apps and dependencies we need for the example. + +Follow the instructions [here](https://docs.docker.com/desktop/) to install Docker desktop for your own operating system. Once Docker is installed, ensure you have it up and running for the next steps. + + + +## Clone the airflow-dbt-cloud repository + +Open your terminal and clone the [airflow-dbt-cloud repository](https://github.com/sungchun12/airflow-dbt-cloud.git). This contains example Airflow DAGs that you’ll use to orchestrate your dbt Cloud job. Once cloned, navigate into the `airflow-dbt-cloud` project. + +```bash +git clone https://github.com/sungchun12/airflow-dbt-cloud.git +cd airflow-dbt-cloud +``` + + + +## Start the Docker container + +You can initialize an Astronomer project in an empty local directory using a Docker container, and then run your project locally using the `start` command. + +1. 
Run the following commands to initialize your project and start your local Airflow deployment: + + ```bash + astro dev init + astro dev start + ``` + + When this finishes, you should see a message similar to the following: + + ```bash + Airflow is starting up! This might take a few minutes… + + Project is running! All components are now available. + + Airflow Webserver: http://localhost:8080 + Postgres Database: localhost:5432/postgres + The default Airflow UI credentials are: admin:admin + The default Postrgres DB credentials are: postgres:postgres + ``` + +2. Open the Airflow interface. Launch your web browser and navigate to the address for the **Airflow Webserver** from your output in Step 1. + + This will take you to your local instance of Airflow. You’ll need to log in with the **default credentials**: + + - Username: admin + - Password: admin + + ![Airflow login screen](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-login.png) + + + +## Create a dbt Cloud service token + +Create a service token from within dbt Cloud using the instructions [found here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Ensure that you save a copy of the token, as you won’t be able to access this later. In this example we use `Account Admin`, but you can also use `Job Admin` instead for token permissions. + + + +## Create a dbt Cloud job + +In your dbt Cloud account create a job, paying special attention to the information in the bullets below. Additional information for creating a dbt Cloud job can be found [here](/guides/bigquery). + +- Configure the job with the commands that you want to include when this job kicks off, as Airflow will be referring to the job’s configurations for this rather than being explicitly coded in the Airflow DAG. This job will run a set of commands rather than a single command. +- Ensure that the schedule is turned **off** since we’ll be using Airflow to kick things off. +- Once you hit `save` on the job, make sure you copy the URL and save it for referencing later. The url will look similar to this: + +```html +https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/ +``` + + + +## Add your dbt Cloud API token as a secure connection + + + +Now you have all the working pieces to get up and running with Airflow + dbt Cloud. Let’s dive into make this all work together. We will **set up a connection** and **run a DAG in Airflow** that kicks off a dbt Cloud job. + +1. Navigate to Admin and click on **Connections** + + ![Airflow connections menu](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-connections-menu.png) + +2. Click on the `+` sign to add a new connection, then click on the drop down to search for the dbt Cloud Connection Type + + ![Create connection](/img/guides/orchestration/airflow-and-dbt-cloud/create-connection.png) + + ![Connection type](/img/guides/orchestration/airflow-and-dbt-cloud/connection-type.png) + +3. Add in your connection details and your default dbt Cloud account id. 
This is found in your dbt Cloud URL after the accounts route section (`/accounts/{YOUR_ACCOUNT_ID}`), for example the account with id 16173 would see this in their URL: `https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/` + +![https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2](https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2) + +## Add your `job_id` and `account_id` config details to the python file + + Add your `job_id` and `account_id` config details to the python file: [dbt_cloud_provider_eltml.py](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/dags/dbt_cloud_provider_eltml.py). + +1. You’ll find these details within the dbt Cloud job URL, see the comments in the code snippet below for an example. + + ```python + # dbt Cloud Job URL: https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/ + # account_id: 16173 + #job_id: 65767 + + # line 28 + default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173}, + + trigger_dbt_cloud_job_run = DbtCloudRunJobOperator( + task_id="trigger_dbt_cloud_job_run", + job_id=65767, # line 39 + check_interval=10, + timeout=300, + ) + ``` + +2. Turn on the DAG and verify the job succeeded after running. Note: screenshots taken from different job runs, but the user experience is consistent. + + ![https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy](https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy) + + ![Airflow DAG](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-dag.png) + + ![Task run instance](/img/guides/orchestration/airflow-and-dbt-cloud/task-run-instance.png) + + ![https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS](https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS) + +## How do I rerun the dbt Cloud job and downstream tasks in my pipeline? + +If you have worked with dbt Cloud before, you have likely encountered cases where a job fails. In those cases, you have likely logged into dbt Cloud, investigated the error, and then manually restarted the job. + +This section of the guide will show you how to restart the job directly from Airflow. This will specifically run *just* the `trigger_dbt_cloud_job_run` and downstream tasks of the Airflow DAG and not the entire DAG. If only the transformation step fails, you don’t need to re-run the extract and load processes. Let’s jump into how to do that in Airflow. + +1. Click on the task + + ![Task DAG view](/img/guides/orchestration/airflow-and-dbt-cloud/task-dag-view.png) + +2. Clear the task instance + + ![Clear task instance](/img/guides/orchestration/airflow-and-dbt-cloud/clear-task-instance.png) + + ![Approve clearing](/img/guides/orchestration/airflow-and-dbt-cloud/approve-clearing.png) + +3. 
Watch it rerun in real time + + ![Re-run](/img/guides/orchestration/airflow-and-dbt-cloud/re-run.png) + +## Cleaning up + +At the end of this guide, make sure you shut down your docker container. When you’re done using Airflow, use the following command to stop the container: + +```bash +$ astrocloud dev stop + +[+] Running 3/3 + ⠿ Container airflow-dbt-cloud_e3fe3c-webserver-1 Stopped 7.5s + ⠿ Container airflow-dbt-cloud_e3fe3c-scheduler-1 Stopped 3.3s + ⠿ Container airflow-dbt-cloud_e3fe3c-postgres-1 Stopped 0.3s +``` + +To verify that the deployment has stopped, use the following command: + +```bash +astrocloud dev ps +``` + +This should give you an output like this: + +```bash +Name State Ports +airflow-dbt-cloud_e3fe3c-webserver-1 exited +airflow-dbt-cloud_e3fe3c-scheduler-1 exited +airflow-dbt-cloud_e3fe3c-postgres-1 exited +``` + + + +## Frequently asked questions + +### How can we run specific subsections of the dbt DAG in Airflow? + +Because of the way we configured the dbt Cloud job to run in Airflow, you can leave this job to your analytics engineers to define in the job configurations from dbt Cloud. If, for example, we need to run hourly-tagged models every hour and daily-tagged models daily, we can create jobs like `Hourly Run` or `Daily Run` and utilize the commands `dbt run -s tag:hourly` and `dbt run -s tag:daily` within each, respectively. We only need to grab our dbt Cloud `account` and `job id`, configure it in an Airflow DAG with the code provided, and then we can be on your way. See more node selection options: [here](/reference/node-selection/syntax) + +### How can I re-run models from the point of failure? + +You may want to parse the dbt DAG in Airflow to get the benefit of re-running from the point of failure. However, when you have hundreds of models in your DAG expanded out, it becomes useless for diagnosis and rerunning due to the overhead that comes along with creating an expansive Airflow DAG. + +You can’t re-run from failure natively in dbt Cloud today (feature coming!), but you can use a custom rerun parser. + +Using a simple python script coupled with the dbt Cloud provider, you can: + +- Avoid managing artifacts in a separate storage bucket(dbt Cloud does this for you) +- Avoid building your own parsing logic +- Get clear logs on what models you're rerunning in dbt Cloud (without hard coding step override commands) + +Watch the video below to see how it works! + + + +### Should Airflow run one big dbt job or many dbt jobs? + +Overall we recommend being as purposeful and minimalistic as you can. This is because dbt manages all of the dependencies between models and the orchestration of running those dependencies in order, which in turn has benefits in terms of warehouse processing efforts. + +### We want to kick off our dbt jobs after our ingestion tool (such as Fivetran) / data pipelines are done loading data. Any best practices around that? + +Our friends at Astronomer answer this question with this example: [here](https://registry.astronomer.io/dags/fivetran-dbt-cloud-census) + +### How do you set up a CI/CD workflow with Airflow? + +Check out these two resources for accomplishing your own CI/CD pipeline: + +- [Continuous Integration with dbt Cloud](/docs/deploy/continuous-integration) +- [Astronomer's CI/CD Example](https://docs.astronomer.io/software/ci-cd/#example-cicd-workflow) + +### Can dbt dynamically create tasks in the DAG like Airflow can? + +We prefer to keep models bundled vs. unbundled. 
You can go this route, but if you have hundreds of dbt models, it’s more effective to let the dbt Cloud job handle the models and dependencies. Bundling provides the solution to clear observability when things go wrong - we've seen more success in having the ability to clearly see issues in a bundled dbt Cloud job than combing through the nodes of an expansive Airflow DAG. If you still have a use case for this level of control though, our friends at Astronomer answer this question [here](https://www.astronomer.io/blog/airflow-dbt-1/)! + +### Can you trigger notifications if a dbt job fails with Airflow? Is there any way to access the status of the dbt Job to do that? + +Yes, either through [Airflow's email/slack](https://www.astronomer.io/guides/error-notifications-in-airflow/) functionality by itself or combined with [dbt Cloud's notifications](/docs/deploy/job-notifications), which support email and slack notifications. + +### Are there decision criteria for how to best work with dbt Cloud and airflow? + +Check out this deep dive into planning your dbt Cloud + Airflow implementation [here](https://www.youtube.com/watch?v=n7IIThR8hGk)! diff --git a/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md b/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md deleted file mode 100644 index 844c895af98..00000000000 --- a/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "dbt Cloud environment best practices" -id: 1-env-guide-overview -description: Learn how to configure environments in dbt Cloud. -displayText: "dbt Cloud environment best practices" -hoverSnippet: Learn how to configure environments in dbt Cloud. ---- - -> *How do I manage environments in my dbt Cloud project? How many do I need?* -> -> *How does my structure map to environments in dbt Cloud?* -> -> *What do git branches have to do with my dbt Cloud environments?* -> - -If these questions keep you up at night, you’ve come to the right place! When it comes to managing your dbt Cloud environments, there is not a one-size-fits-all solution for all teams. In this guide we’ll walk you through a few environment architecture options for dbt Cloud that we’d recommend, and hopefully you find an option that works for you. - -## Learning goals - -This guide has three main goals: - -- Provide our recommendations on managing dbt Cloud environments -- Illustrate these recommendations with comprehensive examples -- At each stage, explain *why* we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs - -:::info -☁️ This guide focuses on architecture for **dbt Cloud**. However, similar principles apply for developers using dbt Core. Before diving into this guide we recommend taking a look at our **[dbt Cloud environments](/docs/dbt-cloud-environments)** page for more context. - -::: - -### How many environments do I really need? - -Environments define the way that dbt will execute your code, including: - -- The **version of dbt** that will run. -- The **version of your code** to be executed. -- The **connection information** for your warehouse. -- In dbt Cloud, there are **two types of environments:** - - **Development** — the environment settings in which you work in the IDE on a development branch. - - **Deployment** — the environment settings in which a dbt Cloud job runs. 
- -In this guide, we’re going to focus on **deployment environments**, which determine how your project is executed when a **dbt Cloud job executes**. When using both approaches, make sure to designate one environment as "Production." This will allow you to use features such as dbt Explorer and cross-project references. Refer to [Set product environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) for details. - -Depending on your git workflow and testing strategy, you'll be choosing between one deployment environment or many deployment environments. We provide a high-level overview of how these two deployment strategies work here, but use each section of this guide to get a deep-dive into how these setups differ. - -| Setup option | Works well if you | Relative complexity level | -| --- | --- | --- | -| One deployment environment | - only scheduled runs for one set of data objects
      - development branches are merged directly to main | Low | -| Many deployment environments | - feature branches move through several promotion stages | High | - -### TL;DR — One deployment environment - -We usually recommended folks start with the basics; having one deployment environment is usually the simplest and most maintainable approach to start. This approach works well if: - -- You only need to have **scheduled jobs running in a single environment** within your data warehouse. -- You use a **single primary branch** and follow a direct promotion (**Dev —> Prod**) strategy - -With this option, your production jobs and your [Slim CI jobs](/docs/deploy/continuous-integration) that ensure code integrity are managed within one single deployment environment. - -### TL;DR — Many deployment environments -This approach adds a bit more complexity and may slow down the development process, but adds a layer of security that can be worth the tradeoff. This approach works well if: - -- Your organization maintains **several long-lived git branches** to control how and when changes are tested and promoted to production. - - Some orgs follow a **Dev —> QA —> Prod release cycle** — if that sounds like your org, this approach is probably right for you. -- The **output of your dbt project is an input to other systems** and you need to test and validate many changes on a stable, long-lived staging dataset in a pre-production environment. - -The two options are explored in more detail in the following sections, including the benefits, trade-offs, the steps required to implement the setup in dbt Cloud. diff --git a/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md b/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md deleted file mode 100644 index 89bb05e7c75..00000000000 --- a/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: "One deployment environment" -id: 2-one-deployment-environment -description: Learn how to configure a single deployment environment setup in dbt Cloud. -displayText: "dbt Cloud environment best practices" -hoverSnippet: Learn how to configure a single deployment environment setup in dbt Cloud. ---- -import ExpNote from '/snippets/_explorer-beta-note.md'; - - -## What this looks like - -1. You have a **single *development* environment** where dbt users can access the dbt Cloud IDE and make changes to their code on feature branches created off of your default branch in your repository (most often the `main` branch). -2. You have a **single *deployment* environment** (let’s call it “Production”) where your scheduled jobs run referencing the `main` branch.
      - - - -3. You also have a [**Slim CI job**](/docs/deploy/continuous-integration) that kicks off anytime you open a PR to merge a feature branch into `main`. This Slim CI job can run in your dbt “Production” environment. - -:::info - -☁️ Slim CI jobs run in a dedicated custom schema for each PR, so there will no collision with your production schemas. - -::: - - - -### Git workflow - - - - -1. In the dbt Cloud IDE, developers work on feature branches, created from the `main` branch (`feature_a`, `feature_b`, `feature_c` above) -2. When code is ready, developer opens a PR to merge feature branch into `main` -3. [**Slim CI Job**](/docs/deploy/continuous-integration) automatically kicks off, and tests the changes made in the PR -4. When Slim CI Job is successful and team is ready to deploy changes to Production, the PR is merged directly into the `main` branch. The next time a production job runs, these changes will be incorporated and executed. - -### dbt Cloud setup - -1. Create your [**development environment**](/docs/dbt-cloud-environments) to power the dbt Cloud IDE. No extra customization needed! -2. Create your **[production deployment environment](/docs/deploy/deploy-environments)**. -3. Define your **dbt Cloud jobs** in the production deployment environment from step 2. - 1. **Production job(s)**: You will need to set up **at least one scheduled job** that deploys your project to your production databases/schemas. You may create multiple jobs based on your business SLAs. - 2. **Slim CI Job**: Unlike the production jobs, which are triggered via the scheduler, this job will be triggered when PRs are opened in your repository. Refer to [Slim CI jobs](/docs/deploy/slim-ci-jobs) for details. - - -### When this works well - -This approach is recommended for most use cases because it enables you to quickly and safely implement code changes in the production environment. It also gives developers the confidence to trust and rely on these changes. With this option, multiple developers can easily contribute to and collaborate on the same codebase with confidence. - -:::info -💡 Check out [Sunrun's Coalesce 2022 talk](https://www.youtube.com/watch?v=vmBAO2XN-fM) on Automating CI/CD in dbt Cloud, where they simplified their CI/CD process from several long-lived branches to a single long-lived main branch with feature branches. - -::: - -### When this doesn’t work so well - -- You have a **formal QA process** before merging code into production. -- You want to **control when features are released** to production. -- You need to have scheduled **jobs running in many environments** due to dependencies on outside systems. - - e.g. Your organization has many applications that consume and test data changes in a lower non-Production environment before changes should be promoted to Production. diff --git a/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md b/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md deleted file mode 100644 index cb882d4ac1b..00000000000 --- a/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md +++ /dev/null @@ -1,77 +0,0 @@ ---- -title: "Many deployment environments" -id: 3-many-deployment-environments -description: Learn how to configure a many deployment environment setup in dbt Cloud. -displayText: "dbt Cloud environment best practices" -hoverSnippet: Learn how to configure a many deployment environment setup in dbt Cloud. 
---- -import ExpNote from '/snippets/_explorer-beta-note.md'; - -## What this looks like - -1. You have a **single *development* environment** where dbt users can access the dbt Cloud IDE and make changes to their code. However, you’ll want to update the **[custom branch settings](faqs/Environments/custom-branch-settings)** to ensure that developers create feature branches off of the a non-production branch. For this example, we’ll refer to this as the `qa` branch. -2. You have a **QA deployment environment**, running scheduled jobs from the `qa` branch that deploys your dbt project to a pre-production warehouse location. -3. You have a **Production deployment environment,** running scheduled jobs from the `main` branch that deploys your dbt project to your production warehouse location.
      - - - -4. You have **multiple Slim CI jobs** (one in each deployment environment) to ensure changes to each branch are tested. - - - -### Git workflow - - - -1. In the dbt Cloud IDE, developers work on feature branches, **created from the `qa` branch** (`feature_a`, `feature_b`, `feature_c` above). -2. When code is ready, developer opens a PR to merge feature branch into `qa`. -3. The **first Slim CI Job** automatically kicks off to test the changes introduced in the PR. This job will *defer to a regularly-scheduled job in the QA environment* and run in the QA deployment environment. -4. When **Slim CI Job is successful** and team is ready to deploy changes, the **PR is merged into `qa`.** -5. Scheduled jobs run in the QA deployment environment, running on `qa` branch to ensure the new changes work as intended. -6. When **all feature branches** for a given release (e.g. sprint) have been **successfully merged** to `qa` and are **running without error** in the QA deployment environment, a team member opens a **PR to merge `qa` → `main`.** -7. The **second Slim CI Job** automatically kicks off to test changes in PR. This job will *defer to a regularly-scheduled job in the Production environment* and run in the Production deployment environment. -8. When **second Slim CI Job** is successful and team is ready to deploy changes, the **PR is merged into `main`**. -9. Monitor scheduled jobs in the Production deployment environment that are running on `main` branch. Voila! All changes are released and ready for your stakeholders. - -:::info -💡 Considering a different branching strategy that involves cherry picking? [Maybe reconsider!](https://docs.getdbt.com/blog/the-case-against-git-cherry-picking) - -::: - -### dbt Cloud setup - -1. Create your [**development environment**](/docs/dbt-cloud-environments) to power the dbt Cloud IDE. - - Here, we’ll set a **custom branch** so that users in the IDE create their feature branches from `qa` instead of `main`. Click **Only run on a custom branch** in **General settings**, enter `qa` into **Custom Branch.** - -2. Set up your **QA [deployment environment](/docs/deploy/deploy-environments)** - - Here, we’ll apply the same custom branch settings as the development environment in Step 1. All scheduled jobs in the QA deployment environment will use the code from the `qa` branch during execution. - -3. **Define QA jobs** - 1. **QA job(s)**: You’ll want to create at least one scheduled job, running on a roughly daily cadence. This will allow us to make sure all the code executes without error before you release it to production, and will also power the first Slim CI job. - 2. **Slim CI Job**: As above, this job will be triggered when PRs are opened in your repository. Enable this option by selecting **Run on Pull Requests?** under the **Continuous Integration(CI)** tab under the **Triggers** section. Since we’re using the custom branch setting in the QA environment, you'll also want to be sure to select the second option **Run only on Custom Branch** (selected by default) — this means that only PRs created against the `qa` branch will trigger this job, rather than any PR at all. - - This job will also need to defer to one of the QA jobs created in step 3a. This enables the use of the `state` modifier in your selection syntax to only run changes introduced by your PR. - -4. 
Set up your **Production [deployment environment](/docs/deploy/deploy-environments)** - - Here, we’ll *also* use the same custom branch settings as the other environments, but set the custom branch as `main`. Even thought the `main` branch is the default, setting this value enables us to properly set up the CI Job in the next step. - -5. **Define production jobs** - 1. **Production job(s)**: You will need to set up at least one scheduled job that deploys your project to your production databases/schemas. You may create multiple jobs based on your business SLAs. - 2. **Production Slim CI Job**: As above, this job will be triggered when PRs are opened in your repository. Enable this option by selecting **Run on Pull Requests?** under the **Continuous Integration(CI)** tab under the **Triggers** section. Since we’re using the custom branch setting in the QA environment, we’ll also want to select the second option **Run only on Custom Branch** — this means that only PRs created against the `main` branch will trigger this job, rather than any PR at all. - - This job will also need to defer to one of the QA jobs created in step 5a. This enables the use of the `state` modifier in your selection syntax to only run changes introduced by your PR. - -### When this works well - -This approach works well when it’s critical to **apply user acceptance and integration testing to your project in a pre-production environment**. This approach allows you to have scheduled jobs running in **many environments** on your data warehouse. - -### When this doesn’t work so well - -This approach may slow down the time it takes to get new feature into production, since it requires additional steps in the deployment process and additional branches to maintain. Keep in mind that adding complexity to your deployment process might cause some slowdown in your release cycle. - -## Conclusion - -While there’s no single correct answer to how to setup your dbt Cloud environments, they are flexible enough to enable just about any code promotion workflow your organization uses. We would love to hear how you’ve set up your deployment infrastructure in dbt Cloud! diff --git a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md b/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md deleted file mode 100644 index 22f8e36190a..00000000000 --- a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Now it's your turn -id: 6-how-we-style-conclusion ---- - -## BYO Styles - -Now that you've seen how we style our dbt projects, it's time to build your own. Feel free to copy this guide and use it as a template for your own project. If you do, we'd love to hear about it! Reach out to us on [the Community Forum](https://discourse.getdbt.com/c/show-and-tell/22) or [Slack](https://www.getdbt.com/community) to share your style guide. We recommend co-locating your style guide with your code to make sure contributors can easily follow it. If you're using GitHub, you can add your style guide to your repository's wiki, or include it in your README. - -## Pre-commit hooks - -Lastly, to ensure your style guide's automated rules are being followed without additional mental overhead to your team, you can use [pre-commit hooks](https://pre-commit.com/) to automatically check your code for style violations (and often fix them automagically) before it's committed. 
This is a great way to make sure your style guide is followed by all contributors. We recommend implementing this once you've settled on and published your style guide, and your codebase is conforming to it. This will ensure that all future commits follow the style guide. You can find an excellent set of open source pre-commit hooks for dbt from the community [here in the dbt-checkpoint project](https://github.com/dbt-checkpoint/dbt-checkpoint). diff --git a/website/docs/quickstarts/bigquery-qs.md b/website/docs/guides/bigquery-qs.md similarity index 98% rename from website/docs/quickstarts/bigquery-qs.md rename to website/docs/guides/bigquery-qs.md index 84e3b3ae545..c1f632f0621 100644 --- a/website/docs/quickstarts/bigquery-qs.md +++ b/website/docs/guides/bigquery-qs.md @@ -1,10 +1,12 @@ --- title: "Quickstart for dbt Cloud and BigQuery" id: "bigquery" -time_to_complete: '30 minutes' -platform: 'dbt-cloud' +# time_to_complete: '30 minutes' commenting out until we test +level: 'Beginner' icon: 'bigquery' hide_table_of_contents: true +tags: ['BigQuery', 'dbt Cloud','Quickstart'] +recently_updated: true --- ## Introduction @@ -33,8 +35,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen ### Related content - Learn more with [dbt Courses](https://courses.getdbt.com/collections) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) @@ -73,7 +75,6 @@ In order to let dbt connect to your warehouse, you'll need to generate a keyfile 1. Start the [GCP credentials wizard](https://console.cloud.google.com/apis/credentials/wizard). Make sure your new project is selected in the header. If you do not see your account or project, click your profile picture to the right and verify you are using the correct email account. For **Credential Type**: - From the **Select an API** dropdown, choose **BigQuery API** - Select **Application data** for the type of data you will be accessing - - Select **No, I’m not using them** and click **Next**. - Click **Next** to create a new service account. 2. Create a service account for your new project from the [Service accounts page](https://console.cloud.google.com/projectselector2/iam-admin/serviceaccounts?supportedpurview=project). For more information, refer to [Create a service account](https://developers.google.com/workspace/guides/create-credentials#create_a_service_account) in the Google Cloud docs. As an example for this guide, you can: - Type `dbt-user` as the **Service account name** diff --git a/website/docs/guides/legacy/building-packages.md b/website/docs/guides/building-packages.md similarity index 88% rename from website/docs/guides/legacy/building-packages.md rename to website/docs/guides/building-packages.md index 2a6803334d4..641a1c6af6d 100644 --- a/website/docs/guides/legacy/building-packages.md +++ b/website/docs/guides/building-packages.md @@ -1,26 +1,38 @@ --- -title: "Building a dbt package" # to do: update this to creating -id: "building-packages" +title: Building dbt packages +id: building-packages +description: "When you have dbt code that might help others, you can create a package for dbt using a GitHub repository." +displayText: Building dbt packages +hoverSnippet: Learn how to create packages for dbt. 
+# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['dbt Core'] +level: 'Advanced' +recently_updated: true --- -## Assumed knowledge -This article assumes you are familiar with: +## Introduction + +Creating packages is an **advanced use of dbt**. If you're new to the tool, we recommend that you first use the product for your own analytics before attempting to create a package for others. + +### Prerequisites + +A strong understanding of: - [packages](/docs/build/packages) - administering a repository on GitHub - [semantic versioning](https://semver.org/) -Heads up — developing a package is an **advanced use of dbt**. If you're new to the tool, we recommend that you first use the product for your own company's analytics before attempting to create a package. - -## 1. Assess whether a package is the right solution +### Assess whether a package is the right solution Packages typically contain either: - macros that solve a particular analytics engineering problem — for example, [auditing the results of a query](https://hub.getdbt.com/dbt-labs/audit_helper/latest/), [generating code](https://hub.getdbt.com/dbt-labs/codegen/latest/), or [adding additional schema tests to a dbt project](https://hub.getdbt.com/calogica/dbt_expectations/latest/). - models for a common dataset — for example a dataset for software products like [MailChimp](https://hub.getdbt.com/fivetran/mailchimp/latest/) or [Snowplow](https://hub.getdbt.com/dbt-labs/snowplow/latest/), or even models for metadata about your data stack like [Snowflake query spend](https://hub.getdbt.com/gitlabhq/snowflake_spend/latest/) and [the artifacts produced by `dbt run`](https://hub.getdbt.com/tailsdotcom/dbt_artifacts/latest/). In general, there should be a shared set of industry-standard metrics that you can model (e.g. email open rate). Packages are _not_ a good fit for sharing models that contain business-specific logic, for example, writing code for marketing attribution, or monthly recurring revenue. Instead, consider sharing a blog post and a link to a sample repo, rather than bundling this code as a package (here's our blog post on [marketing attribution](https://blog.getdbt.com/modeling-marketing-attribution/) as an example). -## 2. Create your new project -:::note Using the CLI for package development -We tend to use the CLI for package development. The development workflow often involves installing a local copy of your package in another dbt project — at present dbt Cloud is not designed for this workflow. +## Create your new project +:::note Using the command line for package development +We tend to use the command line interface for package development. The development workflow often involves installing a local copy of your package in another dbt project — at present dbt Cloud is not designed for this workflow. ::: 1. Use the [dbt init](/reference/commands/init) command to create a new dbt project, which will be your package: @@ -33,15 +45,15 @@ $ dbt init [package_name] ¹Currently, our package registry only supports packages that are hosted in GitHub. -## 3. Develop your package +## Develop your package We recommend that first-time package authors first develop macros and models for use in their own dbt project. Once your new package is created, you can get to work on moving them across, implementing some additional package-specific design patterns along the way. 
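A note on the local-development workflow described just below: the usual pattern is to create a separate scratch dbt project and install your in-progress package into it through that project's `packages.yml`. A minimal sketch — the package name and relative path here are placeholders, not part of the guide:

```yaml
# packages.yml of a separate scratch/test project (not the package itself)
packages:
  - local: ../my_new_package   # placeholder: relative path to your package's directory
```

Running `dbt deps` in that scratch project then pulls in your local package so you can exercise its macros and models as you iterate.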
When working on your package, we often find it useful to install a local copy of the package in another dbt project — this workflow is described [here](https://discourse.getdbt.com/t/contributing-to-an-external-dbt-package/657). -### Follow our best practices +### Follow best practices _Modeling packages only_ -Use our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md), our article on [how we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), and our [best practices](best-practices) for all of our advice on how to build your dbt project. +Use our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md), our article on [how we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview), and our [best practices](best-practices) for all of our advice on how to build your dbt project. This is where it comes in especially handy to have worked on your own dbt project previously. @@ -103,7 +115,7 @@ Over time, we've developed a set of useful GitHub artifacts that make administer - Descriptions of the main models included in the package ([example](https://github.com/dbt-labs/snowplow)) - GitHub templates, including PR templates and issue templates ([example](https://github.com/dbt-labs/dbt-audit-helper/tree/master/.github)) -## 4. Add integration tests +## Add integration tests _Optional_ We recommend that you implement integration tests to confirm that the package works as expected — this is an even _more_ advanced step, so you may find that you build up to this. @@ -125,7 +137,7 @@ packages: -4. Add resources to the package (seeds, models, tests) so that you can successfully run your project, and compare the output with what you expect. The exact appraoch here will vary depending on your packages. In general you will find that you need to: +4. Add resources to the package (seeds, models, tests) so that you can successfully run your project, and compare the output with what you expect. The exact approach here will vary depending on your packages. In general you will find that you need to: - Add mock data via a [seed](/docs/build/seeds) with a few sample (anonymized) records. Configure the `integration_tests` project to point to the seeds instead of raw data tables. - Add more seeds that represent the expected output of your models, and use the [dbt_utils.equality](https://github.com/dbt-labs/dbt-utils#equality-source) test to confirm the output of your package, and the expected output matches. @@ -134,7 +146,7 @@ packages: 5. (Optional) Use a CI tool, like CircleCI or GitHub Actions, to automate running your dbt project when you open a new Pull Request. For inspiration, check out one of our [CircleCI configs](https://github.com/dbt-labs/snowplow/blob/main/.circleci/config.yml), which runs tests against our four main warehouses. Note: this is an advanced step — if you are going down this path, you may find it useful to say hi on [dbt Slack](https://community.getdbt.com/). -## 5. Deploy the docs for your package +## Deploy the docs for your package _Optional_ A dbt docs site can help a prospective user of your package understand the code you've written. As such, we recommend that you deploy the site generated by `dbt docs generate` and link to the deployed site from your package. @@ -147,12 +159,13 @@ The easiest way we've found to do this is to use [GitHub Pages](https://pages.gi 4. 
Enable GitHub pages on the repo in the settings tab, and point it to the “docs” subdirectory 4. GitHub should then deploy the docs at `.github.io/`, like so: [fivetran.github.io/dbt_ad_reporting](https://fivetran.github.io/dbt_ad_reporting/) -## 6. Release your package +## Release your package Create a new [release](https://docs.github.com/en/github/administering-a-repository/managing-releases-in-a-repository) once you are ready for others to use your work! Be sure to use [semantic versioning](https://semver.org/) when naming your release. In particular, if new changes will cause errors for users of earlier versions of the package, be sure to use _at least_ a minor release (e.g. go from `0.1.1` to `0.2.0`). The release notes should contain an overview of the changes introduced in the new version. Be sure to call out any changes that break the existing interface! -## 7. Add the package to hub.getdbt.com +## Add the package to hub.getdbt.com + Our package registry, [hub.getdbt.com](https://hub.getdbt.com/), gets updated by the [hubcap script](https://github.com/dbt-labs/hubcap). To add your package to hub.getdbt.com, create a PR on the [hubcap repository](https://github.com/dbt-labs/hubcap) to include it in the `hub.json` file. diff --git a/website/docs/quickstarts/codespace-qs.md b/website/docs/guides/codespace-qs.md similarity index 93% rename from website/docs/quickstarts/codespace-qs.md rename to website/docs/guides/codespace-qs.md index 3cd048c97a4..7712ed8f8e8 100644 --- a/website/docs/quickstarts/codespace-qs.md +++ b/website/docs/guides/codespace-qs.md @@ -1,9 +1,11 @@ --- -title: "Quickstart for dbt Core using GitHub Codespaces" +title: Quickstart for dbt Core using GitHub Codespaces id: codespace platform: 'dbt-core' icon: 'fa-github' +level: 'Beginner' hide_table_of_contents: true +tags: ['dbt Core','Quickstart'] --- ## Introduction @@ -19,10 +21,10 @@ dbt Labs provides a [GitHub Codespace](https://docs.github.com/en/codespaces/ove ## Related content -- [Create a GitHub repository](/quickstarts/manual-install?step=2) -- [Build your first models](/quickstarts/manual-install?step=3) -- [Test and document your project](/quickstarts/manual-install?step=4) -- [Schedule a job](/quickstarts/manual-install?step=5) +- [Create a GitHub repository](/guides/manual-install?step=2) +- [Build your first models](/guides/manual-install?step=3) +- [Test and document your project](/guides/manual-install?step=4) +- [Schedule a job](/guides/manual-install?step=5) - Learn more with [dbt Courses](https://courses.getdbt.com/collections) ## Create a codespace diff --git a/website/docs/guides/advanced/creating-new-materializations.md b/website/docs/guides/create-new-materializations.md similarity index 95% rename from website/docs/guides/advanced/creating-new-materializations.md rename to website/docs/guides/create-new-materializations.md index d3081ea8e20..1ad7d202de6 100644 --- a/website/docs/guides/advanced/creating-new-materializations.md +++ b/website/docs/guides/create-new-materializations.md @@ -1,12 +1,18 @@ --- -title: "Creating new materializations" -id: "creating-new-materializations" +title: "Create new materializations" +id: create-new-materializations description: Learn how to create your own materializations. displayText: Creating new materializations hoverSnippet: Learn how to create your own materializations. 
+# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['dbt Core'] +level: 'Advanced' +recently_updated: true --- -## Overview +## Introduction The model materializations you're familiar with, `table`, `view`, and `incremental` are implemented as macros in a package that's distributed along with dbt. You can check out the [source code for these materializations](https://github.com/dbt-labs/dbt-core/tree/main/core/dbt/include/global_project/macros/materializations). If you need to create your own materializations, reading these files is a good place to start. Continue reading below for a deep-dive into dbt materializations. @@ -110,13 +116,6 @@ Be sure to `commit` the transaction in the `cleanup` phase of the materializatio ### Update the Relation cache - -:::info New in 0.15.0 - -The ability to synchronize the Relation cache is new in dbt v0.15.0 - -::: - Materializations should [return](/reference/dbt-jinja-functions/return) the list of Relations that they have created at the end of execution. dbt will use this list of Relations to update the relation cache in order to reduce the number of queries executed against the database's `information_schema`. If a list of Relations is not returned, then dbt will raise a Deprecation Warning and infer the created relation from the model's configured database, schema, and alias. @@ -172,13 +171,6 @@ For more information on the `config` dbt Jinja function, see the [config](/refer ## Materialization precedence - -:::info New in 0.15.1 - -The materialization resolution order was poorly defined in versions of dbt prior to 0.15.1. Please use this guide for versions of dbt greater than or equal to 0.15.1. - -::: - dbt will pick the materialization macro in the following order (lower takes priority): 1. global project - default diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md b/website/docs/guides/custom-cicd-pipelines.md similarity index 57% rename from website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md rename to website/docs/guides/custom-cicd-pipelines.md index d618f9eec64..672c6e6dab8 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md +++ b/website/docs/guides/custom-cicd-pipelines.md @@ -1,12 +1,63 @@ --- -title: Run a dbt Cloud job on merge -id: 3-dbt-cloud-job-on-merge +title: Customizing CI/CD with custom pipelines +id: custom-cicd-pipelines +description: "Learn the benefits of version-controlled analytics code and custom pipelines in dbt for enhanced code testing and workflow automation during the development process." +displayText: Learn version-controlled code, custom pipelines, and enhanced code testing. +hoverSnippet: Learn version-controlled code, custom pipelines, and enhanced code testing. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['dbt Cloud', 'Orchestration', 'CI'] +level: 'Intermediate' +recently_updated: true --- -This job will take a bit more to setup, but is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts presented here can be generalized and used in whatever way best suits your use case. +## Introduction + +One of the core tenets of dbt is that analytic code should be version controlled. This provides a ton of benefit to your organization in terms of collaboration, code consistency, stability, and the ability to roll back to a prior version. 
There’s an additional benefit that is provided with your code hosting platform that is often overlooked or underutilized. Some of you may have experience using dbt Cloud’s [webhook functionality](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) to run a job when a PR is created. This is a fantastic capability, and meets most use cases for testing your code before merging to production. However, there are circumstances when an organization needs additional functionality, like running workflows on every commit (linting), or running workflows after a merge is complete. In this article, we will show you how to setup custom pipelines to lint your project and trigger a dbt Cloud job via the API. + +A note on parlance in this article since each code hosting platform uses different terms for similar concepts. The terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch. + + +### What are pipelines? + +Pipelines (which are known by many names, such as workflows, actions, or build steps) are a series of pre-defined jobs that are triggered by specific events in your repository (PR created, commit pushed, branch merged, etc). Those jobs can do pretty much anything your heart desires assuming you have the proper security access and coding chops. + +Jobs are executed on [runners](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#runners), which are virtual servers. The runners come pre-configured with Ubuntu Linux, macOS, or Windows. That means the commands you execute are determined by the operating system of your runner. You’ll see how this comes into play later in the setup, but for now just remember that your code is executed on virtual servers that are, typically, hosted by the code hosting platform. + +![Diagram of how pipelines work](/img/guides/orchestration/custom-cicd-pipelines/pipeline-diagram.png) + +Please note, runners hosted by your code hosting platform provide a certain amount of free time. After that, billing charges may apply depending on how your account is setup. You also have the ability to host your own runners. That is beyond the scope of this article, but checkout the links below for more information if you’re interested in setting that up: + +- Repo-hosted runner billing information: + - [GitHub](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions) + - [GitLab](https://docs.gitlab.com/ee/ci/pipelines/cicd_minutes.html) + - [Bitbucket](https://bitbucket.org/product/features/pipelines#) +- Self-hosted runner information: + - [GitHub](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners) + - [GitLab](https://docs.gitlab.com/runner/) + - [Bitbucket](https://support.atlassian.com/bitbucket-cloud/docs/runners/) + +Additionally, if you’re using the free tier of GitLab you can still follow this guide, but it may ask you to provide a credit card to verify your account. You’ll see something like this the first time you try to run a pipeline: + +![Warning from GitLab showing payment information is required](/img/guides/orchestration/custom-cicd-pipelines/gitlab-cicd-payment-warning.png) -The setup below shows how to call the dbt Cloud API to run a job every time there is a push to your main branch (The branch where pull requests are typically merged. Commonly referred to as the main, primary, or master branch, but can be named differently). 
+### How to setup pipelines + +This guide provides details for multiple code hosting platforms. Where steps are unique, they are presented without a selection option. If code is specific to a platform (i.e. GitHub, GitLab, Bitbucket) you will see a selection option for each. + +Pipelines can be triggered by various events. The [dbt Cloud webhook](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) process already triggers a run if you want to run your jobs on a merge request, so this guide focuses on running pipelines for every push and when PRs are merged. Since pushes happen frequently in a project, we’ll keep this job super simple and fast by linting with SQLFluff. The pipeline that runs on merge requests will run less frequently, and can be used to call the dbt Cloud API to trigger a specific job. This can be helpful if you have specific requirements that need to happen when code is updated in production, like running a `--full-refresh` on all impacted incremental models. + +Here’s a quick look at what this pipeline will accomplish: + +![Diagram showing the pipelines to be created and the programs involved](/img/guides/orchestration/custom-cicd-pipelines/pipeline-programs-diagram.png) + +## Run a dbt Cloud job on merge + +This job will take a bit more to setup, but is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts presented here can be generalized and used in whatever way best suits your use case. + +The setup below shows how to call the dbt Cloud API to run a job every time there's a push to your main branch (The branch where pull requests are typically merged. Commonly referred to as the main, primary, or master branch, but can be named differently). ### 1. Get your dbt Cloud API key @@ -28,7 +79,7 @@ Here’s a video showing the steps as well: ### 2. Put your dbt Cloud API key into your repo -This next part will happen in you code hosting platform. We need to save your API key from above into a repository secret so the job we create can access it. It is **not** recommended to ever save passwords or API keys in your code, so this step ensures that your key stays secure, but is still usable for your pipelines. +This next part will happen in you code hosting platform. We need to save your API key from above into a repository secret so the job we create can access it. It is **not** recommended to ever save passwords or API keys in your code, so this step ensures that your key stays secure, but is still usable for your pipelines. -In GitHub: - - Open up your repository where you want to run the pipeline (the same one that houses your dbt project) - Click *Settings* to open up the repository options - On the left click the *Security* dropdown - From that list, click on *Actions* - Towards the middle of the screen, click the *New repository secret* button - It will ask you for a name, so let’s call ours `DBT_API_KEY` - - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** + - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** - In the *Value* section, paste in the key you copied from dbt Cloud - Click *Add secret* and you’re all set! 
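With the `DBT_API_KEY` secret saved, the workflow that runs on pushes to `main` can authenticate against the dbt Cloud API. The guide's full GitHub Actions workflow file isn't reproduced in this diff, so the following is only a minimal illustrative sketch (not the guide's exact file — the guide's version calls a Python helper script that also monitors the run); `<account_id>` and `<job_id>` are placeholders taken from your dbt Cloud job URL:

```yaml
# Minimal sketch only — assumes the DBT_API_KEY repository secret created above.
name: Run dbt Cloud job on merge

on:
  push:
    branches:
      - main   # the branch your PRs are merged into

jobs:
  trigger-dbt-cloud-job:
    runs-on: ubuntu-latest
    steps:
      - name: Trigger dbt Cloud job run via API
        run: |
          # dbt Cloud API v2 job-trigger endpoint; a "cause" is required in the request body.
          curl -fsS -X POST \
            -H "Authorization: Token ${{ secrets.DBT_API_KEY }}" \
            -H "Content-Type: application/json" \
            -d '{"cause": "Triggered by GitHub Actions on merge to main"}' \
            "https://cloud.getdbt.com/api/v2/accounts/<account_id>/jobs/<job_id>/run/"
```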
@@ -62,23 +111,21 @@ Here’s a video showing these steps: -In GitLab: - - Open up your repository where you want to run the pipeline (the same one that houses your dbt project) - Click *Settings* > *CI/CD* - Under the *Variables* section, click *Expand,* then click *Add variable* - It will ask you for a name, so let’s call ours `DBT_API_KEY` - - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** + - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** - In the *Value* section, paste in the key you copied from dbt Cloud - Make sure the check box next to *Protect variable* is unchecked, and the box next to *Mask variable* is selected (see below) - - “Protected” means that the variable is only available in pipelines that run on protected branches or protected tags - that won’t work for us because we want to run this pipeline on multiple branches. “Masked” means that it will be available to your pipeline runner, but will be masked in the logs. - + - “Protected” means that the variable is only available in pipelines that run on protected branches or protected tags - that won’t work for us because we want to run this pipeline on multiple branches. “Masked” means that it will be available to your pipeline runner, but will be masked in the logs. + ![View of the GitLab window for entering DBT_API_KEY](/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-gitlab.png) - + Here’s a video showing these steps: - + - + @@ -91,7 +138,7 @@ In Azure: - Select *Starter pipeline* (this will be updated later in Step 4) - Click on *Variables* and then *New variable* - In the *Name* field, enter the `DBT_API_KEY` - - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** + - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** - In the *Value* section, paste in the key you copied from dbt Cloud - Make sure the check box next to *Keep this value secret* is checked. This will mask the value in logs, and you won't be able to see the value for the variable in the UI. - Click *OK* and then *Save* to save the variable @@ -99,7 +146,7 @@ In Azure: - + In Bitbucket: @@ -108,16 +155,16 @@ In Bitbucket: - In the left menu, click *Repository Settings* - Scroll to the bottom of the left menu, and select *Repository variables* - In the *Name* field, input `DBT_API_KEY` - - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** + - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** - In the *Value* section, paste in the key you copied from dbt Cloud - Make sure the check box next to *Secured* is checked. This will mask the value in logs, and you won't be able to see the value for the variable in the UI. - Click *Add* to save the variable - + ![View of the Bitbucket window for entering DBT_API_KEY](/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-bitbucket.png) - + Here’s a video showing these steps: - + @@ -304,13 +351,12 @@ run-dbt-cloud-job: - For this new job, open the existing Azure pipeline you created above and select the *Edit* button. We'll want to edit the corresponding Azure pipeline YAML file with the appropriate configuration, instead of the starter code, along with including a `variables` section to pass in the required variables. 
-Copy the below YAML file into your Azure pipeline and update the variables below to match your setup based on the comments in the file. It's worth noting that we changed the `trigger` section so that it will run **only** when there are pushes to a branch named `main` (like a PR merged to your main branch). +Copy the below YAML file into your Azure pipeline and update the variables below to match your setup based on the comments in the file. It's worth noting that we changed the `trigger` section so that it will run **only** when there are pushes to a branch named `main` (like a PR merged to your main branch). Read through [Azure's docs](https://learn.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops) on these filters for additional use cases. @@ -406,13 +452,12 @@ pipelines: - ### 5. Test your new action -Now that you have a shiny new action, it’s time to test it out! Since this change is setup to only run on merges to your default branch, you’ll need to create and merge this change into your main branch. Once you do that, you’ll see a new pipeline job has been triggered to run the dbt Cloud job you assigned in the variables section. +Now that you have a shiny new action, it’s time to test it out! Since this change is setup to only run on merges to your default branch, you’ll need to create and merge this change into your main branch. Once you do that, you’ll see a new pipeline job has been triggered to run the dbt Cloud job you assigned in the variables section. Additionally, you’ll see the job in the run history of dbt Cloud. It should be fairly easy to spot because it will say it was triggered by the API, and the *INFO* section will have the branch you used for this guide. @@ -454,3 +499,140 @@ Additionally, you’ll see the job in the run history of dbt Cloud. It should be + +## Run a dbt Cloud job on pull request + +If your git provider is not one with a native integration with dbt Cloud, but you still want to take advantage of CI builds, you've come to the right spot! With just a bit of work it's possible to setup a job that will run a dbt Cloud job when a pull request (PR) is created. + +:::info Run on PR + +If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/ci-jobs). +This section is only for those projects that connect to their git repository using an SSH key. + +::: + +The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-5 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)** + +### 1. Create a pipeline job that runs when PRs are created + + + + +For this job, we'll set it up using the `bitbucket-pipelines.yml` file as in the prior step. The YAML file will look pretty similar to our earlier job, but we’ll pass in the required variables to the Python script using `export` statements. Update this section to match your setup based on the comments in the file. + +**What is this pipeline going to do?** +The setup below will trigger a dbt Cloud job to run every time a PR is opened in this repository. It will also run a fresh version of the pipeline for every commit that is made on the PR until it is merged. +For example: If you open a PR, it will run the pipeline. If you then decide additional changes are needed, and commit/push to the PR branch, a new pipeline will run with the updated code. 
+ +The following varibles control this job: + +- `DBT_JOB_BRANCH`: Tells the dbt Cloud job to run the code in the branch that created this PR +- `DBT_JOB_SCHEMA_OVERRIDE`: Tells the dbt Cloud job to run this into a custom target schema + - The format of this will look like: `DBT_CLOUD_PR_{REPO_KEY}_{PR_NUMBER}` + +```yaml +image: python:3.11.1 + + +pipelines: + # This job will run when pull requests are created in the repository + pull-requests: + '**': + - step: + name: 'Run dbt Cloud PR Job' + script: + # Check to only build if PR destination is master (or other branch). + # Comment or remove line below if you want to run on all PR's regardless of destination branch. + - if [ "${BITBUCKET_PR_DESTINATION_BRANCH}" != "main" ]; then printf 'PR Destination is not master, exiting.'; exit; fi + - export DBT_URL="https://cloud.getdbt.com" + - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job" + - export DBT_JOB_BRANCH=$BITBUCKET_BRANCH + - export DBT_JOB_SCHEMA_OVERRIDE="DBT_CLOUD_PR_"$BITBUCKET_PROJECT_KEY"_"$BITBUCKET_PR_ID + - export DBT_ACCOUNT_ID=00000 # enter your account id here + - export DBT_PROJECT_ID=00000 # enter your project id here + - export DBT_PR_JOB_ID=00000 # enter your job id here + - python python/run_and_monitor_dbt_job.py +``` + + + + +### 2. Confirm the pipeline runs + +Now that you have a new pipeline, it's time to run it and make sure it works. Since this only triggers when a PR is created, you'll need to create a new PR on a branch that contains the code above. Once you do that, you should see a pipeline that looks like this: + + + + +Bitbucket pipeline: +![dbt run on PR job in Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png) + +dbt Cloud job: +![dbt Cloud job showing it was triggered by Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png) + + + + +### 3. Handle those extra schemas in your database + +As noted above, when the PR job runs it will create a new schema based on the PR. To avoid having your database overwhelmed with PR schemas, consider adding a "cleanup" job to your dbt Cloud account. This job can run on a scheduled basis to cleanup any PR schemas that haven't been updated/used recently. + +Add this as a macro to your project. 
It takes 2 arguments that lets you control which schema get dropped: + +- `age_in_days`: The number of days since the schema was last altered before it should be dropped (default 10 days) +- `database_to_clean`: The name of the database to remove schemas from + +```sql +{# + This macro finds PR schemas older than a set date and drops them + The macro defaults to 10 days old, but can be configured with the input argument age_in_days + Sample usage with different date: + dbt run-operation pr_schema_cleanup --args "{'database_to_clean': 'analytics','age_in_days':'15'}" +#} +{% macro pr_schema_cleanup(database_to_clean, age_in_days=10) %} + + {% set find_old_schemas %} + select + 'drop schema {{ database_to_clean }}.'||schema_name||';' + from {{ database_to_clean }}.information_schema.schemata + where + catalog_name = '{{ database_to_clean | upper }}' + and schema_name ilike 'DBT_CLOUD_PR%' + and last_altered <= (current_date() - interval '{{ age_in_days }} days') + {% endset %} + + {% if execute %} + + {{ log('Schema drop statements:' ,True) }} + + {% set schema_drop_list = run_query(find_old_schemas).columns[0].values() %} + + {% for schema_to_drop in schema_drop_list %} + {% do run_query(schema_to_drop) %} + {{ log(schema_to_drop ,True) }} + {% endfor %} + + {% endif %} + +{% endmacro %} +``` + +This macro goes into a dbt Cloud job that is run on a schedule. The command will look like this (text below for copy/paste): +![dbt Cloud job showing the run operation command for the cleanup macro](/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png) +`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"` + +## Consider risk of conflicts when using multiple orchestration tools + +Running dbt Cloud jobs through a CI/CD pipeline is a form of job orchestration. If you also run jobs using dbt Cloud’s built in scheduler, you now have 2 orchestration tools running jobs. The risk with this is that you could run into conflicts - you can imagine a case where you are triggering a pipeline on certain actions and running scheduled jobs in dbt Cloud, you would probably run into job clashes. The more tools you have, the more you have to make sure everything talks to each other. + +That being said, if **the only reason you want to use pipelines is for adding a lint check or run on merge**, you might decide the pros outweigh the cons, and as such you want to go with a hybrid approach. Just keep in mind that if two processes try and run the same job at the same time, dbt Cloud will queue the jobs and run one after the other. It’s a balancing act but can be accomplished with diligence to ensure you’re orchestrating jobs in a manner that does not conflict. 
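For reference, the pipelines in this guide delegate the trigger-and-poll work to a helper script (`python/run_and_monitor_dbt_job.py` in the examples above), which isn't reproduced in this diff. A simplified sketch of the same idea — assuming the script reads the environment variables exported by the pipelines, and leaving out extras the real helper supports such as the branch and schema-override settings — could look like:

```python
import os
import time

import requests  # requires the `requests` package in the runner image

# Environment variables exported by the pipeline examples above.
BASE_URL = os.environ.get("DBT_URL", "https://cloud.getdbt.com")
API_KEY = os.environ["DBT_API_KEY"]        # repository secret/variable
ACCOUNT_ID = os.environ["DBT_ACCOUNT_ID"]
JOB_ID = os.environ["DBT_PR_JOB_ID"]
HEADERS = {"Authorization": f"Token {API_KEY}"}

# Trigger the job run (dbt Cloud API v2) and capture the run id.
response = requests.post(
    f"{BASE_URL}/api/v2/accounts/{ACCOUNT_ID}/jobs/{JOB_ID}/run/",
    headers=HEADERS,
    json={"cause": os.environ.get("DBT_JOB_CAUSE", "CI/CD pipeline")},
)
response.raise_for_status()
run_id = response.json()["data"]["id"]

# Poll until the run reaches a terminal status: 10 = success, 20 = error, 30 = cancelled.
while True:
    run = requests.get(
        f"{BASE_URL}/api/v2/accounts/{ACCOUNT_ID}/runs/{run_id}/",
        headers=HEADERS,
    ).json()["data"]
    if run["status"] in (10, 20, 30):
        break
    time.sleep(30)

if run["status"] != 10:
    raise SystemExit(f"dbt Cloud run {run_id} failed with status {run['status']}")
print(f"dbt Cloud run {run_id} succeeded")
```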
diff --git a/website/docs/quickstarts/databricks-qs.md b/website/docs/guides/databricks-qs.md similarity index 98% rename from website/docs/quickstarts/databricks-qs.md rename to website/docs/guides/databricks-qs.md index 1222ef2a7d5..5a0c5536e7f 100644 --- a/website/docs/quickstarts/databricks-qs.md +++ b/website/docs/guides/databricks-qs.md @@ -1,9 +1,11 @@ --- title: "Quickstart for dbt Cloud and Databricks" id: "databricks" -platform: 'dbt-cloud' +level: 'Beginner' icon: 'databricks' hide_table_of_contents: true +recently_updated: true +tags: ['dbt Cloud', 'Quickstart','Databricks'] --- ## Introduction @@ -30,8 +32,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen ### Related content - Learn more with [dbt Courses](https://courses.getdbt.com/collections) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md b/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md deleted file mode 100644 index 0959dbee707..00000000000 --- a/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: "What are adapters? Why do we need them?" -id: "1-what-are-adapters" ---- - -Adapters are an essential component of dbt. At their most basic level, they are how dbt Core connects with the various supported data platforms. At a higher-level, dbt Core adapters strive to give analytics engineers more transferrable skills as well as standardize how analytics projects are structured. Gone are the days where you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt Core. - - Navigating and developing around the nuances of different databases can be daunting, but you are not alone. Visit [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel for additional help beyond the documentation. - -## All databases are not the same - -There's a tremendous amount of work that goes into creating a database. Here is a high-level list of typical database layers (from the outermost layer moving inwards): -- SQL API -- Client Library / Driver -- Server Connection Manager -- Query parser -- Query optimizer -- Runtime -- Storage Access Layer -- Storage - -There's a lot more there than just SQL as a language. Databases (and data warehouses) are so popular because you can abstract away a great deal of the complexity from your brain to the database itself. This enables you to focus more on the data. - -dbt allows for further abstraction and standardization of the outermost layers of a database (SQL API, client library, connection manager) into a framework that both: - - Opens database technology to less technical users (a large swath of a DBA's role has been automated, similar to how the vast majority of folks with websites today no longer have to be "[webmasters](https://en.wikipedia.org/wiki/Webmaster)"). - - Enables more meaningful conversations about how data warehousing should be done. - -This is where dbt adapters become critical. - -## What needs to be adapted? 
-
-dbt adapters are responsible for _adapting_ dbt's standard functionality to a particular database. Our prototypical database and adapter are PostgreSQL and dbt-postgres, and most of our adapters are somewhat based on the functionality described in dbt-postgres.
-
-Connecting dbt to a new database will require a new adapter to be built or an existing adapter to be extended.
-
-The outermost layers of a database map roughly to the areas in which the dbt adapter framework encapsulates inter-database differences.
-
-### SQL API
-
-Even amongst ANSI-compliant databases, there are differences in the SQL grammar.
-Here are some categories and examples of SQL statements that can be constructed differently:
-
-| Category | Area of differences | Examples |
-|----------|---------------------|----------|
-| Statement syntax | The use of `IF EXISTS` | • `IF EXISTS, DROP TABLE` <br/> • `DROP IF EXISTS` |
-| Workflow definition & semantics | Incremental updates | • `MERGE` <br/> • `DELETE; INSERT` |
-| Relation and column attributes/configuration | Database-specific materialization configs | • `DIST = ROUND_ROBIN` (Synapse) <br/> • `DIST = EVEN` (Redshift) |
-| Permissioning | Grant statements that can only take one grantee at a time vs those that accept lists of grantees | • `grant SELECT on table dinner.corn to corn_kid, everyone` <br/> • `grant SELECT on table dinner.corn to corn_kid; grant SELECT on table dinner.corn to everyone` |
-
-### Python Client Library & Connection Manager
-
-The other big category of inter-database differences comes with how the client connects to the database and executes queries against the connection. To integrate with dbt, a data platform must have a pre-existing Python client library or support ODBC, using a generic Python library like pyodbc.
-
-| Category | Area of differences | Examples |
-|----------|---------------------|----------|
-| Credentials & authentication | Authentication | • Username & password <br/> • MFA with `boto3` or Okta token |
-| Connection opening/closing | Create a new connection to db | • `psycopg2.connect(connection_string)` <br/> • `google.cloud.bigquery.Client(...)` |
-| Inserting local data | Load seed `.csv` files into Python memory | • `google.cloud.bigquery.Client.load_table_from_file(...)` (BigQuery) <br/> • `INSERT ... INTO VALUES ...` prepared statement (most other databases) |
-
-## How dbt encapsulates and abstracts these differences
-
-Differences between databases are encoded into discrete areas:
-
-| Components | Code Path | Function |
-|------------|-----------|----------|
-| Python Classes | `adapters/` | Configuration (see [Python Classes](#python-classes) below) |
-| Macros | `include//macros/adapters/` | SQL API & statement syntax (for example, how to create schema or how to get table info) |
-| Materializations | `include//macros/materializations/` | Table/view/snapshot/ workflow definitions |
-
-### Python Classes
-
-These classes implement all the methods responsible for:
-- Connecting to a database and issuing queries.
-- Providing dbt with database-specific configuration information.
-
-| Class | Description |
-|-------|-------------|
-| AdapterClass | High-level configuration type conversion and any database-specific python methods needed |
-| AdapterCredentials | Typed dictionary of possible profiles and associated methods |
-| AdapterConnectionManager | All the methods responsible for connecting to a database and issuing queries |
-| AdapterRelation | How relation names should be rendered, printed, and quoted. Do relation names use all three parts? `catalog.model_name` (two-part name) or `database.schema.model_name` (three-part name) |
-| AdapterColumn | How names should be rendered, and database-specific properties |
-
-### Macros
-
-A set of *macros* responsible for generating SQL that is compliant with the target database.
-
-### Materializations
-
-A set of *materializations* and their corresponding helper macros defined in dbt using jinja and SQL. They codify for dbt how model files should be persisted into the database.
-
-## Adapter Architecture
-
-Below is a diagram of how dbt-postgres, the adapter at the center of dbt-core, works.
-
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md
deleted file mode 100644
index 28cd8935937..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md
+++ /dev/null
@@ -1,52 +0,0 @@
----
-title: "Prerequisites for a new adapter"
-id: "2-prerequisites-for-a-new-adapter"
----
-
-To learn what an adapter is and the role adapters serve, see [What are adapters?](1-what-are-adapters)
-
-It is very important to make sure that you have the right skills, and to understand the level of difficulty required to make an adapter for your data platform.
-
-## Pre-Requisite Data Warehouse Features
-
-The more you can answer Yes to the below questions, the easier your adapter development (and user) experience will be. See the [New Adapter Information Sheet wiki](https://github.com/dbt-labs/dbt-core/wiki/New-Adapter-Information-Sheet) for even more specific questions.
-
-### Training
-- The developer (and any product managers) ideally will have substantial experience as an end-user of dbt. If not, it is highly advised that you at least take the [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) and [Advanced Materializations](https://courses.getdbt.com/courses/advanced-materializations) courses.
-
-### Database
-- Does the database complete transactions fast enough for interactive development?
-- Can you execute SQL against the data platform?
-- Is there a concept of schemas?
-- Does the data platform support ANSI SQL, or at least a subset?
-
-### Driver / Connection Library
-- Is there a Python-based driver for interacting with the database that is DB API 2.0 compliant (e.g. Psycopg2 for Postgres, pyodbc for SQL Server)?
-- Does it support: prepared statements, multiple statements, or single sign-on token authorization to the data platform?
-
-### Open source software
-- Does your organization have an established process for publishing open source software?
-
-It is easiest to build an adapter for dbt when the data platform in question has:
-- a conventional ANSI-SQL interface (or as close to it as possible),
-- a mature connection library/SDK that uses ODBC or the Python DB API 2.0, and
-- a way to enable developers to iterate rapidly with both quick reads and writes
-
-## Maintaining your new adapter
-
-When your adapter becomes more popular, and people start using it, you may quickly become the maintainer of an increasingly popular open source project. With this new role come some unexpected responsibilities that include not only code maintenance, but also working with a community of users and contributors. To help people understand what to expect of your project, you should communicate your intentions early and often in your adapter documentation or README. Answer questions like: Is this experimental work that people should use at their own risk? Or is this production-grade code that you're committed to maintaining into the future?
-
-### Keeping the code compatible with dbt Core
-
-New minor version releases of `dbt-core` may include changes to the Python interface for adapter plugins, as well as new or updated test cases. The maintainers of `dbt-core` will clearly communicate these changes in documentation and release notes, and they will aim for backwards compatibility whenever possible.
-
-Patch releases of `dbt-core` will _not_ include breaking changes to adapter-facing code. For more details, see ["About dbt Core versions"](/docs/dbt-versions/core).
-
-### Versioning and releasing your adapter
-
-We strongly encourage you to adopt the following approach when versioning and releasing your plugin:
-- The minor version of your plugin should match the minor version in `dbt-core` (e.g. 1.1.x).
-- Aim to release a new version of your plugin for each new minor version of `dbt-core` (once every three months).
-- While your plugin is new, and you're iterating on features, aim to offer backwards compatibility and deprecation notices for at least one minor version. As your plugin matures, aim to leave backwards compatibility and deprecation notices in place until the next major version (dbt Core v2).
-- Release patch versions of your plugins whenever needed. These patch releases should contain fixes _only_.
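To make the version pairing above concrete, here is a minimal, hypothetical sketch of how an adapter plugin might pin `dbt-core` in its `setup.py`. The package name, version numbers, and pin style are illustrative assumptions, not requirements of dbt:

```python
# setup.py -- illustrative sketch only; names, versions, and pins are hypothetical
from setuptools import find_namespace_packages, setup

setup(
    name="dbt-myadapter",
    version="1.5.0",  # minor version tracks the dbt-core 1.5.x series, per the guidance above
    packages=find_namespace_packages(include=["dbt", "dbt.*"]),
    install_requires=[
        "dbt-core~=1.5.0",  # compatible-release pin to the matching dbt-core minor version
    ],
)
```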
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md
deleted file mode 100644
index 43826ca4b1d..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md
+++ /dev/null
@@ -1,416 +0,0 @@
----
-title: "Building a new adapter"
-id: "3-building-a-new-adapter"
----
-
-:::tip
-Before you build your adapter, we strongly encourage you to first learn dbt as an end user, learn [what an adapter is and the role they serve](1-what-are-adapters), as well as [data platform prerequisites](2-prerequisites-for-a-new-adapter)
-:::
-
-This guide will walk you through creating the necessary adapter classes and macros, and provide some resources to help you validate that your new adapter is working correctly. Once the adapter is passing most of the functional tests (see ["Testing a new adapter"](4-testing-a-new-adapter)), please let the community know that it is available to use by adding the adapter to the ["Supported Data Platforms"](/docs/supported-data-platforms) page, following the steps given in [Documenting your adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter).
-
-For any questions you may have, don't hesitate to ask in the [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel. The community is very helpful and has likely run into many of the same issues you will encounter.
-
-## Scaffolding a new adapter
- To create a new adapter plugin from scratch, you can use the [dbt-database-adapter-scaffold](https://github.com/dbt-labs/dbt-database-adapter-scaffold) to trigger an interactive session which will generate a scaffolding for you to build upon.
-
- Example usage:
-
- ```
- $ cookiecutter gh:dbt-labs/dbt-database-adapter-scaffold
- ```
-
-The generated boilerplate starting project will include a basic adapter plugin file structure, examples of macros, high-level method descriptions, etc.
-
-One of the most important choices you will make during the cookiecutter generation revolves around the `is_sql_adapter` field, a boolean used to correctly apply imports for either a `SQLAdapter` or a `BaseAdapter`. Knowing which you will need requires a deeper knowledge of your selected database, but a few good guides for the choice are:
-- Does your database have a complete SQL API? Can it perform tasks using SQL, such as creating schemas, dropping schemas, and querying an `information_schema` for metadata calls? If so, it is more likely to be a SQLAdapter, where you set `is_sql_adapter` to `True`.
-- Most adapters do fall under SQL adapters, which is why we chose `True` as the default value.
-- It is very possible to build out a fully functional `BaseAdapter`. This will require a little more groundwork as it doesn't come with some prebuilt methods the `SQLAdapter` class provides. See `dbt-bigquery` as a good guide.
-
-## Implementation Details
-
-Regardless of whether you use the cookiecutter template or create the plugin manually, this section will go over each method that is required to be implemented. The table below provides a high-level overview of the classes, methods, and macros you may have to define for your data platform.
-
-| file | component | purpose |
-|------|-----------|---------|
-| `./setup.py` | `setup()` function | adapter meta-data (package name, version, author, homepage, etc) |
-| `myadapter/dbt/adapters/myadapter/__init__.py` | `AdapterPlugin` | bundle all the information below into a dbt plugin |
-| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterCredentials` class | parameters to connect to and configure the database, via the chosen Python driver |
-| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterConnectionManager` class | telling dbt how to interact with the database w.r.t. opening/closing connections, executing queries, and fetching data. Effectively a wrapper around the db API or driver. |
-| `myadapter/dbt/include/myadapter/` | a dbt project of macro "overrides" in the format of "myadapter__" | any differences in SQL syntax for regular db operations will be modified here from the global_project (e.g. "Create Table As Select", "Get all relations in the current schema", etc) |
-| `myadapter/dbt/adapters/myadapter/impl.py` | `MyAdapterConfig` | database- and relation-level configs |
-| `myadapter/dbt/adapters/myadapter/impl.py` | `MyAdapterAdapter` | for changing _how_ dbt performs operations, like macros and other needed Python functionality |
-| `myadapter/dbt/adapters/myadapter/column.py` | `MyAdapterColumn` | for defining database-specific column behavior, such as datatype mappings |
-
-### Editing `setup.py`
-
-Edit the file at `myadapter/setup.py` and fill in the missing information.
-
-You can skip this step if you passed the arguments for `email`, `url`, `author`, and `dependencies` to the cookiecutter template script. If you plan on having nested macro folder structures, you may need to add entries to `package_data` so your macro source files get installed.
-
-### Editing the connection manager
-
-Edit the connection manager at `myadapter/dbt/adapters/myadapter/connections.py`. This file is described in the sections below.
-
-#### The Credentials class
-
-The credentials class defines all of the database-specific credentials (e.g. `username` and `password`) that users will need in the [connection profile](/docs/supported-data-platforms) for your new adapter. Each credentials contract should subclass `dbt.adapters.base.Credentials`, and be implemented as a Python dataclass.
-
-Note that the base class includes required database and schema fields, as dbt uses those values internally.
-
-For example, if your adapter requires a host, integer port, username string, and password string, but host is the only required field, you'd add definitions for those new properties to the class as types, like this:
-
-```python
-from dataclasses import dataclass
-from typing import Optional
-
-from dbt.adapters.base import Credentials
-
-
-@dataclass
-class MyAdapterCredentials(Credentials):
-    host: str
-    port: int = 1337
-    username: Optional[str] = None
-    password: Optional[str] = None
-
-    @property
-    def type(self):
-        return 'myadapter'
-
-    @property
-    def unique_field(self):
-        """
-        Hashed and included in anonymous telemetry to track adapter adoption.
-        Pick a field that can uniquely identify one team/organization building with this adapter
-        """
-        return self.host
-
-    def _connection_keys(self):
-        """
-        List of keys to display in the `dbt debug` output.
-        """
-        return ('host', 'port', 'database', 'username')
-```
-
-There are a few things you can do to make it easier for users when connecting to your database:
-- Be sure to implement the Credentials' `_connection_keys` method shown above. This method will return the keys that should be displayed in the output of the `dbt debug` command. As a general rule, it's good to return all the arguments used in connecting to the actual database except the password (even optional arguments).
-- Create a `profile_template.yml` to enable configuration prompts for a brand-new user setting up a connection profile via the [`dbt init` command](/reference/commands/init). See more details [below](#other-files).
-- You may also want to define an `ALIASES` mapping on your Credentials class to include any config names you want users to be able to use in place of 'database' or 'schema'. For example, if everyone using the MyAdapter database calls their databases "collections", you might do:
-
-```python
-@dataclass
-class MyAdapterCredentials(Credentials):
-    host: str
-    port: int = 1337
-    username: Optional[str] = None
-    password: Optional[str] = None
-
-    ALIASES = {
-        'collection': 'database',
-    }
-```
-
-Then users can use `collection` OR `database` in their `profiles.yml`, `dbt_project.yml`, or `config()` calls to set the database.
-
-#### `ConnectionManager` class methods
-
-Once credentials are configured, you'll need to implement some connection-oriented methods. They are enumerated in the `SQLConnectionManager` docstring, but an overview is also provided here.
-
-**Methods to implement:**
-- `open`
-- `get_response`
-- `cancel`
-- `exception_handler`
-- `standardize_grants_dict`
-
-##### `open(cls, connection)`
-
-`open()` is a classmethod that gets a connection object (which could be in any state, but will have a `Credentials` object with the attributes you defined above) and moves it to the 'open' state.
-
-Generally this means doing the following:
-  - if the connection is open already, log and return it.
-    - If a database needed changes to the underlying connection before re-use, that would happen here
-  - create a connection handle using the underlying database library using the credentials
-    - on success:
-      - set connection.state to `'open'`
-      - set connection.handle to the handle object
-        - this is what must have a `cursor()` method that returns a cursor!
-    - on error:
-      - set connection.state to `'fail'`
-      - set connection.handle to `None`
-      - raise a `dbt.exceptions.FailedToConnectException` with the error and any other relevant information
-
-For example:
-
-```python
-    @classmethod
-    def open(cls, connection):
-        if connection.state == 'open':
-            logger.debug('Connection is already open, skipping open.')
-            return connection
-
-        credentials = connection.credentials
-
-        try:
-            handle = myadapter_library.connect(
-                host=credentials.host,
-                port=credentials.port,
-                username=credentials.username,
-                password=credentials.password,
-                catalog=credentials.database
-            )
-            connection.state = 'open'
-            connection.handle = handle
-            return connection
-        except Exception as exc:
-            # on error: mark the connection as failed and raise a dbt exception
-            connection.state = 'fail'
-            connection.handle = None
-            raise dbt.exceptions.FailedToConnectException(str(exc))
-```
-
-##### `get_response(cls, cursor)`
-
-`get_response` is a classmethod that gets a cursor object and returns adapter-specific information about the last executed command. The return value should be an `AdapterResponse` object that includes items such as `code`, `rows_affected`, `bytes_processed`, and a summary `_message` for logging to stdout.
-
-```python
-    @classmethod
-    def get_response(cls, cursor) -> AdapterResponse:
-        code = cursor.sqlstate or "OK"
-        rows = cursor.rowcount
-        status_message = f"{code} {rows}"
-        return AdapterResponse(
-            _message=status_message,
-            code=code,
-            rows_affected=rows
-        )
-```
-
-##### `cancel(self, connection)`
-
-`cancel` is an instance method that gets a connection object and attempts to cancel any ongoing queries, which is database dependent. Some databases don't support the concept of cancellation; they can simply implement it via `pass`, and their adapter classes should implement an `is_cancelable` that returns `False`. In that case, connections may remain running after Ctrl+C. This method must be implemented carefully, as the affected connection will likely be in use in a different thread.
-
-```python
-    def cancel(self, connection):
-        tid = connection.handle.transaction_id()
-        sql = 'select cancel_transaction({})'.format(tid)
-        logger.debug("Cancelling query '{}' ({})".format(connection.name, tid))
-        _, cursor = self.add_query(sql, 'master')
-        res = cursor.fetchone()
-        logger.debug("Canceled query '{}': {}".format(connection.name, res))
-```
-
-##### `exception_handler(self, sql, connection_name='master')`
-
-`exception_handler` is an instance method that returns a context manager that will handle exceptions raised by running queries, catch them, log appropriately, and then raise exceptions dbt knows how to handle.
-
-If you use the (highly recommended) `@contextmanager` decorator, you only have to wrap a `yield` inside a `try` block, like so:
-
-```python
-    @contextmanager
-    def exception_handler(self, sql: str, connection_name='master'):
-        try:
-            yield
-        except myadapter_library.DatabaseError as exc:
-            self.release(connection_name)
-
-            logger.debug('myadapter error: {}'.format(str(exc)))
-            raise dbt.exceptions.DatabaseException(str(exc))
-        except Exception as exc:
-            logger.debug("Error running SQL: {}".format(sql))
-            logger.debug("Rolling back transaction.")
-            self.release(connection_name)
-            raise dbt.exceptions.RuntimeException(str(exc))
-```
-
-##### `standardize_grants_dict(self, grants_table: agate.Table) -> dict`
-
-`standardize_grants_dict` is a method that returns the dbt-standardized grants dictionary that matches how users configure grants now in dbt. The input is the result of a `SHOW GRANTS ON {{model}}` call, loaded into an agate table.
-
-If any massaging of the agate table containing the results of `SHOW GRANTS ON {{model}}` can't easily be accomplished in SQL, it can be done here. For example, the SQL to show grants *should* filter OUT any grants TO the current user/role (e.g. OWNERSHIP). If that's not possible in SQL, it can be done in this method instead.
-
-```python
-    @available
-    def standardize_grants_dict(self, grants_table: agate.Table) -> dict:
-        """
-        :param grants_table: An agate table containing the query result of
-            the SQL returned by get_show_grant_sql
-        :return: A standardized dictionary matching the `grants` config
-        :rtype: dict
-        """
-        grants_dict: Dict[str, List[str]] = {}
-        for row in grants_table:
-            grantee = row["grantee"]
-            privilege = row["privilege_type"]
-            if privilege in grants_dict.keys():
-                grants_dict[privilege].append(grantee)
-            else:
-                grants_dict.update({privilege: [grantee]})
-        return grants_dict
-```
-
-### Editing the adapter implementation
-
-Edit the adapter implementation at `myadapter/dbt/adapters/myadapter/impl.py`.
-
-Very little is required to implement the adapter itself. On some adapters, you will not need to override anything. On others, you'll likely need to override some of the `convert_*` classmethods, or override the `is_cancelable` classmethod to return `False`.
-
-#### `datenow()`
-
-This classmethod provides the adapter's canonical date function. This is not used but is required anyway on all adapters.
-
-```python
-    @classmethod
-    def date_function(cls):
-        return 'datenow()'
-```
-
-### Editing SQL logic
-
-dbt implements specific SQL operations using jinja macros. While reasonable defaults are provided for many such operations (like `create_schema`, `drop_schema`, `create_table`, etc), you may need to override one or more of these macros when building a new adapter.
-
-#### Required macros
-
-The following macros must be implemented, but you can override their behavior for your adapter using the "dispatch" pattern described below. Macros marked (required) do not have a valid default implementation, and are required for dbt to operate.
- -- `alter_column_type` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L37-L55)) -- `check_schema_exists` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L43-L55)) -- `create_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L1-L9)) -- `drop_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L34-L42)) -- `drop_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L12-L20)) -- `get_columns_in_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L1-L8)) (required) -- `list_relations_without_caching` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L58-L65)) (required) -- `list_schemas` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L29-L40)) -- `rename_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L56-L65)) -- `truncate_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L45-L53)) -- `current_timestamp` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/freshness.sql#L1-L8)) (required) -- `copy_grants` - -#### Adapter dispatch - -Most modern databases support a majority of the standard SQL spec. There are some databases that _do not_ support critical aspects of the SQL spec however, or they provide their own nonstandard mechanisms for implementing the same functionality. To account for these variations in SQL support, dbt provides a mechanism called [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch) for macros. With this feature, macros can be overridden for specific adapters. This makes it possible to implement high-level methods (like "create ") in a database-specific way. - - - -```jinja2 - -{# dbt will call this macro by name, providing any arguments #} -{% macro create_table_as(temporary, relation, sql) -%} - - {# dbt will dispatch the macro call to the relevant macro #} - {{ return( - adapter.dispatch('create_table_as')(temporary, relation, sql) - ) }} -{%- endmacro %} - - - -{# If no macro matches the specified adapter, "default" will be used #} -{% macro default__create_table_as(temporary, relation, sql) -%} - ... -{%- endmacro %} - - - -{# Example which defines special logic for Redshift #} -{% macro redshift__create_table_as(temporary, relation, sql) -%} - ... -{%- endmacro %} - - - -{# Example which defines special logic for BigQuery #} -{% macro bigquery__create_table_as(temporary, relation, sql) -%} - ... 
-{%- endmacro %}
-```
-
-The `adapter.dispatch()` macro takes a second argument, `packages`, which represents a set of "search namespaces" in which to find potential implementations of a dispatched macro. This allows users of community-supported adapters to extend or "shim" dispatched macros from common packages, such as `dbt-utils`, with adapter-specific versions in their own project or other installed packages. See:
-- "Shim" package examples: [`spark-utils`](https://github.com/dbt-labs/spark-utils), [`tsql-utils`](https://github.com/dbt-msft/tsql-utils)
-- [`adapter.dispatch` docs](/reference/dbt-jinja-functions/dispatch)
-
-#### Overriding adapter methods
-
-While much of dbt's adapter-specific functionality can be modified in adapter macros, it can also make sense to override adapter methods directly. In this example, assume that a database does not support a `cascade` parameter to `drop schema`. Instead, we can implement an approximation where we drop each relation and then drop the schema.
-
-```python
-    def drop_schema(self, relation: BaseRelation):
-        relations = self.list_relations(
-            database=relation.database,
-            schema=relation.schema
-        )
-        # drop each relation individually, since `cascade` is unavailable,
-        # then drop the (now empty) schema itself
-        for rel in relations:
-            self.drop_relation(rel)
-        super().drop_schema(relation)
-```
-
-#### Grants Macros
-
-See [this GitHub discussion](https://github.com/dbt-labs/dbt-core/discussions/5468) for information on the macros required for `GRANT` statements.
-
-### Other files
-
-#### `profile_template.yml`
-
-In order to enable the [`dbt init` command](/reference/commands/init) to prompt users when setting up a new project and connection profile, you should include a **profile template**. The filepath needs to be `dbt/include//profile_template.yml`. It's possible to provide hints, default values, and conditional prompts based on connection methods that require different supporting attributes. Users will also be able to include custom versions of this file in their own projects, with fixed values specific to their organization, to support their colleagues when using your dbt adapter for the first time.
-
-See examples:
-- [dbt-postgres](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml)
-- [dbt-redshift](https://github.com/dbt-labs/dbt-redshift/blob/main/dbt/include/redshift/profile_template.yml)
-- [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/include/snowflake/profile_template.yml)
-- [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery/blob/main/dbt/include/bigquery/profile_template.yml)
-
-#### `__version__.py`
-
-To ensure that `dbt --version` reports the latest dbt Core version the adapter supports, be sure to include a `__version__.py` file. The filepath will be `dbt/adapters//__version__.py`. We recommend using the latest dbt Core version, and as the adapter is made compatible with later versions, this file will need to be updated. For a sample file, check out this [example](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/adapters/snowflake/__version__.py).
-
-Note that both of these files are included in the bootstrapped output of the `dbt-database-adapter-scaffold`, so if you used the scaffolding, they will already be present.
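For reference, a `__version__.py` is typically nothing more than a single version string; the sketch below is illustrative, and the exact value you declare is an assumption that depends on the dbt Core minor version you target:

```python
# dbt/adapters/myadapter/__version__.py -- illustrative sketch; the version value is hypothetical
version = "1.5.0"
```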
- -## Testing your new adapter - -This has moved to its own page: ["Testing a new adapter"](4-testing-a-new-adapter) - -## Documenting your new adapter - -This has moved to its own page: ["Documenting a new adapter"](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) - -## Maintaining your new adapter - -This has moved to a new spot: ["Maintaining your new adapter"](2-prerequisites-for-a-new-adapter##maintaining-your-new-adapter) diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md deleted file mode 100644 index b1b5072670a..00000000000 --- a/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md +++ /dev/null @@ -1,499 +0,0 @@ ---- -title: "Testing a new adapter" -id: "4-testing-a-new-adapter" ---- - -:::info - -Previously, we offered a packaged suite of tests for dbt adapter functionality: [`pytest-dbt-adapter`](https://github.com/dbt-labs/dbt-adapter-tests). We are deprecating that suite, in favor of the newer testing framework outlined in this document. - -::: - -This document has two sections: - -1. "[About the testing framework](#about-the-testing-framework)" describes the standard framework that we maintain for using pytest together with dbt. It includes an example that shows the anatomy of a simple test case. -2. "[Testing your adapter](#testing-your-adapter)" offers a step-by-step guide for using our out-of-the-box suite of "basic" tests, which will validate that your adapter meets a baseline of dbt functionality. - -## Prerequisites - -- Your adapter must be compatible with dbt-core **v1.1** or newer -- You should be familiar with **pytest**: https://docs.pytest.org/ - -## About the testing framework - -dbt-core offers a standard framework for running pre-built functional tests, and for defining your own tests. The core testing framework is built using `pytest`, a mature and standard library for testing Python projects. - -The **[`tests` module](https://github.com/dbt-labs/dbt-core/tree/HEAD/core/dbt/tests)** within `dbt-core` includes basic utilities for setting up pytest + dbt. These are used by all "pre-built" functional tests, and make it possible to quickly write your own tests. - -Those utilities allow you to do three basic things: -1. **Quickly set up a dbt "project."** Define project resources via methods such as `models()` and `seeds()`. Use `project_config_update()` to pass configurations into `dbt_project.yml`. -2. **Define a sequence of dbt commands.** The most important utility is `run_dbt()`, which returns the [results](/reference/dbt-classes#result-objects) of each dbt command. It takes a list of CLI specifiers (subcommand + flags), as well as an optional second argument, `expect_pass=False`, for cases where you expect the command to fail. -3. **Validate the results of those dbt commands.** For example, `check_relations_equal()` asserts that two database objects have the same structure and content. You can also write your own `assert` statements, by inspecting the results of a dbt command, or querying arbitrary database objects with `project.run_sql()`. - -You can see the full suite of utilities, with arguments and annotations, in [`util.py`](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/tests/util.py). You'll also see them crop up across a number of test cases. While all utilities are intended to be reusable, you won't need all of them for every test. 
In the example below, we'll show a simple test case that uses only a few utilities. - -### Example: a simple test case - -This example will show you the anatomy of a test case using dbt + pytest. We will create reusable components, combine them to form a dbt "project", and define a sequence of dbt commands. Then, we'll use Python `assert` statements to ensure those commands succeed (or fail) as we expect. - -In ["Getting started running basic tests,"](#getting-started-running-basic-tests) we'll offer step-by-step instructions for installing and configuring `pytest`, so that you can run it on your own machine. For now, it's more important to see how the pieces of a test case fit together. - -This example includes a seed, a model, and two tests—one of which will fail. - -1. Define Python strings that will represent the file contents in your dbt project. Defining these in a separate file enables you to reuse the same components across different test cases. The pytest name for this type of reusable component is "fixture." - - - -```python -# seeds/my_seed.csv -my_seed_csv = """ -id,name,some_date -1,Easton,1981-05-20T06:46:51 -2,Lillian,1978-09-03T18:10:33 -3,Jeremiah,1982-03-11T03:59:51 -4,Nolan,1976-05-06T20:21:35 -""".lstrip() - -# models/my_model.sql -my_model_sql = """ -select * from {{ ref('my_seed') }} -union all -select null as id, null as name, null as some_date -""" - -# models/my_model.yml -my_model_yml = """ -version: 2 -models: - - name: my_model - columns: - - name: id - tests: - - unique - - not_null # this test will fail -""" -``` - - - -2. Use the "fixtures" to define the project for your test case. These fixtures are always scoped to the **class**, where the class represents one test case—that is, one dbt project or scenario. (The same test case can be used for one or more actual tests, which we'll see in step 3.) Following the default pytest configurations, the file name must begin with `test_`, and the class name must begin with `Test`. - - - -```python -import pytest -from dbt.tests.util import run_dbt - -# our file contents -from tests.functional.example.fixtures import ( - my_seed_csv, - my_model_sql, - my_model_yml, -) - -# class must begin with 'Test' -class TestExample: - """ - Methods in this class will be of two types: - 1. Fixtures defining the dbt "project" for this test case. - These are scoped to the class, and reused for all tests in the class. - 2. Actual tests, whose names begin with 'test_'. - These define sequences of dbt commands and 'assert' statements. - """ - - # configuration in dbt_project.yml - @pytest.fixture(scope="class") - def project_config_update(self): - return { - "name": "example", - "models": {"+materialized": "view"} - } - - # everything that goes in the "seeds" directory - @pytest.fixture(scope="class") - def seeds(self): - return { - "my_seed.csv": my_seed_csv, - } - - # everything that goes in the "models" directory - @pytest.fixture(scope="class") - def models(self): - return { - "my_model.sql": my_model_sql, - "my_model.yml": my_model_yml, - } - - # continues below -``` - - - -3. Now that we've set up our project, it's time to define a sequence of dbt commands and assertions. We define one or more methods in the same file, on the same class (`TestExampleFailingTest`), whose names begin with `test_`. These methods share the same setup (project scenario) from above, but they can be run independently by pytest—so they shouldn't depend on each other in any way. 
- - - -```python - # continued from above - - # The actual sequence of dbt commands and assertions - # pytest will take care of all "setup" + "teardown" - def test_run_seed_test(self, project): - """ - Seed, then run, then test. We expect one of the tests to fail - An alternative pattern is to use pytest "xfail" (see below) - """ - # seed seeds - results = run_dbt(["seed"]) - assert len(results) == 1 - # run models - results = run_dbt(["run"]) - assert len(results) == 1 - # test tests - results = run_dbt(["test"], expect_pass = False) # expect failing test - assert len(results) == 2 - # validate that the results include one pass and one failure - result_statuses = sorted(r.status for r in results) - assert result_statuses == ["fail", "pass"] - - @pytest.mark.xfail - def test_build(self, project): - """Expect a failing test""" - # do it all - results = run_dbt(["build"]) -``` - - - -3. Our test is ready to run! The last step is to invoke `pytest` from your command line. We'll walk through the actual setup and configuration of `pytest` in the next section. - - - -```sh -$ python3 -m pytest tests/functional/test_example.py -=========================== test session starts ============================ -platform ... -- Python ..., pytest-..., pluggy-... -rootdir: ... -plugins: ... - -tests/functional/test_example.py .X [100%] - -======================= 1 passed, 1 xpassed in 1.38s ======================= -``` - - - -You can find more ways to run tests, along with a full command reference, in the [pytest usage docs](https://docs.pytest.org/how-to/usage.html). - -We've found the `-s` flag (or `--capture=no`) helpful to print logs from the underlying dbt invocations, and to step into an interactive debugger if you've added one. You can also use environment variables to set [global dbt configs](/reference/global-configs/about-global-configs), such as `DBT_DEBUG` (to show debug-level logs). - -## Testing your adapter - -Anyone who installs `dbt-core`, and wishes to define their own test cases, can use the framework presented in the first section. The framework is especially useful for testing standard dbt behavior across different databases. - -To that end, we have built and made available a [package of reusable adapter test cases](https://github.com/dbt-labs/dbt-core/tree/HEAD/tests/adapter), for creators and maintainers of adapter plugins. These test cases cover basic expected functionality, as well as functionality that frequently requires different implementations across databases. - -For the time being, this package is also located within the `dbt-core` repository, but separate from the `dbt-core` Python package. - -### Categories of tests - -In the course of creating and maintaining your adapter, it's likely that you will end up implementing tests that fall into three broad categories: - -1. **Basic tests** that every adapter plugin is expected to pass. These are defined in `tests.adapter.basic`. Given differences across data platforms, these may require slight modification or reimplementation. Significantly overriding or disabling these tests should be with good reason, since each represents basic functionality expected by dbt users. For example, if your adapter does not support incremental models, you should disable the test, [by marking it with `skip` or `xfail`](https://docs.pytest.org/en/latest/how-to/skipping.html), as well as noting that limitation in any documentation, READMEs, and usage guides that accompany your adapter. - -2. 
**Optional tests**, for second-order functionality that is common across plugins, but not required for basic use. Your plugin can opt into these test cases by inheriting existing ones, or reimplementing them with adjustments. For now, this category includes all tests located outside the `basic` subdirectory. More tests will be added as we convert older tests defined on dbt-core and mature plugins to use the standard framework. - -3. **Custom tests**, for behavior that is specific to your adapter / data platform. Each has its own specialties and idiosyncracies. We encourage you to use the same `pytest`-based framework, utilities, and fixtures to write your own custom tests for functionality that is unique to your adapter. - -If you run into an issue with the core framework, or the basic/optional test cases—or if you've written a custom test that you believe would be relevant and useful for other adapter plugin developers—please open an issue or PR in the `dbt-core` repository on GitHub. - -## Getting started running basic tests - -In this section, we'll walk through the three steps to start running our basic test cases on your adapter plugin: - -1. Install dependencies -2. Set up and configure pytest -3. Define test cases - -### Install dependencies - -You should already have a virtual environment with `dbt-core` and your adapter plugin installed. You'll also need to install: -- [`pytest`](https://pypi.org/project/pytest/) -- [`dbt-tests-adapter`](https://pypi.org/project/dbt-tests-adapter/), the set of common test cases -- (optional) [`pytest` plugins](https://docs.pytest.org/en/7.0.x/reference/plugin_list.html)--we'll use `pytest-dotenv` below - -Or specify all dependencies in a requirements file like: - - -```txt -pytest -pytest-dotenv -dbt-tests-adapter -``` - - -```sh -pip install -r dev_requirements.txt -``` - -### Set up and configure pytest - -First, set yourself up to run `pytest` by creating a file named `pytest.ini` at the root of your repository: - - - -```python -[pytest] -filterwarnings = - ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning - ignore:unclosed file .*:ResourceWarning -env_files = - test.env # uses pytest-dotenv plugin - # this allows you to store env vars for database connection in a file named test.env - # rather than passing them in every CLI command, or setting in `PYTEST_ADDOPTS` - # be sure to add "test.env" to .gitignore as well! -testpaths = - tests/functional # name per convention -``` - - - -Then, create a configuration file within your tests directory. In it, you'll want to define all necessary profile configuration for connecting to your data platform in local development and continuous integration. We recommend setting these values with environment variables, since this file will be checked into version control. - - - -```python -import pytest -import os - -# Import the standard functional fixtures as a plugin -# Note: fixtures with session scope need to be local -pytest_plugins = ["dbt.tests.fixtures.project"] - -# The profile dictionary, used to write out profiles.yml -# dbt will supply a unique schema per test, so we do not specify 'schema' here -@pytest.fixture(scope="class") -def dbt_profile_target(): - return { - 'type': '', - 'threads': 1, - 'host': os.getenv('HOST_ENV_VAR_NAME'), - 'user': os.getenv('USER_ENV_VAR_NAME'), - ... - } -``` - - - -### Define test cases - -As in the example above, each test case is defined as a class, and has its own "project" setup. 
To get started, you can import all basic test cases and try running them without changes. - - - -```python -import pytest - -from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations -from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests -from dbt.tests.adapter.basic.test_singular_tests_ephemeral import BaseSingularTestsEphemeral -from dbt.tests.adapter.basic.test_empty import BaseEmpty -from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral -from dbt.tests.adapter.basic.test_incremental import BaseIncremental -from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests -from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols -from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp -from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod - -class TestSimpleMaterializationsMyAdapter(BaseSimpleMaterializations): - pass - - -class TestSingularTestsMyAdapter(BaseSingularTests): - pass - - -class TestSingularTestsEphemeralMyAdapter(BaseSingularTestsEphemeral): - pass - - -class TestEmptyMyAdapter(BaseEmpty): - pass - - -class TestEphemeralMyAdapter(BaseEphemeral): - pass - - -class TestIncrementalMyAdapter(BaseIncremental): - pass - - -class TestGenericTestsMyAdapter(BaseGenericTests): - pass - - -class TestSnapshotCheckColsMyAdapter(BaseSnapshotCheckCols): - pass - - -class TestSnapshotTimestampMyAdapter(BaseSnapshotTimestamp): - pass - - -class TestBaseAdapterMethod(BaseAdapterMethod): - pass -``` - - - - -Finally, run pytest: -```sh -python3 -m pytest tests/functional -``` - -### Modifying test cases - -You may need to make slight modifications in a specific test case to get it passing on your adapter. The mechanism to do this is simple: rather than simply inheriting the "base" test with `pass`, you can redefine any of its fixtures or test methods. - -For instance, on Redshift, we need to explicitly cast a column in the fixture input seed to use data type `varchar(64)`: - - - -```python -import pytest -from dbt.tests.adapter.basic.files import seeds_base_csv, seeds_added_csv, seeds_newcolumns_csv -from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols - -# set the datatype of the name column in the 'added' seed so it -# can hold the '_update' that's added -schema_seed_added_yml = """ -version: 2 -seeds: - - name: added - config: - column_types: - name: varchar(64) -""" - -class TestSnapshotCheckColsRedshift(BaseSnapshotCheckCols): - # Redshift defines the 'name' column such that it's not big enough - # to hold the '_update' added in the test. - @pytest.fixture(scope="class") - def models(self): - return { - "base.csv": seeds_base_csv, - "added.csv": seeds_added_csv, - "seeds.yml": schema_seed_added_yml, - } -``` - - - -As another example, the `dbt-bigquery` adapter asks users to "authorize" replacing a with a by supplying the `--full-refresh` flag. The reason: In the table logic, a view by the same name must first be dropped; if the table query fails, the model will be missing. - -Knowing this possibility, the "base" test case offers a `require_full_refresh` switch on the `test_config` fixture class. 
For BigQuery, we'll switch it on:
-
-```python
-import pytest
-from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations
-
-class TestSimpleMaterializationsBigQuery(BaseSimpleMaterializations):
-    @pytest.fixture(scope="class")
-    def test_config(self):
-        # effect: add '--full-refresh' flag in requisite 'dbt run' step
-        return {"require_full_refresh": True}
-```
-
-It's always worth asking whether the required modifications represent gaps in perceived or expected dbt functionality. Are these simple implementation details, which any user of this database would understand? Are they limitations worth documenting?
-
-If, on the other hand, they represent poor assumptions in the "basic" test cases, which fail to account for a common pattern in other types of databases, please open an issue or PR in the `dbt-core` repository on GitHub.
-
-### Running with multiple profiles
-
-Some databases support multiple connection methods, which map to genuinely different functionality behind the scenes. For instance, the `dbt-spark` adapter supports connections to Apache Spark clusters _and_ Databricks runtimes, which support additional functionality out of the box, enabled by the Delta file format.
-
-```python
-import os
-
-import pytest
-
-
-def pytest_addoption(parser):
-    parser.addoption("--profile", action="store", default="apache_spark", type=str)
-
-
-# Using @pytest.mark.skip_profile('apache_spark') uses the 'skip_by_profile_type'
-# autouse fixture below
-def pytest_configure(config):
-    config.addinivalue_line(
-        "markers",
-        "skip_profile(profile): skip test for the given profile",
-    )
-
-@pytest.fixture(scope="session")
-def dbt_profile_target(request):
-    profile_type = request.config.getoption("--profile")
-    if profile_type == "databricks_sql_endpoint":
-        target = databricks_sql_endpoint_target()
-    elif profile_type == "apache_spark":
-        target = apache_spark_target()
-    else:
-        raise ValueError(f"Invalid profile type '{profile_type}'")
-    return target
-
-def apache_spark_target():
-    return {
-        "type": "spark",
-        "host": "localhost",
-        ...
-    }
-
-def databricks_sql_endpoint_target():
-    return {
-        "type": "spark",
-        "host": os.getenv("DBT_DATABRICKS_HOST_NAME"),
-        ...
-    }
-
-@pytest.fixture(autouse=True)
-def skip_by_profile_type(request):
-    profile_type = request.config.getoption("--profile")
-    if request.node.get_closest_marker("skip_profile"):
-        for skip_profile_type in request.node.get_closest_marker("skip_profile").args:
-            if skip_profile_type == profile_type:
-                pytest.skip(f"skipped on '{profile_type}' profile")
-```
-
-If there are tests that _shouldn't_ run for a given profile:
-
-```python
-# Snapshots require access to the Delta file format, available on our Databricks connection,
-# so let's skip on Apache Spark
-@pytest.mark.skip_profile('apache_spark')
-class TestSnapshotCheckColsSpark(BaseSnapshotCheckCols):
-    @pytest.fixture(scope="class")
-    def project_config_update(self):
-        return {
-            "seeds": {
-                "+file_format": "delta",
-            },
-            "snapshots": {
-                "+file_format": "delta",
-            }
-        }
-```
-
-Finally:
-```sh
-python3 -m pytest tests/functional --profile apache_spark
-python3 -m pytest tests/functional --profile databricks_sql_endpoint
-```
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md
deleted file mode 100644
index f8335dfcbc4..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md
+++ /dev/null
@@ -1,65 +0,0 @@
----
-title: "Documenting a new adapter"
-id: "5-documenting-a-new-adapter"
----
-
-If you've already [built](3-building-a-new-adapter) and [tested](4-testing-a-new-adapter) your adapter, it's time to document it so the dbt community will know that it exists and how to use it.
-
-## Making your adapter available
-
-Many community members maintain their adapter plugins under open source licenses. If you're interested in doing this, we recommend:
-- Hosting on a public git provider (for example, GitHub or GitLab)
-- Publishing to [PyPI](https://pypi.org/)
-- Adding to the list of ["Supported Data Platforms"](/docs/supported-data-platforms#community-supported) (more info below)
-
-## General Guidelines
-
-To best inform the dbt community of the new adapter, you should contribute to dbt's open-source documentation site, which uses the [Docusaurus project](https://docusaurus.io/). This is the site you're currently on!
-
-### Conventions
-
-Each `.md` file you create needs a header as shown below. The document id will also need to be added to the config file: `website/sidebars.js`.
-
-```md
----
-title: "Documenting a new adapter"
-id: "documenting-a-new-adapter"
----
-```
-
-### Single Source of Truth
-
-We ask our adapter maintainers to use the [docs.getdbt.com repo](https://github.com/dbt-labs/docs.getdbt.com) (i.e. this site) as the single source of truth for documentation rather than having to maintain the same set of information in three different places. The adapter repo's `README.md` and the data platform's documentation pages should simply link to the corresponding page on this docs site. Keep reading for more information on what should and shouldn't be included on the dbt docs site.
-
-### Assumed Knowledge
-
-To simplify things, assume the reader of this documentation already knows how both dbt and your data platform work. There's already great material for how to learn dbt and the data platform out there. The documentation we're asking you to add should be what a user who is already proficient in both dbt and your data platform would need to know in order to use both. Effectively that boils down to two things: how to connect, and how to configure.
-
-## Topics and Pages to Cover
-
-The following subjects need to be addressed across three pages of this docs site to have your data platform be listed on our documentation. After the corresponding pull request is merged, we ask that you link to these pages from your adapter repo's `README` as well as from your product documentation.
-
- To contribute, all you have to do is make the changes listed in the table below.
-
-| How To... | File to change within `/website/docs/` | Action | Info to Include |
-|-----------|----------------------------------------|--------|-----------------|
-| Connect | `/docs/core/connect-data-platform/{MY-DATA-PLATFORM}-setup.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. |
-| Configure | `reference/resource-configs/{MY-DATA-PLATFORM}-configs.md` | Create | What options and configuration specific to your data platform do users need to know? e.g. table distribution and indexing options, column_quoting policy, which incremental strategies are supported |
-| Discover and Install | `docs/supported-data-platforms.md` | Modify | Is it a vendor- or community-supported adapter? How do users install the Python adapter package? Ideally with pip and a PyPI-hosted package, but a `git+` link to the GitHub repo also works |
-| Add link to sidebar | `website/sidebars.js` | Modify | Add the document id to the correct location in the sidebar menu |
-
-For example, say I want to document my new adapter, `dbt-ders`. For the "Connect" page, I would make a new Markdown file, `ders-setup.md`, and add it to the `/website/docs/core/connect-data-platform/` directory.
-
-## Example PRs to add new adapter documentation
-
-Below are some recent pull requests made by partners to document their data platform's adapter:
-
-- [TiDB](https://github.com/dbt-labs/docs.getdbt.com/pull/1309)
-- [SingleStore](https://github.com/dbt-labs/docs.getdbt.com/pull/1044)
-- [Firebolt](https://github.com/dbt-labs/docs.getdbt.com/pull/941)
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md
deleted file mode 100644
index 9bf2f949bef..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md
+++ /dev/null
@@ -1,120 +0,0 @@
----
-title: "Promoting a new adapter"
-id: "6-promoting-a-new-adapter"
----
-
-## Model for engagement in the dbt community
-
-The most important thing here is recognizing that people are successful in the community when they join, first and foremost, to engage authentically.
-
-What does authentic engagement look like? It's challenging to define explicit rules. One good rule of thumb is to treat people with dignity and respect.
-
-Contributors to the community should think of contribution *as the end itself,* not a means toward other business KPIs (leads, community members, etc.).
[We are a mission-driven company.](https://www.getdbt.com/dbt-labs/values/) Some ways to know if you’re authentically engaging: - -- Is an engagement’s *primary* purpose of sharing knowledge and resources or building brand engagement? -- Imagine you didn’t work at the org you do — can you imagine yourself still writing this? -- Is it written in formal / marketing language, or does it sound like you, the human? - -## Who should join the dbt community slack - -### People who have insight into what it means to do hands-on [analytics engineering](https://www.getdbt.com/analytics-engineering/) work - -The dbt Community Slack workspace is fundamentally a place for analytics practitioners to interact with each other — the closer the users are in the community to actual data/analytics engineering work, the more natural their engagement will be (leading to better outcomes for partners and the community). - -### DevRel practitioners with strong focus - -DevRel practitioners often have a strong analytics background and a good understanding of the community. It’s essential to be sure they are focused on *contributing,* not on driving community metrics for partner org (such as signing people up for their slack or events). The metrics will rise naturally through authentic engagement. - -### Founder and executives who are interested in directly engaging with the community - -This is either incredibly successful or not at all depending on the profile of the founder. Typically, this works best when the founder has a practitioner-level of technical understanding and is interested in joining not to promote, but to learn and hear from users. - -### Software Engineers at partner products that are building and supporting integrations with either dbt Core or dbt Cloud - -This is successful when the engineers are familiar with dbt as a product or at least have taken our training course. The Slack is often a place where end-user questions and feedback is initially shared, so it is recommended that someone technical from the team be present. There are also a handful of channels aimed at those building integrations, which tend to be a font of knowledge. - -### Who might struggle in the dbt community -#### People in marketing roles -dbt Slack is not a marketing channel. Attempts to use it as such invariably fall flat and can even lead to people having a negative view of a product. This doesn’t mean that dbt can’t serve marketing objectives, but a long-term commitment to engagement is the only proven method to do this sustainably. - -#### People in product roles -The dbt Community can be an invaluable source of feedback on a product. There are two primary ways this can happen — organically (community members proactively suggesting a new feature) and via direct calls for feedback and user research. Immediate calls for engagement must be done in your dedicated #tools channel. Direct calls should be used sparingly, as they can overwhelm more organic discussions and feedback. - -## Who is the audience for an adapter release - -A new adapter is likely to drive huge community interest from several groups of people: -- People who are currently using the database that the adapter is supporting -- People who may be adopting the database in the near future. -- People who are interested in dbt development in general. - -The database users will be your primary audience and the most helpful in achieving success. Engage them directly in the adapter’s dedicated Slack channel. 
If one does not exist already, reach out in #channel-requests, and we will get one made for you and include it in an announcement about new channels. - -The final group is where non-slack community engagement becomes important. Twitter and LinkedIn are both great places to interact with a broad audience. A well-orchestrated adapter release can generate impactful and authentic engagement. - -## How to message the initial rollout and follow-up content - -Tell a story that engages dbt users and the community. Highlight new use cases and functionality unlocked by the adapter in a way that will resonate with each segment. - -### Existing users of your technology who are new to dbt - -- Provide a general overview of the value dbt will deliver to your users. This can lean on dbt's messaging and talking points which are laid out in the [dbt viewpoint.](/community/resources/viewpoint) - - Give examples of a rollout that speaks to the overall value of dbt and your product. - -### Users who are already familiar with dbt and the community -- Consider unique use cases or advantages your adapter provide over existing adapters. Who will be excited for this? -- Contribute to the dbt Community and ensure that dbt users on your adapter are well supported (tutorial content, packages, documentation, etc). -- Example of a rollout that is compelling for those familiar with dbt: [Firebolt](https://www.linkedin.com/feed/update/urn:li:activity:6879090752459182080/) - -## Tactically manage distribution of content about new or existing adapters - -There are tactical pieces on how and where to share that help ensure success. - -### On slack: -- #i-made-this channel — this channel has a policy against “marketing” and “content marketing” posts, but it should be successful if you write your content with the above guidelines in mind. Even with that, it’s important to post here sparingly. -- Your own database / tool channel — this is where the people who have opted in to receive communications from you and always a great place to share things that are relevant to them. - -### On social media: -- Twitter -- LinkedIn -- Social media posts *from the author* or an individual connected to the project tend to have better engagement than posts from a company or organization account. -- Ask your partner representative about: - - Retweets and shares from the official dbt Labs accounts. - - Flagging posts internally at dbt Labs to get individual employees to share. - -## Measuring engagement - -You don’t need 1000 people in a channel to succeed, but you need at least a few active participants who can make it feel lived in. If you’re comfortable working in public, this could be members of your team, or it can be a few people who you know that are highly engaged and would be interested in participating. Having even 2 or 3 regulars hanging out in a channel is all that’s needed for a successful start and is, in fact, much more impactful than 250 people that never post. - -## How to announce a new adapter - -We’d recommend *against* boilerplate announcements and encourage finding a unique voice. That being said, there are a couple of things that we’d want to include: - -- A summary of the value prop of your database / technology for users who aren’t familiar. -- The personas that might be interested in this news. -- A description of what the adapter *is*. 
For example: - > With the release of our new dbt adapter, you’ll be able to use dbt to model and transform your data in [name-of-your-org] -- Particular or unique use cases or functionality unlocked by the adapter. -- Plans for future / ongoing support / development. -- The link to the documentation for using the adapter on the dbt Labs docs site. -- An announcement blog. - -## Announcing new release versions of existing adapters - -This can vary substantially depending on the nature of the release, but a good baseline is the types of release messages that [we put out in the #dbt-releases](https://getdbt.slack.com/archives/C37J8BQEL/p1651242161526509) channel. - -![Full Release Post](/img/adapter-guide/0-full-release-notes.png) - -Breaking this down: - -- Visually distinctive announcement - make it clear this is a release - -- Short written description of what is in the release - -- Links to additional resources - -- Implementation instructions: - -- Future plans - -- Contributor recognition (if applicable) - diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md deleted file mode 100644 index 6310569dfad..00000000000 --- a/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Verifying a new adapter" -id: "7-verifying-a-new-adapter" ---- - -## Why verify an adapter? - -The very first data platform dbt supported was Redshift, followed quickly by Postgres ([dbt-core#174](https://github.com/dbt-labs/dbt-core/pull/174)). In 2017, back when dbt Labs (née Fishtown Analytics) was still a data consultancy, we added support for Snowflake and BigQuery. We also turned dbt's database support into an adapter framework ([dbt-core#259](https://github.com/dbt-labs/dbt-core/pull/259/)), and a plugin system a few years later. For years, dbt Labs specialized in those four data platforms and became experts in them. However, the surface area of all possible databases, their respective nuances, and keeping them up-to-date and bug-free is a Herculean and/or Sisyphean task that couldn't be done by a single person or even a single team! Enter the dbt community, which enables dbt Core to work on more than 30 different databases (32 as of Sep '22)! - -Free and open-source tools for the data professional are increasingly abundant. This is by-and-large a *good thing*; however, it requires due diligence that wasn't required in a paid-license, closed-source software world. Before taking a dependency on an open-source project, it is important to determine the answer to the following questions: - -1. Does it work? -2. Does it meet my team's specific use case? -3. Does anyone "own" the code, or is anyone liable for ensuring it works? -4. Do bugs get fixed quickly? -5. Does it stay up-to-date with new Core features? -6. Is the usage substantial enough to self-sustain? -7. What risks do I take on by taking a dependency on this library? - -These are valid, important questions to answer—especially given that `dbt-core` itself only put out its first stable release (major version v1.0) in December 2021! Indeed, up until now, the majority of new user questions in database-specific channels are some form of: -- "How mature is `dbt-`? Any gotchas I should be aware of before I start exploring?" -- "has anyone here used `dbt-` for production models?"
-- "I've been playing with `dbt-` -- I was able to install and run my initial experiments. I noticed that there are certain features mentioned on the documentation that are marked as 'not ok' or 'not tested'. What are the risks? -I'd love to make a statement on my team to adopt DBT [sic], but I'm pretty sure questions will be asked around the possible limitations of the adapter or if there are other companies out there using dbt [sic] with Oracle DB in production, etc." - -There has been a tendency to trust the dbt Labs-maintained adapters over community- and vendor-supported adapters, but repo ownership is only one among many indicators of software quality. We aim to help our users feel well-informed as to the caliber of an adapter with a new program. - -## Verified by dbt Labs - -The adapter verification program aims to quickly indicate to users which adapters can be trusted to use in production. Previously, doing so was uncharted territory for new users and complicated making the business case to their leadership team. We plan to give quality assurances by: -1. appointing a key stakeholder for the adapter repository, -2. ensuring that the chosen stakeholder fixes bugs and cuts new releases in a timely manner see maintainer your adapter (["Maintaining your new adapter"](2-prerequisites-for-a-new-adapter#maintaining-your-new-adapter)), -3. demonstrating that it passes our adapter pytest suite tests, -4. assuring that it works for us internally and ideally an existing team using the adapter in production . - - -Every major & minor version of a adapter will be verified internally and given an official :white_check_mark: (custom emoji coming soon), on the ["Supported Data Platforms"](/docs/supported-data-platforms) page. - -## How to get an adapter verified? - -We envision that data platform vendors will be most interested in having their adapter versions verified, however we are open to community adapter verification. If interested, please reach out either to the `partnerships` at `dbtlabs.com` or post in the [#adapter-ecosystem Slack channel](https://getdbt.slack.com/archives/C030A0UF5LM). diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development b/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development deleted file mode 100644 index 8b137891791..00000000000 --- a/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development +++ /dev/null @@ -1 +0,0 @@ - diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md deleted file mode 100644 index b03cb2ca013..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake" -id: "1-overview-dbt-python-snowpark" -description: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake" ---- - -The focus of this workshop will be to demonstrate how we can use both *SQL and python together* in the same workflow to run *both analytics and machine learning models* on dbt Cloud. - -All code in today’s workshop can be found on [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1). 
- -## What you'll use during the lab - -- A [Snowflake account](https://trial.snowflake.com/) with ACCOUNTADMIN access -- A [dbt Cloud account](https://www.getdbt.com/signup/) - -## What you'll learn - -- How to build scalable data transformation pipelines using dbt and Snowflake with SQL and Python -- How to copy data into Snowflake from a public S3 bucket - -## What you need to know - -- Basic to intermediate SQL and Python. -- Basic understanding of dbt fundamentals. We recommend the [dbt Fundamentals course](https://courses.getdbt.com/collections) if you're interested. -- The high-level machine learning process (encoding, training, testing) -- Simple ML algorithms — we will use logistic regression to keep the focus on the *workflow*, not algorithms! - -## What you'll build - -- A set of data analytics and prediction pipelines using Formula 1 data leveraging dbt and Snowflake, making use of best practices like data quality tests and code promotion between environments -- We will create insights for: - 1. Finding the lap time average and rolling average through the years (is it generally trending up or down)? - 2. Which constructor has the fastest pit stops in 2021? - 3. Predicting the position of each driver using a decade of data (2010 - 2020) - -As inputs, we are going to leverage Formula 1 datasets hosted on a dbt Labs public S3 bucket. We will create a Snowflake Stage for our CSV files, then use Snowflake’s `COPY INTO` function to copy the data from our CSV files into tables. The Formula 1 data is available on [Kaggle](https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020). The data is originally compiled from the [Ergast Developer API](http://ergast.com/mrd/). - -Overall we are going to set up the environments, build scalable pipelines in dbt, establish data tests, and promote code to production. diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md deleted file mode 100644 index 446981214e3..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md +++ /dev/null @@ -1,150 +0,0 @@ ---- -title: "Python transformations!" -id: "10-python-transformations" -description: "Python transformations" ---- - -Up until now, SQL has been driving the project (car pun intended) for data cleaning and hierarchical joining. Now it’s time for Python to take the wheel (car pun still intended) for the rest of our lab! For more information about running Python models on dbt, check out our [docs](/docs/build/python-models). To learn more about how dbt Python models work under the hood, check out [Snowpark for Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html), which makes running dbt Python models possible. - -There are quite a few differences between SQL and Python in terms of the dbt syntax and DDL, so we’ll be breaking our code and model runs down further for our Python models. - -## Pit stop analysis - -First, we want to find out: which constructor had the fastest pit stops in 2021? (constructor is a Formula 1 team that builds or “constructs” the car). - -1. Create a new file called `fastest_pit_stops_by_constructor.py` in our `aggregates` folder (this is the first time we are using the `.py` extension!). -2.
Copy the following code into the file: - ```python - import numpy as np - import pandas as pd - - def model(dbt, session): - # dbt configuration - dbt.config(packages=["pandas","numpy"]) - - # get upstream data - pit_stops_joined = dbt.ref("pit_stops_joined").to_pandas() - - # provide year so we do not hardcode dates - year=2021 - - # describe the data - pit_stops_joined["PIT_STOP_SECONDS"] = pit_stops_joined["PIT_STOP_MILLISECONDS"]/1000 - fastest_pit_stops = pit_stops_joined[(pit_stops_joined["RACE_YEAR"]==year)].groupby(by="CONSTRUCTOR_NAME")["PIT_STOP_SECONDS"].describe().sort_values(by='mean') - fastest_pit_stops.reset_index(inplace=True) - fastest_pit_stops.columns = fastest_pit_stops.columns.str.upper() - - return fastest_pit_stops.round(2) - ``` - -3. Let’s break down what this code is doing step by step: - - First, we are importing the Python libraries that we are using. A *library* is a reusable chunk of code that someone else wrote that you may want to include in your programs/projects. We are using `numpy` and `pandas`in this Python model. This is similar to a dbt *package*, but our Python libraries do *not* persist across the entire project. - - Defining a function called `model` with the parameter `dbt` and `session`. The parameter `dbt` is a class compiled by dbt, which enables you to run your Python code in the context of your dbt project and DAG. The parameter `session` is a class representing your Snowflake’s connection to the Python backend. The `model` function *must return a single DataFrame*. You can see that all the data transformation happening is within the body of the `model` function that the `return` statement is tied to. - - Then, within the context of our dbt model library, we are passing in a configuration of which packages we need using `dbt.config(packages=["pandas","numpy"])`. - - Use the `.ref()` function to retrieve the data frame `pit_stops_joined` that we created in our last step using SQL. We cast this to a pandas dataframe (by default it's a Snowpark Dataframe). - - Create a variable named `year` so we aren’t passing a hardcoded value. - - Generate a new column called `PIT_STOP_SECONDS` by dividing the value of `PIT_STOP_MILLISECONDS` by 1000. - - Create our final data frame `fastest_pit_stops` that holds the records where year is equal to our year variable (2021 in this case), then group the data frame by `CONSTRUCTOR_NAME` and use the `describe()` and `sort_values()` and in descending order. This will make our first row in the new aggregated data frame the team with the fastest pit stops over an entire competition year. - - Finally, it resets the index of the `fastest_pit_stops` data frame. The `reset_index()` method allows you to reset the index back to the default 0, 1, 2, etc indexes. By default, this method will keep the "old" indexes in a column named "index"; to avoid this, use the drop parameter. Think of this as keeping your data “flat and square” as opposed to “tiered”. If you are new to Python, now might be a good time to [learn about indexes for 5 minutes](https://towardsdatascience.com/the-basics-of-indexing-and-slicing-python-lists-2d12c90a94cf) since it's the foundation of how Python retrieves, slices, and dices data. The `inplace` argument means we override the existing data frame permanently. Not to fear! This is what we want to do to avoid dealing with multi-indexed dataframes! - - Convert our Python column names to all uppercase using `.upper()`, so Snowflake recognizes them. 
- - Finally we are returning our dataframe with 2 decimal places for all the columns using the `round()` method. -4. Zooming out a bit, what are we doing differently here in Python from our typical SQL code: - - Method chaining is a technique in which multiple methods are called on an object in a single statement, with each method call modifying the result of the previous one. The methods are called in a chain, with the output of one method being used as the input for the next one. The technique is used to simplify the code and make it more readable by eliminating the need for intermediate variables to store the intermediate results. - - The way you see method chaining in Python is the syntax `.().()`. For example, `.describe().sort_values(by='mean')` where the `.describe()` method is chained to `.sort_values()`. - - The `.describe()` method is used to generate various summary statistics of the dataset. It's used on pandas dataframe. It gives a quick and easy way to get the summary statistics of your dataset without writing multiple lines of code. - - The `.sort_values()` method is used to sort a pandas dataframe or a series by one or multiple columns. The method sorts the data by the specified column(s) in ascending or descending order. It is the pandas equivalent to `order by` in SQL. - - We won’t go as in depth for our subsequent scripts, but will continue to explain at a high level what new libraries, functions, and methods are doing. - -5. Build the model using the UI which will **execute**: - ```bash - dbt run --select fastest_pit_stops_by_constructor - ``` - in the command bar. - - Let’s look at some details of our first Python model to see what our model executed. There two major differences we can see while running a Python model compared to an SQL model: - - - Our Python model was executed as a stored procedure. Snowflake needs a way to know that it's meant to execute this code in a Python runtime, instead of interpreting in a SQL runtime. We do this by creating a Python stored proc, called by a SQL command. - - The `snowflake-snowpark-python` library has been picked up to execute our Python code. Even though this wasn’t explicitly stated this is picked up by the dbt class object because we need our Snowpark package to run Python! - - Python models take a bit longer to run than SQL models, however we could always speed this up by using [Snowpark-optimized Warehouses](https://docs.snowflake.com/en/user-guide/warehouses-snowpark-optimized.html) if we wanted to. Our data is sufficiently small, so we won’t worry about creating a separate warehouse for Python versus SQL files today. - - - The rest of our **Details** output gives us information about how dbt and Snowpark for Python are working together to define class objects and apply a specific set of methods to run our models. - - So which constructor had the fastest pit stops in 2021? Let’s look at our data to find out! - -6. We can't preview Python models directly, so let’s create a new file using the **+** button or the Control-n shortcut to create a new scratchpad. -7. Reference our Python model: - ```sql - select * from {{ ref('fastest_pit_stops_by_constructor') }} - ``` - and preview the output: - - - Not only did Red Bull have the fastest average pit stops by nearly 40 seconds, they also had the smallest standard deviation, meaning they are both fastest and most consistent teams in pit stops. 
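  If the method chaining pattern described above is new to you, here is a minimal, standalone pandas sketch (hypothetical toy data only, not the workshop's Formula 1 dataset) showing what chaining `groupby()`, `.describe()`, and `.sort_values()` followed by `reset_index()` produces:

  ```python
  import pandas as pd

  # Hypothetical toy data, not the workshop's Formula 1 dataset
  toy = pd.DataFrame({
      "CONSTRUCTOR_NAME": ["A", "A", "B", "B"],
      "PIT_STOP_SECONDS": [2.4, 2.6, 3.1, 2.9],
  })

  # Method chaining: each call operates on the result of the previous one
  summary = (
      toy.groupby(by="CONSTRUCTOR_NAME")["PIT_STOP_SECONDS"]
      .describe()                  # count, mean, std, min, quartiles, max per group
      .sort_values(by="mean")      # ascending by default, so the fastest group comes first
  )
  summary.reset_index(inplace=True)  # flatten the group index back into a regular column
  print(summary.round(2))
  ```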
By using the `.describe()` method we were able to avoid verbose SQL requiring us to create a line of code per column and repetitively use the `PERCENTILE_COUNT()` function. - - Now we want to find the lap time average and rolling average through the years (is it generally trending up or down)? - -8. Create a new file called `lap_times_moving_avg.py` in our `aggregates` folder. -9. Copy the following code into the file: - ```python - import pandas as pd - - def model(dbt, session): - # dbt configuration - dbt.config(packages=["pandas"]) - - # get upstream data - lap_times = dbt.ref("int_lap_times_years").to_pandas() - - # describe the data - lap_times["LAP_TIME_SECONDS"] = lap_times["LAP_TIME_MILLISECONDS"]/1000 - lap_time_trends = lap_times.groupby(by="RACE_YEAR")["LAP_TIME_SECONDS"].mean().to_frame() - lap_time_trends.reset_index(inplace=True) - lap_time_trends["LAP_MOVING_AVG_5_YEARS"] = lap_time_trends["LAP_TIME_SECONDS"].rolling(5).mean() - lap_time_trends.columns = lap_time_trends.columns.str.upper() - - return lap_time_trends.round(1) - ``` - -10. Breaking down our code a bit: - - We’re only using the `pandas` library for this model and casting it to a pandas data frame `.to_pandas()`. - - Generate a new column called `LAP_TIMES_SECONDS` by dividing the value of `LAP_TIME_MILLISECONDS` by 1000. - - Create the final dataframe. Get the lap time per year. Calculate the mean series and convert to a data frame. - - Reset the index. - - Calculate the rolling 5 year mean. - - Round our numeric columns to one decimal place. -11. Now, run this model by using the UI **Run model** or - ```bash - dbt run --select lap_times_moving_avg - ``` - in the command bar. - -12. Once again previewing the output of our data using the same steps for our `fastest_pit_stops_by_constructor` model. - - - We can see that it looks like lap times are getting consistently faster over time. Then in 2010 we see an increase occur! Using outside subject matter context, we know that significant rule changes were introduced to Formula 1 in 2010 and 2011 causing slower lap times. - -13. Now is a good time to checkpoint and commit our work to Git. Click **Commit and push** and give your commit a message like `aggregate python models` before moving on. - -## The dbt model, .source(), .ref() and .config() functions - -Let’s take a step back before starting machine learning to both review and go more in-depth at the methods that make running dbt python models possible. If you want to know more outside of this lab’s explanation read the documentation [here](/docs/build/python-models?version=1.3). - -- dbt model(dbt, session). For starters, each Python model lives in a .py file in your models/ folder. It defines a function named `model()`, which takes two parameters: - - dbt — A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG. - - session — A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames and to write DataFrames back to tables. In PySpark, by convention, the SparkSession is named spark, and available globally. For consistency across platforms, we always pass it into the model function as an explicit argument called session. -- The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame. -- `.source()` and `.ref()` functions. 
Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. If you want to read directly from a raw source table, use `dbt.source()`. We saw this in our earlier section using SQL with the source function. These functions have the same execution, but with different syntax. Use the `dbt.ref()` method within a Python model to read data from other models (SQL or Python). These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. -- `.config()`. Just like SQL models, there are three ways to configure Python models: - - In a dedicated `.yml` file, within the `models/` directory - - Within the model's `.py` file, using the `dbt.config()` method - - Calling the `dbt.config()` method will set configurations for your model within your `.py` file, similar to the `{{ config() }} macro` in `.sql` model files: - ```python - def model(dbt, session): - - # setting configuration - dbt.config(materialized="table") - ``` - - There's a limit to how complex you can get with the `dbt.config()` method. It accepts only literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `.config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the config property in a [YAML file](/reference/resource-properties/config). Learn more about configurations [here](/reference/model-configs). diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md deleted file mode 100644 index bde163b59db..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md +++ /dev/null @@ -1,225 +0,0 @@ ---- -title: "Machine Learning prep: cleaning, encoding, and splits, oh my!" -id: "11-machine-learning-prep" -description: "Machine Learning prep" ---- -Now that we’ve gained insights and business intelligence about Formula 1 at a descriptive level, we want to extend our capabilities into prediction. We’re going to take the scenario where we censor the data. This means that we will pretend that we will train a model using earlier data and apply it to future data. In practice, this means we’ll take data from 2010-2019 to train our model and then predict 2020 data. - -In this section, we’ll be preparing our data to predict the final race position of a driver. - -At a high level we’ll be: - -- Creating new prediction features and filtering our dataset to active drivers -- Encoding our data (algorithms like numbers) and simplifying our target variable called `position` -- Splitting our dataset into training, testing, and validation - -## ML data prep - -1. To keep our project organized, we’ll need to create two new subfolders in our `ml` directory. Under the `ml` folder, make the subfolders `prep` and `train_predict`. -2. Create a new file under `ml/prep` called `ml_data_prep`. Copy the following code into the file and **Save**. 
- ```python - import pandas as pd - - def model(dbt, session): - # dbt configuration - dbt.config(packages=["pandas"]) - - # get upstream data - fct_results = dbt.ref("fct_results").to_pandas() - - # provide years so we do not hardcode dates in filter command - start_year=2010 - end_year=2020 - - # describe the data for a full decade - data = fct_results.loc[fct_results['RACE_YEAR'].between(start_year, end_year)] - - # convert string to an integer - data['POSITION'] = data['POSITION'].astype(float) - - # we cannot have nulls if we want to use total pit stops - data['TOTAL_PIT_STOPS_PER_RACE'] = data['TOTAL_PIT_STOPS_PER_RACE'].fillna(0) - - # some of the constructors changed their name over the year so replacing old names with current name - mapping = {'Force India': 'Racing Point', 'Sauber': 'Alfa Romeo', 'Lotus F1': 'Renault', 'Toro Rosso': 'AlphaTauri'} - data['CONSTRUCTOR_NAME'].replace(mapping, inplace=True) - - # create confidence metrics for drivers and constructors - dnf_by_driver = data.groupby('DRIVER').sum()['DNF_FLAG'] - driver_race_entered = data.groupby('DRIVER').count()['DNF_FLAG'] - driver_dnf_ratio = (dnf_by_driver/driver_race_entered) - driver_confidence = 1-driver_dnf_ratio - driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence)) - - dnf_by_constructor = data.groupby('CONSTRUCTOR_NAME').sum()['DNF_FLAG'] - constructor_race_entered = data.groupby('CONSTRUCTOR_NAME').count()['DNF_FLAG'] - constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered) - constructor_relaiblity = 1-constructor_dnf_ratio - constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity)) - - data['DRIVER_CONFIDENCE'] = data['DRIVER'].apply(lambda x:driver_confidence_dict[x]) - data['CONSTRUCTOR_RELAIBLITY'] = data['CONSTRUCTOR_NAME'].apply(lambda x:constructor_relaiblity_dict[x]) - - #removing retired drivers and constructors - active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes', - 'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull', - 'Haas F1 Team'] - active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz', - 'Valtteri Bottas', 'Lance Stroll', 'George Russell', - 'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen', - 'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat', - 'Max Verstappen', 'Pierre Gasly', 'Alexander Albon', - 'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi', - 'Romain Grosjean','Nicholas Latifi'] - - # create flags for active drivers and constructors so we can filter downstream - data['ACTIVE_DRIVER'] = data['DRIVER'].apply(lambda x: int(x in active_drivers)) - data['ACTIVE_CONSTRUCTOR'] = data['CONSTRUCTOR_NAME'].apply(lambda x: int(x in active_constructors)) - - return data - ``` -3. As usual, let’s break down what we are doing in this Python model: - - We’re first referencing our upstream `fct_results` table and casting it to a pandas dataframe. - - Filtering on years 2010-2020 since we’ll need to clean all our data we are using for prediction (both training and testing). - - Filling in empty data for `total_pit_stops` and making a mapping active constructors and drivers to avoid erroneous predictions - - ⚠️ You might be wondering why we didn’t do this upstream in our `fct_results` table! The reason for this is that we want our machine learning cleanup to reflect the year 2020 for our predictions and give us an up-to-date team name. However, for business intelligence purposes we can keep the historical data at that point in time. 
Instead of thinking of one table as “one source of truth” we are creating different datasets fit for purpose: one for historical descriptions and reporting and another for relevant predictions. - - Create new confidence features for drivers and constructors - - Generate flags for the constructors and drivers that were active in 2020 -4. Execute the following in the command bar: - ```bash - dbt run --select ml_data_prep - ``` -5. There are more aspects we could consider for this project, such as normalizing the driver confidence by the number of races entered. Including this would help account for a driver’s history and consider whether they are a new or long-time driver. We’re going to keep it simple for now, but these are some of the ways we can expand and improve our machine learning dbt projects. Breaking down our machine learning prep model: - - Lambda functions — We use some lambda functions to transform our data without having to create a fully-fledged function using the `def` notation. So what exactly are lambda functions? - - In Python, a lambda function is a small, anonymous function defined using the keyword "lambda". Lambda functions are used to perform a quick operation, such as a mathematical calculation or a transformation on a list of elements. They are often used in conjunction with higher-order functions, such as `apply`, `map`, `filter`, and `reduce`. - - `.apply()` method — We used `.apply()` to pass our functions into our lambda expressions to the columns and perform this multiple times in our code. Let’s explain apply a little more: - - The `.apply()` function in the pandas library is used to apply a function to a specified axis of a DataFrame or a Series. In our case the function we used was our lambda function! - - The `.apply()` function takes two arguments: the first is the function to be applied, and the second is the axis along which the function should be applied. The axis can be specified as 0 for rows or 1 for columns. We are using the default value of 0 so we aren’t explicitly writing it in the code. This means that the function will be applied to each *row* of the DataFrame or Series. -6. Let’s look at the preview of our clean dataframe after running our `ml_data_prep` model: - - -## Covariate encoding - -In this next part, we’ll be performing covariate encoding. Breaking down this phrase a bit, a *covariate* is a variable that is relevant to the outcome of a study or experiment, and *encoding* refers to the process of converting data (such as text or categorical variables) into a numerical format that can be used as input for a model. This is necessary because most machine learning algorithms can only work with numerical data. Algorithms don’t speak languages, have eyes to see images, etc. so we encode our data into numbers so algorithms can perform tasks by using calculations they otherwise couldn’t. - -🧠 We’ll think about this as : “algorithms like numbers”. - -1. Create a new file under `ml/prep` called `covariate_encoding` copy the code below and save. 
- ```python - import pandas as pd - import numpy as np - from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder - from sklearn.linear_model import LogisticRegression - - def model(dbt, session): - # dbt configuration - dbt.config(packages=["pandas","numpy","scikit-learn"]) - - # get upstream data - data = dbt.ref("ml_data_prep").to_pandas() - - # list out covariates we want to use in addition to outcome variable we are modeling - position - covariates = data[['RACE_YEAR','CIRCUIT_NAME','GRID','CONSTRUCTOR_NAME','DRIVER','DRIVERS_AGE_YEARS','DRIVER_CONFIDENCE','CONSTRUCTOR_RELAIBLITY','TOTAL_PIT_STOPS_PER_RACE','ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR', 'POSITION']] - - # filter covariates on active drivers and constructors - # use fil_cov as short for "filtered_covariates" - fil_cov = covariates[(covariates['ACTIVE_DRIVER']==1)&(covariates['ACTIVE_CONSTRUCTOR']==1)] - - # Encode categorical variables using LabelEncoder - # TODO: we'll update this to both ohe in the future for non-ordinal variables! - le = LabelEncoder() - fil_cov['CIRCUIT_NAME'] = le.fit_transform(fil_cov['CIRCUIT_NAME']) - fil_cov['CONSTRUCTOR_NAME'] = le.fit_transform(fil_cov['CONSTRUCTOR_NAME']) - fil_cov['DRIVER'] = le.fit_transform(fil_cov['DRIVER']) - fil_cov['TOTAL_PIT_STOPS_PER_RACE'] = le.fit_transform(fil_cov['TOTAL_PIT_STOPS_PER_RACE']) - - # Simply target variable "position" to represent 3 meaningful categories in Formula1 - # 1. Podium position 2. Points for team 3. Nothing - no podium or points! - def position_index(x): - if x<4: - return 1 - if x>10: - return 3 - else : - return 2 - - # we are dropping the columns that we filtered on in addition to our training variable - encoded_data = fil_cov.drop(['ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR'],1) - encoded_data['POSITION_LABEL']= encoded_data['POSITION'].apply(lambda x: position_index(x)) - encoded_data_grouped_target = encoded_data.drop(['POSITION'],1) - - return encoded_data_grouped_target - ``` -2. Execute the following in the command bar: - ```bash - dbt run --select covariate_encoding - ``` -3. In this code, we are using a ton of functions from libraries! This is really cool, because we can utilize code other people have developed and bring it into our project simply by using the `import` function. [Scikit-learn](https://scikit-learn.org/stable/), “sklearn” for short, is an extremely popular data science library. Sklearn contains a wide range of machine learning techniques, including supervised and unsupervised learning algorithms, feature scaling and imputation, as well as tools model evaluation and selection. We’ll be using Sklearn for both preparing our covariates and creating models (our next section). -4. Our dataset is pretty small data so we are good to use pandas and `sklearn`. If you have larger data for your own project in mind, consider `dask` or `category_encoders`. -5. Breaking it down a bit more: - - We’re selecting a subset of variables that will be used as predictors for a driver’s position. - - Filter the dataset to only include rows using the active driver and constructor flags we created in the last step. - - The next step is to use the `LabelEncoder` from scikit-learn to convert the categorical variables `CIRCUIT_NAME`, `CONSTRUCTOR_NAME`, `DRIVER`, and `TOTAL_PIT_STOPS_PER_RACE` into numerical values. - - Create a new variable called `POSITION_LABEL`, which is a derived from our position variable. - - 💭 Why are we changing our position variable? 
There are 20 total positions in Formula 1 and we are grouping them together to simplify the classification and improve performance. We also want to demonstrate you can create a new function within your dbt model! - - Our new `position_label` variable has meaning: - - In Formula1 if you are in: - - Top 3 you get a “podium” position - - Top 10 you gain points that add to your overall season total - - Below top 10 you get no points! - - We are mapping our original variable position to `position_label` to the corresponding places above to 1,2, and 3 respectively. - - Drop the active driver and constructor flags since they were filter criteria and additionally drop our original position variable. - -## Splitting into training and testing datasets - -Now that we’ve cleaned and encoded our data, we are going to further split in by time. In this step, we will create dataframes to use for training and prediction. We’ll be creating two dataframes 1) using data from 2010-2019 for training, and 2) data from 2020 for new prediction inferences. We’ll create variables called `start_year` and `end_year` so we aren’t filtering on hardcasted values (and can more easily swap them out in the future if we want to retrain our model on different timeframes). - -1. Create a file called `train_test_dataset` copy and save the following code: - ```python - import pandas as pd - - def model(dbt, session): - - # dbt configuration - dbt.config(packages=["pandas"], tags="train") - - # get upstream data - encoding = dbt.ref("covariate_encoding").to_pandas() - - # provide years so we do not hardcode dates in filter command - start_year=2010 - end_year=2019 - - # describe the data for a full decade - train_test_dataset = encoding.loc[encoding['RACE_YEAR'].between(start_year, end_year)] - - return train_test_dataset - ``` - -2. Create a file called `hold_out_dataset_for_prediction` copy and save the following code below. Now we’ll have a dataset with only the year 2020 that we’ll keep as a hold out set that we are going to use similar to a deployment use case. - ```python - import pandas as pd - - def model(dbt, session): - # dbt configuration - dbt.config(packages=["pandas"], tags="predict") - - # get upstream data - encoding = dbt.ref("covariate_encoding").to_pandas() - - # variable for year instead of hardcoding it - year=2020 - - # filter the data based on the specified year - hold_out_dataset = encoding.loc[encoding['RACE_YEAR'] == year] - - return hold_out_dataset - ``` -3. Execute the following in the command bar: - ```bash - dbt run --select train_test_dataset hold_out_dataset_for_prediction - ``` - To run our temporal data split models, we can use this syntax in the command line to run them both at once. Make sure you use a *space* [syntax](/reference/node-selection/syntax) between the model names to indicate you want to run both! -4. **Commit and push** our changes to keep saving our work as we go using `ml data prep and splits` before moving on. - -👏 Now that we’ve finished our machine learning prep work we can move onto the fun part — training and prediction! 
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md deleted file mode 100644 index 8b353a85fa3..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md +++ /dev/null @@ -1,251 +0,0 @@ ---- -title: "Machine Learning: training and prediction " -id: "12-machine-learning-training-prediction" -description: "Machine Learning: training and prediction" ---- - -We’re ready to start training a model to predict the driver’s position. Now is a good time to pause and take a step back and say, usually in ML projects you’ll try multiple algorithms during development and use an evaluation method such as cross validation to determine which algorithm to use. You can definitely do this in your dbt project, but for the content of this lab we’ll have decided on using a logistic regression to predict position (we actually tried some other algorithms using cross validation outside of this lab such as k-nearest neighbors and a support vector classifier but that didn’t perform as well as the logistic regression and a decision tree that overfit). - -There are 3 areas to break down as we go since we are working at the intersection all within one model file: -1. Machine Learning -2. Snowflake and Snowpark -3. dbt Python models - -If you haven’t seen code like this before or use joblib files to save machine learning models, we’ll be going over them at a high level and you can explore the links for more technical in-depth along the way! Because Snowflake and dbt have abstracted away a lot of the nitty gritty about serialization and storing our model object to be called again, we won’t go into too much detail here. There’s *a lot* going on here so take it at your pace! - -## Training and saving a machine learning model - -1. Project organization remains key, so let’s make a new subfolder called `train_predict` under the `ml` folder. -2. Now create a new file called `train_test_position` and copy and save the following code: - - ```python - import snowflake.snowpark.functions as F - from sklearn.model_selection import train_test_split - import pandas as pd - from sklearn.metrics import confusion_matrix, balanced_accuracy_score - import io - from sklearn.linear_model import LogisticRegression - from joblib import dump, load - import joblib - import logging - import sys - from joblib import dump, load - - logger = logging.getLogger("mylog") - - def save_file(session, model, path, dest_filename): - input_stream = io.BytesIO() - joblib.dump(model, input_stream) - session._conn.upload_stream(input_stream, path, dest_filename) - return "successfully created file: " + path - - def model(dbt, session): - dbt.config( - packages = ['numpy','scikit-learn','pandas','numpy','joblib','cachetools'], - materialized = "table", - tags = "train" - ) - # Create a stage in Snowflake to save our model file - session.sql('create or replace stage MODELSTAGE').collect() - - #session._use_scoped_temp_objects = False - version = "1.0" - logger.info('Model training version: ' + version) - - # read in our training and testing upstream dataset - test_train_df = dbt.ref("train_test_dataset") - - # cast snowpark df to pandas df - test_train_pd_df = test_train_df.to_pandas() - target_col = "POSITION_LABEL" - - # split out covariate predictors, x, from our target column position_label, y. 
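    # (added note) The next few lines separate the predictors from the target and then call
    # train_test_split: X_train/X_test hold the feature columns, y_train/y_test hold
    # POSITION_LABEL, with train_size=0.7 giving a 70/30 split and random_state=42
    # keeping the split reproducible between runs.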
- split_X = test_train_pd_df.drop([target_col], axis=1) - split_y = test_train_pd_df[target_col] - - # Split out our training and test data into proportions - X_train, X_test, y_train, y_test = train_test_split(split_X, split_y, train_size=0.7, random_state=42) - train = [X_train, y_train] - test = [X_test, y_test] - # now we are only training our one model to deploy - # we are keeping the focus on the workflows and not algorithms for this lab! - model = LogisticRegression() - - # fit the preprocessing pipeline and the model together - model.fit(X_train, y_train) - y_pred = model.predict_proba(X_test)[:,1] - predictions = [round(value) for value in y_pred] - balanced_accuracy = balanced_accuracy_score(y_test, predictions) - - # Save the model to a stage - save_file(session, model, "@MODELSTAGE/driver_position_"+version, "driver_position_"+version+".joblib" ) - logger.info('Model artifact:' + "@MODELSTAGE/driver_position_"+version+".joblib") - - # Take our pandas training and testing dataframes and put them back into snowpark dataframes - snowpark_train_df = session.write_pandas(pd.concat(train, axis=1, join='inner'), "train_table", auto_create_table=True, create_temp_table=True) - snowpark_test_df = session.write_pandas(pd.concat(test, axis=1, join='inner'), "test_table", auto_create_table=True, create_temp_table=True) - - # Union our training and testing data together and add a column indicating train vs test rows - return snowpark_train_df.with_column("DATASET_TYPE", F.lit("train")).union(snowpark_test_df.with_column("DATASET_TYPE", F.lit("test"))) - ``` - -3. Execute the following in the command bar: - ```bash - dbt run --select train_test_position - ``` -4. Breaking down our Python script here: - - We’re importing some helpful libraries. - - Defining a function called `save_file()` that takes four parameters: `session`, `model`, `path` and `dest_filename` that will save our logistic regression model file. - - `session` — an object representing a connection to Snowflake. - - `model` — an object that needs to be saved. In this case, it's a Python object that is a scikit-learn that can be serialized with joblib. - - `path` — a string representing the directory or bucket location where the file should be saved. - - `dest_filename` — a string representing the desired name of the file. - - Creating our dbt model - - Within this model we are creating a stage called `MODELSTAGE` to place our logistic regression `joblib` model file. This is really important since we need a place to keep our model to reuse and want to ensure it's there. When using Snowpark commands, it's common to see the `.collect()` method to ensure the action is performed. Think of the session as our “start” and collect as our “end” when [working with Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) (you can use other ending methods other than collect). - - Using `.ref()` to connect into our `train_test_dataset` model. - - Now we see the machine learning part of our analysis: - - Create new dataframes for our prediction features from our target variable `position_label`. - - Split our dataset into 70% training (and 30% testing), train_size=0.7 with a `random_state` specified to have repeatable results. - - Specify our model is a logistic regression. - - Fit our model. In a logistic regression this means finding the coefficients that will give the least classification error. 
- - Round our predictions to the nearest integer, since logistic regression creates a probability between 0 and 1 for each class, and calculate a balanced accuracy to account for imbalances in the target variable. - - Right now our model is only in memory, so we need to use our nifty function `save_file` to save our model file to our Snowflake stage. We save our model as a joblib file so Snowpark can easily call this model object back to create predictions. We really don’t need to know much else as a data practitioner unless we want to. It’s worth noting that joblib files aren’t able to be queried directly by SQL. To do this, we would need to transform the joblib file to an SQL queryable format such as JSON or CSV (out of scope for this workshop). - - Finally, we want to return our dataframe, but first create a new column indicating which rows were used for training and which for testing. -5. Viewing our output of this model: - - -6. Let’s pop back over to Snowflake and check that our logistic regression model has been stored in our `MODELSTAGE` using the command: - ```sql - list @modelstage - ``` - - -7. To investigate the commands run as part of the `train_test_position` script, navigate to the Snowflake query history to view it under **Activity > Query History**. We can view the portions of the query that we wrote such as `create or replace stage MODELSTAGE`, but we also see additional queries that Snowflake uses to interpret Python code. - - -## Predicting on new data - -1. Create a new file called `predict_position` and copy and save the following code: - ```python - import logging - import joblib - import pandas as pd - import os - from snowflake.snowpark import types as T - - DB_STAGE = 'MODELSTAGE' - version = '1.0' - # The name of the model file - model_file_path = 'driver_position_'+version - model_file_packaged = 'driver_position_'+version+'.joblib' - - # This is a local directory, used for storing the various artifacts locally - LOCAL_TEMP_DIR = f'/tmp/driver_position' - DOWNLOAD_DIR = os.path.join(LOCAL_TEMP_DIR, 'download') - TARGET_MODEL_DIR_PATH = os.path.join(LOCAL_TEMP_DIR, 'ml_model') - TARGET_LIB_PATH = os.path.join(LOCAL_TEMP_DIR, 'lib') - - # The feature columns that were used during model training - # and that will be used during prediction - FEATURE_COLS = [ - "RACE_YEAR" - ,"CIRCUIT_NAME" - ,"GRID" - ,"CONSTRUCTOR_NAME" - ,"DRIVER" - ,"DRIVERS_AGE_YEARS" - ,"DRIVER_CONFIDENCE" - ,"CONSTRUCTOR_RELAIBLITY" - ,"TOTAL_PIT_STOPS_PER_RACE"] - - def register_udf_for_prediction(p_predictor ,p_session ,p_dbt): - - # The prediction udf - - def predict_position(p_df: T.PandasDataFrame[int, int, int, int, - int, int, int, int, int]) -> T.PandasSeries[int]: - # Snowpark currently does not set the column name in the input dataframe - # The default col names are like 0,1,2,... Hence we need to reset the column - # names to the features that we initially used for training. - p_df.columns = [*FEATURE_COLS] - - # Perform prediction.
This returns an array object - pred_array = p_predictor.predict(p_df) - # Convert to series - df_predicted = pd.Series(pred_array) - return df_predicted - - # The list of packages that will be used by UDF - udf_packages = p_dbt.config.get('packages') - - predict_position_udf = p_session.udf.register( - predict_position - ,name=f'predict_position' - ,packages = udf_packages - ) - return predict_position_udf - - def download_models_and_libs_from_stage(p_session): - p_session.file.get(f'@{DB_STAGE}/{model_file_path}/{model_file_packaged}', DOWNLOAD_DIR) - - def load_model(p_session): - # Load the model and initialize the predictor - model_fl_path = os.path.join(DOWNLOAD_DIR, model_file_packaged) - predictor = joblib.load(model_fl_path) - return predictor - - # ------------------------------- - def model(dbt, session): - dbt.config( - packages = ['snowflake-snowpark-python' ,'scipy','scikit-learn' ,'pandas' ,'numpy'], - materialized = "table", - tags = "predict" - ) - session._use_scoped_temp_objects = False - download_models_and_libs_from_stage(session) - predictor = load_model(session) - predict_position_udf = register_udf_for_prediction(predictor, session ,dbt) - - # Retrieve the data, and perform the prediction - hold_out_df = (dbt.ref("hold_out_dataset_for_prediction") - .select(*FEATURE_COLS) - ) - - # Perform prediction. - new_predictions_df = hold_out_df.withColumn("position_predicted" - ,predict_position_udf(*FEATURE_COLS) - ) - - return new_predictions_df - ``` -2. Execute the following in the command bar: - ```bash - dbt run --select predict_position - ``` -3. **Commit and push** our changes to keep saving our work as we go using the commit message `logistic regression model training and application` before moving on. -4. At a high level in this script, we are: - - Retrieving our staged logistic regression model - - Loading the model in - - Placing the model within a user defined function (UDF) to call inline predictions on our driver’s position -5. At a more detailed level: - - Import our libraries. - - Create variables to reference back to the `MODELSTAGE` we just created and stored our model to. - - The temporary file paths we created might look intimidating, but all we’re doing here is programmatically using an initial file path and adding to it to create the following directories: - - LOCAL_TEMP_DIR ➡️ /tmp/driver_position - - DOWNLOAD_DIR ➡️ /tmp/driver_position/download - - TARGET_MODEL_DIR_PATH ➡️ /tmp/driver_position/ml_model - - TARGET_LIB_PATH ➡️ /tmp/driver_position/lib - - Provide a list of our feature columns that we used for model training and will now be used on new data for prediction. - - Next, we are creating our main function `register_udf_for_prediction(p_predictor ,p_session ,p_dbt):`. This function is used to register a user-defined function (UDF) that performs the machine learning prediction. It takes three parameters: `p_predictor` is an instance of the machine learning model, `p_session` is an instance of the Snowflake session, and `p_dbt` is an instance of the dbt library. The function creates a UDF named `predict_position` which takes a pandas dataframe with the input features and returns a pandas series with the predictions. - - ⚠️ Pay close attention to the whitespace here. We are using a function within a function for this script (the short sketch below illustrates the general pattern).
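  If defining a function inside another function is unfamiliar, here is a tiny, generic sketch of the pattern (illustrative only, not the lab's code): the outer function builds and returns an inner function, just as `register_udf_for_prediction` defines `predict_position` inside itself before registering it.

  ```python
  # Generic sketch of a "function within a function" (hypothetical example)
  def make_greeter(greeting):
      # the inner function is defined inside the outer function's scope
      def greet(name):
          # it can read variables from the enclosing function, like `greeting`
          return f"{greeting}, {name}!"
      # the outer function hands the inner function back to the caller
      return greet

  say_hello = make_greeter("Hello")
  print(say_hello("dbt"))  # Hello, dbt!
  ```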
- - We have 2 simple functions that programmatically retrieve our file paths: `download_models_and_libs_from_stage` first gets our stored model out of our `MODELSTAGE` and downloads it into the session, and `load_model` then loads the contents of our model (its parameters) to use for prediction. - - Take the model we loaded in and call it `predictor` and wrap it in a UDF. - - Return our dataframe with both the features used to predict and the new label. - -🧠 Another way to read this script is from the bottom up. This can help us progressively see what is going into our final dbt model and work backwards to see how the other functions are being referenced. - -6. Let’s take a look at our predicted position alongside our feature variables. Open a new scratchpad and use the following query. I chose to order by the prediction of who would obtain a podium position: - ```sql - select * from {{ ref('predict_position') }} order by position_predicted - ``` -7. Now that we can see the predictions we created in our final dataset, we are ready to move on to testing! diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md deleted file mode 100644 index bcda9a775fb..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md +++ /dev/null @@ -1,136 +0,0 @@ ---- -title: "Testing" -id: "13-testing" -description: "Testing" ---- -We have now completed building all the models for today’s lab, but how do we know if they meet our assertions? Put another way, how do we know whether the quality of our data models is any good? This brings us to testing! - -We test data models for mainly two reasons: - -- Ensure that our source data is clean on ingestion before we start data modeling/transformation (aka avoiding the garbage in, garbage out problem). -- Make sure we don’t introduce bugs in the transformation code we wrote (stop ourselves from creating bad joins/fanouts). - -Testing in dbt comes in two flavors: [generic](/docs/build/tests#generic-tests) and [singular](/docs/build/tests#singular-tests). - -You define them in a test block (similar to a macro) and once defined, you can reference them by name in your `.yml` files (applying them to models, columns, sources, snapshots, and seeds). - -You might be wondering: *what about testing Python models?* - -Since our Python models output tables, we can test SQL and Python models the same way! We don’t have to worry about any syntax differences when testing SQL versus Python data models. This means we use `.yml` and `.sql` files to test our entities (tables, views, etc.). Under the hood, dbt is running an SQL query on our tables to see if they meet our assertions. If no rows are returned, dbt will surface a passed test. Conversely, if a test results in returned rows, it will fail or warn depending on the configuration (more on that later). - -## Generic tests - -1. To implement the generic out-of-the-box tests that dbt comes with, we can use YAML files to specify information about our models. To add generic tests to our aggregates model, create a file called `aggregates.yml`, copy the code block below into the file, and save. - - - ```yaml - version: 2 - - models: - - name: fastest_pit_stops_by_constructor - description: Use the python .describe() method to retrieve summary statistics table about pit stops by constructor. Sort by average stop time ascending so the first row returns the fastest constructor. 
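# note: generic tests can also be declared at the model level here (under a `tests:` key), in addition to the column-level tests listed below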
- columns: - - name: constructor_name - description: team that makes the car - tests: - - unique - - - name: lap_times_moving_avg - description: Use the python .rolling() method to calculate the 5 year rolling average of pit stop times alongside the average for each year. - columns: - - name: race_year - description: year of the race - tests: - - relationships: - to: ref('int_lap_times_years') - field: race_year - ``` - -2. Let’s unpack the code we have here. We have both our aggregates models with the model name to know the object we are referencing and the description of the model that we’ll populate in our documentation. At the column level (a level below our model), we are providing the column name followed by our tests. We want to ensure our `constructor_name` is unique since we used a pandas `groupby` on `constructor_name` in the model `fastest_pit_stops_by_constructor`. Next, we want to ensure our `race_year` has referential integrity from the model we selected from `int_lap_times_years` into our subsequent `lap_times_moving_avg` model. -3. Finally, if we want to see how tests were deployed on sources and SQL models, we can look at other files in our project such as the `f1_sources.yml` we created in our Sources and staging section. - -## Using macros for testing - -1. Under your `macros` folder, create a new file and name it `test_all_values_gte_zero.sql`. Copy the code block below and save the file. For clarity, “gte” is an abbreviation for greater than or equal to. - - - ```sql - {% macro test_all_values_gte_zero(table, column) %} - - select * from {{ ref(table) }} where {{ column }} < 0 - - {% endmacro %} - ``` - -2. Macros in Jinja are pieces of code that can be reused multiple times in our SQL models — they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models. -3. We use the `{% macro %}` to indicate the start of the macro and `{% endmacro %}` for the end. The text after the beginning of the macro block is the name we are giving the macro to later call it. In this case, our macro is called `test_all_values_gte_zero`. Macros take in *arguments* to pass through, in this case the `table` and the `column`. In the body of the macro, we see an SQL statement that is using the `ref` function to dynamically select the table and then the column. You can always view macros without having to run them by using `dbt run-operation`. You can learn more [here](https://docs.getdbt.com/reference/commands/run-operation). -4. Great, now we want to reference this macro as a test! Let’s create a new test file called `macro_pit_stops_mean_is_positive.sql` in our `tests` folder. - - - -5. Copy the following code into the file and save: - - ```sql - {{ - config( - enabled=true, - severity='warn', - tags = ['bi'] - ) - }} - - {{ test_all_values_gte_zero('fastest_pit_stops_by_constructor', 'mean') }} - ``` - -6. In our testing file, we are applying some configurations to the test including `enabled`, which is an optional configuration for disabling models, seeds, snapshots, and tests. Our severity is set to `warn` instead of `error`, which means our pipeline will still continue to run. We have tagged our test with `bi` since we are applying this test to one of our bi models. - -Then, in our final line, we are calling the `test_all_values_gte_zero` macro that takes in our table and column arguments and inputting our table `'fastest_pit_stops_by_constructor'` and the column `'mean'`. 
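To build intuition for what dbt executes when this test runs, the compiled query is roughly the macro body with our two arguments substituted in and the `ref` resolved to a real table. A minimal sketch is below; the database and schema names are placeholders that will resolve to your own target (for example, your development schema): - ```sql - -- rough sketch of the compiled singular test; your database and schema will differ - select * - from <your_database>.<your_schema>.fastest_pit_stops_by_constructor - where mean < 0 - ``` - Because dbt counts returned rows as failures, this test passes when the query returns zero rows and, given our `severity: warn` configuration, warns if any negative `mean` values are found.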
- -## Custom singular tests to validate Python models - -The simplest way to define a test is by writing the exact SQL that will return failing records. We call these "singular" tests, because they're one-off assertions usable for a single purpose. - -These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your test-paths config). You can use Jinja in SQL models (including ref and source) in the test definition, just like you can when creating models. Each `.sql` file contains one select statement, and it defines one test. - -Let’s add a custom test that asserts that the moving average of the lap time over the last 5 years is greater than zero (it’s impossible to have time less than 0!). It is easy to assume if this is not the case the data has been corrupted. - -1. Create a file `lap_times_moving_avg_assert_positive_or_null.sql` under the `tests` folder. - - -2. Copy the following code and save the file: - - ```sql - {{ - config( - enabled=true, - severity='error', - tags = ['bi'] - ) - }} - - with lap_times_moving_avg as ( select * from {{ ref('lap_times_moving_avg') }} ) - - select * - from lap_times_moving_avg - where lap_moving_avg_5_years < 0 and lap_moving_avg_5_years is not null - ``` - -## Putting all our tests together - -1. Time to run our tests! Altogether, we have created 4 tests for our 2 Python models: - - `fastest_pit_stops_by_constructor` - - Unique `constructor_name` - - Lap times are greater than 0 or null (to allow for the first leading values in a rolling calculation) - - `lap_times_moving_avg` - - Referential test on `race_year` - - Mean pit stop times are greater than or equal to 0 (no negative time values) -2. To run the tests on both our models, we can use this syntax in the command line to run them both at once, similar to how we did our data splits earlier. - Execute the following in the command bar: - ```bash - dbt test --select fastest_pit_stops_by_constructor lap_times_moving_avg - ``` - - -3. All 4 of our tests passed (yay for clean data)! To understand the SQL being run against each of our tables, we can click into the details of the test. -4. Navigating into the **Details** of the `unique_fastest_pit_stops_by_constructor_name`, we can see that each line `constructor_name` should only have one row. - \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md deleted file mode 100644 index 95ec8ad242f..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "Documentation" -id: "14-documentation" -description: "Documentation" ---- -When it comes to documentation, dbt brings together both column and model level descriptions that you can provide as well as details from your Snowflake information schema in a static site for consumption by other data team members and stakeholders. - -We are going to revisit 2 areas of our project to understand our documentation: - -- `intermediate.md` file -- `dbt_project.yml` file - -To start, let’s look back at our `intermediate.md` file. We can see that we provided multi-line descriptions for the models in our intermediate models using [docs blocks](/docs/collaborate/documentation#using-docs-blocks). Then we reference these docs blocks in our `.yml` file. 
Building descriptions with doc blocks in Markdown files gives you the ability to format your descriptions with Markdown and is particularly helpful when building long descriptions, either at the column or model level. In our `dbt_project.yml`, we added `node_colors` at folder levels. - -1. To see all these pieces come together, execute this in the command bar: - ```bash - dbt docs generate - ``` - This will generate the documentation for your project. Click the book button, as shown in the screenshot below, to access the docs. - - -2. Go to our project area and view `int_results`. View the description that we created in our doc block. - - -3. View the mini-lineage that looks at the model we are currently selected on (`int_results` in this case). - - -4. In our `dbt_project.yml`, we configured `node_colors` depending on the file directory. Starting in dbt v1.3, these colors appear in the lineage in our docs. Color coding your project can help you cluster together similar models or steps and troubleshoot more easily. - \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md deleted file mode 100644 index d9cedb60861..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: "Deployment" -id: "15-deployment" -description: "Deployment" ---- - -Before we jump into deploying our code, let's have a quick primer on environments. Up to this point, all of the work we've done in the dbt Cloud IDE has been in our development environment, with code committed to a feature branch and the models we've built created in our development schema in Snowflake as defined in our Development environment connection. Doing this work on a feature branch allows us to separate our code from what other coworkers are building and code that is already deemed production ready. Building models in a development schema in Snowflake allows us to separate the database objects we might still be modifying and testing from the database objects running production dashboards or other downstream dependencies. Together, the combination of a Git branch and Snowflake database objects forms our environment. - -Now that we've completed testing and documenting our work, we're ready to deploy our code from our development environment to our production environment, and this involves two steps: - -- Promoting code from our feature branch to the production branch in our repository. - - Generally, the production branch is going to be named your main branch, and there's a review process to go through before merging code to the main branch of a repository. Here we are going to merge without review for ease of this workshop. -- Deploying code to our production environment. - - Once our code is merged to the main branch, we'll need to run dbt in our production environment to build all of our models and run all of our tests. This will allow us to build production-ready objects into our production environment in Snowflake. Luckily for us, the Partner Connect flow has already created our deployment environment and job to facilitate this step. - -1. Before getting started, let's make sure that we've committed all of our work to our feature branch. If you still have work to commit, you'll be able to select **Commit and push**, provide a message, and then select **Commit** again. -2. 
Once all of your work is committed, the git workflow button will now appear as **Merge to main**. Select **Merge to main** and the merge process will automatically run in the background. - - -3. When it's completed, you should see the git button read **Create branch** and the branch you're currently looking at will become **main**. -4. Now that all of our development work has been merged to the main branch, we can build our deployment job. Given that our production environment and production job were created automatically for us through Partner Connect, all we need to do here is update some default configurations to meet our needs. -5. In the menu, select **Deploy** **> Environments** - - -6. You should see two environments listed and you'll want to select the **Deployment** environment then **Settings** to modify it. -7. Before making any changes, let's touch on what is defined within this environment. The Snowflake connection shows the credentials that dbt Cloud is using for this environment and in our case they are the same as what was created for us through Partner Connect. Our deployment job will build in our `PC_DBT_DB` database and use the default Partner Connect role and warehouse to do so. The deployment credentials section also uses the info that was created in our Partner Connect job to create the credential connection. However, it is using the same default schema that we've been using as the schema for our development environment. -8. Let's update the schema to create a new schema specifically for our production environment. Click **Edit** to allow you to modify the existing field values. Navigate to **Deployment Credentials >** **schema.** -9. Update the schema name to **production**. Remember to select **Save** after you've made the change. - -10. By updating the schema for our production environment to **production**, it ensures that our deployment job for this environment will build our dbt models in the **production** schema within the `PC_DBT_DB` database as defined in the Snowflake Connection section. -11. Now let's switch over to our production job. Click on the deploy tab again and then select **Jobs**. You should see an existing and preconfigured **Partner Connect Trial Job**. Similar to the environment, click on the job, then select **Settings** to modify it. Let's take a look at the job to understand it before making changes. - - - The Environment section is what connects this job with the environment we want it to run in. This job is already defaulted to use the Deployment environment that we just updated and the rest of the settings we can keep as is. - - The Execution settings section gives us the option to generate docs, run source freshness, and defer to a previous run state. For the purposes of our lab, we're going to keep these settings as is as well and stick with just generating docs. - - The Commands section is where we specify exactly which commands we want to run during this job, and we also want to keep this as is. We want our seed to be uploaded first, then run our models, and finally test them. The order of this is important as well, considering that we need our seed to be created before we can run our incremental model, and we need our models to be created before we can test them. - - Finally, we have the Triggers section, where we have a number of different options for scheduling our job. Given that our data isn't updating regularly here and we're running this job manually for now, we're also going to leave this section alone. 
- - So, what are we changing then? Just the name! Click **Edit** to allow you to make changes. Then update the name of the job to **Production Job** to denote this as our production deployment job. After that's done, click **Save**. -12. Now let's go run our job. Clicking on the job name in the path at the top of the screen will take you back to the job run history page where you'll be able to click **Run now** to kick off the job. If you encounter any job failures, try running the job again before further troubleshooting. - - - -13. Let's go over to Snowflake to confirm that everything built as expected in our production schema. Refresh the database objects in your Snowflake account and you should see the production schema now within our default Partner Connect database. If you click into the schema and everything ran successfully, you should be able to see all of the models we developed. - - -## Conclusion - -Fantastic! You’ve finished the workshop! We hope you feel empowered in using both SQL and Python in your dbt Cloud workflows with Snowflake. Having a reliable pipeline to surface both analytics and machine learning is crucial to creating tangible business value from your data. - -For more help and information, join our [dbt community Slack](https://www.getdbt.com/community/), which contains more than 50,000 data practitioners today. We have a dedicated Slack channel, #db-snowflake, for Snowflake-related content. Happy dbt'ing! \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md deleted file mode 100644 index e864c363a44..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: "Configure Snowflake" -id: "2-snowflake-configuration" -description: "Configure Snowflake" ---- - - -1. Log in to your trial Snowflake account. You can [sign up for a Snowflake Trial Account using this form](https://signup.snowflake.com/) if you don’t have one. -2. Ensure that your account is set up using **AWS** in the **US East (N. Virginia)**. We will be copying the data from a public AWS S3 bucket hosted by dbt Labs in the us-east-1 region. By ensuring our Snowflake environment setup matches our bucket region, we avoid any multi-region data copy and retrieval latency issues. - - - -3. After creating your account and verifying it from your sign-up email, Snowflake will direct you back to the UI called Snowsight. - -4. When Snowsight first opens, your window should look like the following, with you logged in as the ACCOUNTADMIN with demo worksheets open: - - - - -5. Navigate to **Admin > Billing & Terms**. Click **Enable > Acknowledge & Continue** to enable Anaconda Python Packages to run in Snowflake. - - - - - -6. Finally, create a new Worksheet by selecting **+ Worksheet** in the upper right corner. 
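Before moving on, you can optionally paste a quick sanity check into your new worksheet to confirm your account matches what the workshop expects. This query is not part of the workshop scripts; it only uses Snowflake's built-in `current_region()` and `current_role()` functions: - ```sql - -- optional sanity check: confirm region and role before loading data - -- current_region() should return an AWS US East region (for example, AWS_US_EAST_1) - -- current_role() should return ACCOUNTADMIN for this workshop - select current_region() as account_region, current_role() as active_role; - ```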
- diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md deleted file mode 100644 index 9a41e7f45c5..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md +++ /dev/null @@ -1,192 +0,0 @@ ---- -title: "Connect to data source" -id: "3-connect-to-data-source" -description: "Connect to data source" ---- - -We need to obtain our data source by copying our Formula 1 data into Snowflake tables from a public S3 bucket that dbt Labs hosts. - -1. When a new Snowflake account is created, there should be a preconfigured warehouse in your account named `COMPUTE_WH`. -2. If for any reason your account doesn’t have this warehouse, we can create a warehouse using the following script: - - ```sql - create or replace warehouse COMPUTE_WH with warehouse_size=XSMALL - ``` -3. Rename the worksheet to `data setup script` since we will be placing code in this worksheet to ingest the Formula 1 data. Make sure you are still logged in as the **ACCOUNTADMIN** and select the **COMPUTE_WH** warehouse. - - - -4. Copy the following code into the main body of the Snowflake worksheet. You can also find this setup script under the `setup` folder in the [Git repository](https://github.com/dbt-labs/python-snowpark-formula1/blob/main/setup/setup_script_s3_to_snowflake.sql). The script is long since it's bring in all of the data we'll need today! - - ```sql - -- create and define our formula1 database - create or replace database formula1; - use database formula1; - create or replace schema raw; - use schema raw; - - -- define our file format for reading in the csvs - create or replace file format csvformat - type = csv - field_delimiter =',' - field_optionally_enclosed_by = '"', - skip_header=1; - - -- - create or replace stage formula1_stage - file_format = csvformat - url = 's3://formula1-dbt-cloud-python-demo/formula1-kaggle-data/'; - - -- load in the 8 tables we need for our demo - -- we are first creating the table then copying our data in from s3 - -- think of this as an empty container or shell that we are then filling - create or replace table formula1.raw.circuits ( - CIRCUITID NUMBER(38,0), - CIRCUITREF VARCHAR(16777216), - NAME VARCHAR(16777216), - LOCATION VARCHAR(16777216), - COUNTRY VARCHAR(16777216), - LAT FLOAT, - LNG FLOAT, - ALT NUMBER(38,0), - URL VARCHAR(16777216) - ); - -- copy our data from public s3 bucket into our tables - copy into circuits - from @formula1_stage/circuits.csv - on_error='continue'; - - create or replace table formula1.raw.constructors ( - CONSTRUCTORID NUMBER(38,0), - CONSTRUCTORREF VARCHAR(16777216), - NAME VARCHAR(16777216), - NATIONALITY VARCHAR(16777216), - URL VARCHAR(16777216) - ); - copy into constructors - from @formula1_stage/constructors.csv - on_error='continue'; - - create or replace table formula1.raw.drivers ( - DRIVERID NUMBER(38,0), - DRIVERREF VARCHAR(16777216), - NUMBER VARCHAR(16777216), - CODE VARCHAR(16777216), - FORENAME VARCHAR(16777216), - SURNAME VARCHAR(16777216), - DOB DATE, - NATIONALITY VARCHAR(16777216), - URL VARCHAR(16777216) - ); - copy into drivers - from @formula1_stage/drivers.csv - on_error='continue'; - - create or replace table formula1.raw.lap_times ( - RACEID NUMBER(38,0), - DRIVERID NUMBER(38,0), - LAP NUMBER(38,0), - POSITION FLOAT, - TIME VARCHAR(16777216), - MILLISECONDS NUMBER(38,0) - ); - copy into lap_times - from @formula1_stage/lap_times.csv - on_error='continue'; - - 
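-- the four remaining tables (pit_stops, races, results, and status) follow the same pattern: - -- create the empty, typed table first, then copy the matching csv in from the stage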
create or replace table formula1.raw.pit_stops ( - RACEID NUMBER(38,0), - DRIVERID NUMBER(38,0), - STOP NUMBER(38,0), - LAP NUMBER(38,0), - TIME VARCHAR(16777216), - DURATION VARCHAR(16777216), - MILLISECONDS NUMBER(38,0) - ); - copy into pit_stops - from @formula1_stage/pit_stops.csv - on_error='continue'; - - create or replace table formula1.raw.races ( - RACEID NUMBER(38,0), - YEAR NUMBER(38,0), - ROUND NUMBER(38,0), - CIRCUITID NUMBER(38,0), - NAME VARCHAR(16777216), - DATE DATE, - TIME VARCHAR(16777216), - URL VARCHAR(16777216), - FP1_DATE VARCHAR(16777216), - FP1_TIME VARCHAR(16777216), - FP2_DATE VARCHAR(16777216), - FP2_TIME VARCHAR(16777216), - FP3_DATE VARCHAR(16777216), - FP3_TIME VARCHAR(16777216), - QUALI_DATE VARCHAR(16777216), - QUALI_TIME VARCHAR(16777216), - SPRINT_DATE VARCHAR(16777216), - SPRINT_TIME VARCHAR(16777216) - ); - copy into races - from @formula1_stage/races.csv - on_error='continue'; - - create or replace table formula1.raw.results ( - RESULTID NUMBER(38,0), - RACEID NUMBER(38,0), - DRIVERID NUMBER(38,0), - CONSTRUCTORID NUMBER(38,0), - NUMBER NUMBER(38,0), - GRID NUMBER(38,0), - POSITION FLOAT, - POSITIONTEXT VARCHAR(16777216), - POSITIONORDER NUMBER(38,0), - POINTS NUMBER(38,0), - LAPS NUMBER(38,0), - TIME VARCHAR(16777216), - MILLISECONDS NUMBER(38,0), - FASTESTLAP NUMBER(38,0), - RANK NUMBER(38,0), - FASTESTLAPTIME VARCHAR(16777216), - FASTESTLAPSPEED FLOAT, - STATUSID NUMBER(38,0) - ); - copy into results - from @formula1_stage/results.csv - on_error='continue'; - - create or replace table formula1.raw.status ( - STATUSID NUMBER(38,0), - STATUS VARCHAR(16777216) - ); - copy into status - from @formula1_stage/status.csv - on_error='continue'; - - ``` -5. Ensure all the commands are selected before running the query — an easy way to do this is to use Ctrl-a to highlight all of the code in the worksheet. Select **run** (blue triangle icon). Notice how the dot next to your **COMPUTE_WH** turns from gray to green as you run the query. The **status** table is the final table of all 8 tables loaded in. - - - -6. Let’s unpack that pretty long query we ran into component parts. We ran this query to load in our 8 Formula 1 tables from a public S3 bucket. To do this, we: - - Created a new database called `formula1` and a schema called `raw` to place our raw (untransformed) data into. - - Defined our file format for our CSV files. Importantly, here we use a parameter called `field_optionally_enclosed_by =` since the string columns in our Formula 1 csv files use quotes. Quotes are used around string values to avoid parsing issues where commas `,` and new lines `/n` in data values could cause data loading errors. - - Created a stage to locate our data we are going to load in. Snowflake Stages are locations where data files are stored. Stages are used to both load and unload data to and from Snowflake locations. Here we are using an external stage, by referencing an S3 bucket. - - Created our tables for our data to be copied into. These are empty tables with the column name and data type. Think of this as creating an empty container that the data will then fill into. - - Used the `copy into` statement for each of our tables. We reference our staged location we created and upon loading errors continue to load in the rest of the data. You should not have data loading errors but if you do, those rows will be skipped and Snowflake will tell you which rows caused errors - -7. Now let's take a look at some of our cool Formula 1 data we just loaded up! - 1. 
Create a new worksheet by selecting the **+** then **New Worksheet**. - - 2. Navigate to **Database > Formula1 > RAW > Tables**. - 3. Query the data using the following code. There are only 76 rows in the circuits table, so we don’t need to worry about limiting the amount of data we query. - ```sql - select * from formula1.raw.circuits - ``` - 4. Run the query. From here on out, we’ll use the keyboard shortcuts Command-Enter or Control-Enter to run queries and won’t explicitly call out this step. - 5. Review the query results, you should see information about Formula 1 circuits, starting with Albert Park in Australia! - 6. Finally, ensure you have all 8 tables starting with `CIRCUITS` and ending with `STATUS`. Now we are ready to connect into dbt Cloud! - - - - \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md deleted file mode 100644 index 21eaa7e8d7f..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: "Configure dbt" -id: "4-configure-dbt" -description: "Configure dbt" ---- - -1. We are going to be using [Snowflake Partner Connect](https://docs.snowflake.com/en/user-guide/ecosystem-partner-connect.html) to set up a dbt Cloud account. Using this method will allow you to spin up a fully fledged dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [managed repository](/docs/collaborate/git/managed-repository), environments, and credentials already established. -2. Navigate out of your worksheet back by selecting **home**. -3. In Snowsight, confirm that you are using the **ACCOUNTADMIN** role. -4. Navigate to the **Admin** **> Partner Connect**. Find **dbt** either by using the search bar or navigating the **Data Integration**. Select the **dbt** tile. - -5. You should now see a new window that says **Connect to dbt**. Select **Optional Grant** and add the `FORMULA1` database. This will grant access for your new dbt user role to the FORMULA1 database. - - -6. Ensure the `FORMULA1` is present in your optional grant before clicking **Connect**.  This will create a dedicated dbt user, database, warehouse, and role for your dbt Cloud trial. - - - -7. When you see the **Your partner account has been created** window, click **Activate**. - -8. You should be redirected to a dbt Cloud registration page. Fill out the form. Make sure to save the password somewhere for login in the future. - - - -9. Select **Complete Registration**. You should now be redirected to your dbt Cloud account, complete with a connection to your Snowflake account, a deployment and a development environment, and a sample job. - -10. To help you version control your dbt project, we have connected it to a [managed repository](/docs/collaborate/git/managed-repository), which means that dbt Labs will be hosting your repository for you. This will give you access to a Git workflow without you having to create and host the repository yourself. You will not need to know Git for this workshop; dbt Cloud will help guide you through the workflow. In the future, when you’re developing your own project, [feel free to use your own repository](/docs/cloud/git/connect-github). This will allow you to learn more about features like [Slim CI](/docs/deploy/continuous-integration) builds after this workshop. 
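If you ever need to grant your dbt role access to another database after the fact (for example, if you skipped the optional grant above), you can do it manually from a Snowflake worksheet. The sketch below is a minimal example of the kind of access involved and assumes the default role name that Partner Connect creates, `PC_DBT_ROLE`; adjust the names if yours differ: - ```sql - -- minimal sketch: give the dbt role read access to the formula1 database - -- assumes the default Partner Connect role name PC_DBT_ROLE - grant usage on database formula1 to role pc_dbt_role; - grant usage on all schemas in database formula1 to role pc_dbt_role; - grant select on all tables in database formula1 to role pc_dbt_role; - ``` - You would run these as ACCOUNTADMIN (or another role with the privilege to manage grants on the `formula1` database).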
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md deleted file mode 100644 index f098c47bdad..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: "Development schema name and IDE walkthrough" -id: "5-development-schema-name" -description: "Development schema name and IDE walkthrough" ---- - -1. First we are going to change the name of our default schema to where our dbt models will build. By default, the name is `dbt_`. We will change this to `dbt_` to create your own personal development schema. To do this, select **Profile Settings** from the gear icon in the upper right. - - - -2. Navigate to the **Credentials** menu and select **Partner Connect Trial**, which will expand the credentials menu. - - - -3. Click **Edit** and change the name of your schema from `dbt_` to `dbt_YOUR_NAME` replacing `YOUR_NAME` with your initials and name (`hwatson` is used in the lab screenshots). Be sure to click **Save** for your changes! - - -4. We now have our own personal development schema, amazing! When we run our first dbt models they will build into this schema. -5. Let’s open up dbt Cloud’s Integrated Development Environment (IDE) and familiarize ourselves. Choose **Develop** at the top of the UI. - -6. When the IDE is done loading, click **Initialize dbt project**. The initialization process creates a collection of files and folders necessary to run your dbt project. - - -7. After the initialization is finished, you can view the files and folders in the file tree menu. As we move through the workshop we'll be sure to touch on a few key files and folders that we'll work with to build out our project. -8. Next click **Commit and push** to commit the new files and folders from the initialize step. We always want our commit messages to be relevant to the work we're committing, so be sure to provide a message like `initialize project` and select **Commit Changes**. - - - - - -9. [Committing](https://www.atlassian.com/git/tutorials/saving-changes/git-commit) your work here will save it to the managed git repository that was created during the Partner Connect signup. This initial commit is the only commit that will be made directly to our `main` branch and from *here on out we'll be doing all of our work on a development branch*. This allows us to keep our development work separate from our production code. -10. There are a couple of key features to point out about the IDE before we get to work. It is a text editor, an SQL and Python runner, and a CLI with Git version control all baked into one package! This allows you to focus on editing your SQL and Python files, previewing the results with the SQL runner (it even runs Jinja!), and building models at the command line without having to move between different applications. The Git workflow in dbt Cloud allows both Git beginners and experts alike to be able to easily version control all of their work with a couple clicks. - - - -11. Let's run our first dbt models! Two example models are included in your dbt project in the `models/examples` folder that we can use to illustrate how to run dbt at the command line. Type `dbt run` into the command line and click **Enter** on your keyboard. When the run bar expands you'll be able to see the results of the run, where you should see the run complete successfully. - - - -12. 
The run results allow you to see the code that dbt compiles and sends to Snowflake for execution. To view the logs for this run, select one of the model tabs using the  **>** icon and then **Details**. If you scroll down a bit you'll be able to see the compiled code and how dbt interacts with Snowflake. Given that this run took place in our development environment, the models were created in your development schema. - - - - -13. Now let's switch over to Snowflake to confirm that the objects were actually created. Click on the three dots **…** above your database objects and then **Refresh**. Expand the **PC_DBT_DB** database and you should see your development schema. Select the schema, then **Tables**  and **Views**. Now you should be able to see `MY_FIRST_DBT_MODEL` as a table and `MY_SECOND_DBT_MODEL` as a view. - \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md deleted file mode 100644 index e387b208dd1..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: "Foundational structure" -id: "6-foundational-structure" -description: "Foundational structure" ---- - -In this step, we’ll need to create a development branch and set up project level configurations. - -1. To get started with development for our project, we'll need to create a new Git branch for our work. Select **create branch** and name your development branch. We'll call our branch `snowpark_python_workshop` then click **Submit**. -2. The first piece of development we'll do on the project is to update the `dbt_project.yml` file. Every dbt project requires a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. The [dbt_project.yml](/reference/dbt_project.yml) file also contains important information that tells dbt how to operate on your project. -3. Select the `dbt_project.yml` file from the file tree to open it and replace all of the existing contents with the following code below. When you're done, save the file by clicking **save**. You can also use the Command-S or Control-S shortcut from here on out. - - ```yaml - # Name your project! Project names should contain only lowercase characters - # and underscores. A good package name should reflect your organization's - # name or the intended use of these models - name: 'snowflake_dbt_python_formula1' - version: '1.3.0' - require-dbt-version: '>=1.3.0' - config-version: 2 - - # This setting configures which "profile" dbt uses for this project. - profile: 'default' - - # These configurations specify where dbt should look for different types of files. - # The `model-paths` config, for example, states that models in this project can be - # found in the "models/" directory. You probably won't need to change these! 
- model-paths: ["models"] - analysis-paths: ["analyses"] - test-paths: ["tests"] - seed-paths: ["seeds"] - macro-paths: ["macros"] - snapshot-paths: ["snapshots"] - - target-path: "target" # directory which will store compiled SQL files - clean-targets: # directories to be removed by `dbt clean` - - "target" - - "dbt_packages" - - models: - snowflake_dbt_python_formula1: - staging: - - +docs: - node_color: "CadetBlue" - marts: - +materialized: table - aggregates: - +docs: - node_color: "Maroon" - +tags: "bi" - - core: - +docs: - node_color: "#800080" - intermediate: - +docs: - node_color: "MediumSlateBlue" - ml: - prep: - +docs: - node_color: "Indigo" - train_predict: - +docs: - node_color: "#36454f" - - ``` - -4. The key configurations to point out in the file with relation to the work that we're going to do are in the `models` section. - - `require-dbt-version` — Tells dbt which version of dbt to use for your project. We are requiring 1.3.0 and any newer version to run python models and node colors. - - `materialized` — Tells dbt how to materialize models when compiling the code before it pushes it down to Snowflake. All models in the `marts` folder will be built as tables. - - `tags` — Applies tags at a directory level to all models. All models in the `aggregates` folder will be tagged as `bi` (abbreviation for business intelligence). - - `docs` — Specifies the `node_color` either by the plain color name or a hex value. -5. [Materializations](/docs/build/materializations) are strategies for persisting dbt models in a warehouse, with `tables` and `views` being the most commonly utilized types. By default, all dbt models are materialized as views and other materialization types can be configured in the `dbt_project.yml` file or in a model itself. It’s very important to note *Python models can only be materialized as tables or incremental models.* Since all our Python models exist under `marts`, the following portion of our `dbt_project.yml` ensures no errors will occur when we run our Python models. Starting with [dbt version 1.4](/guides/migration/versions/upgrading-to-v1.4#updates-to-python-models), Python files will automatically get materialized as tables even if not explicitly specified. - - ```yaml - marts:     - +materialized: table - ``` - diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md deleted file mode 100644 index a47a3b54d48..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: "Folder structure" -id: "7-folder-structure" -description: "Folder structure" ---- -dbt Labs has developed a [project structure guide](/guides/best-practices/how-we-structure/1-guide-overview/) that contains a number of recommendations for how to build the folder structure for your project. Do check out that guide if you want to learn more. Right now we are going to create some folders to organize our files: - -- Sources — This is our Formula 1 dataset and it will be defined in a source YAML file. -- Staging models — These models have a 1:1 with their source table. -- Intermediate — This is where we will be joining some Formula staging models. -- Marts models — Here is where we perform our major transformations. It contains these subfolders: - - aggregates - - core - - ml -1. 
In your file tree, use your cursor and hover over the `models` subdirectory, click the three dots **…** that appear to the right of the folder name, then select **Create Folder**. We're going to add two new folders to the file path, `staging` and `formula1` (in that order) by typing `staging/formula1` into the file path. - - - - - - If you click into your `models` directory now, you should see the new `staging` folder nested within `models` and the `formula1` folder nested within `staging`. -2. Create two additional folders the same way as the last step. Within the `models` subdirectory, create new directories `marts/core`. - -3. We will need to create a few more folders and subfolders using the UI. After you create all the necessary folders, your folder tree should look like this when it's all done: - - - -Remember you can always reference the entire project in [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1) to view the complete folder and file structure. \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md deleted file mode 100644 index 22e49c8a30b..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md +++ /dev/null @@ -1,334 +0,0 @@ ---- -title: "Sources and staging" -id: "8-sources-and-staging" -description: "Sources and staging" ---- - -In this section, we are going to create our source and staging models. - -Sources allow us to create a dependency between our source database object and our staging models, which will help us when we look at lineage later. Also, if your source changes database or schema, you only have to update it in your `f1_sources.yml` file rather than updating all of the models it might be used in. - -Staging models are the base of our project, where we bring all the individual components we're going to use to build our more complex and useful models into the project. - -Since we want to focus on dbt and Python in this workshop, check out our [sources](/docs/build/sources) and [staging](/guides/best-practices/how-we-structure/2-staging) docs if you want to learn more (or take our [dbt Fundamentals](https://courses.getdbt.com/collections) course which covers all of our core functionality). - -## Create sources - -We're going to be using each of our 8 Formula 1 tables from our `formula1` database under the `raw` schema for our transformations and we want to create those tables as sources in our project. - -1. Create a new file called `f1_sources.yml` with the following file path: `models/staging/formula1/f1_sources.yml`. -2. Then, paste the following code into the file before saving it: - -```yaml -version: 2 - -sources: - - name: formula1 - description: formula 1 datasets with normalized tables - database: formula1 - schema: raw - tables: - - name: circuits - description: One record per circuit, which is the specific race course. - columns: - - name: circuitid - tests: - - unique - - not_null - - name: constructors - description: One record per constructor. Constructors are the teams that build their formula 1 cars. - columns: - - name: constructorid - tests: - - unique - - not_null - - name: drivers - description: One record per driver. This table gives details about the driver. - columns: - - name: driverid - tests: - - unique - - not_null - - name: lap_times - description: One row per lap in each race. 
Lap times started being recorded in this dataset in 1984 and joined through driver_id. - - name: pit_stops - description: One row per pit stop. Pit stops do not have their own id column, the combination of the race_id and driver_id identify the pit stop. - columns: - - name: stop - tests: - - accepted_values: - values: [1,2,3,4,5,6,7,8] - quote: false - - name: races - description: One race per row. Importantly this table contains the race year to understand trends. - columns: - - name: raceid - tests: - - unique - - not_null - - name: results - columns: - - name: resultid - tests: - - unique - - not_null - description: One row per result. The main table that we join out for grid and position variables. - - name: status - description: One status per row. The status contextualizes whether the race was finished or what issues arose e.g. collisions, engine, etc. - columns: - - name: statusid - tests: - - unique - - not_null -``` - -## Create staging models - -The next step is to set up the staging models for each of the 8 source tables. Given the one-to-one relationship between staging models and their corresponding source tables, we'll build 8 staging models here. We know it’s a lot and in the future, we will seek to update the workshop to make this step less repetitive and more efficient. This step is also a good representation of the real world of data, where you have multiple hierarchical tables that you will need to join together! - -1. Let's go in alphabetical order to easily keep track of all our staging models! Create a new file called `stg_f1_circuits.sql` with this file path `models/staging/formula1/stg_f1_circuits.sql`. Then, paste the following code into the file before saving it: - - ```sql - with - - source as ( - - select * from {{ source('formula1','circuits') }} - - ), - - renamed as ( - select - circuitid as circuit_id, - circuitref as circuit_ref, - name as circuit_name, - location, - country, - lat as latitude, - lng as longitude, - alt as altitude - -- omit the url - from source - ) - select * from renamed - ``` - - All we're doing here is pulling the source data into the model using the `source` function, renaming some columns, and omitting the column `url` with a commented note since we don’t need it for our analysis. - -1. Create `stg_f1_constructors.sql` with this file path `models/staging/formula1/stg_f1_constructors.sql`. Paste the following code into it before saving the file: - - ```sql - with - - source as ( - - select * from {{ source('formula1','constructors') }} - - ), - - renamed as ( - select - constructorid as constructor_id, - constructorref as constructor_ref, - name as constructor_name, - nationality as constructor_nationality - -- omit the url - from source - ) - - select * from renamed - ``` - - We have 6 other stages models to create. We can do this by creating new files, then copy and paste the code into our `staging` folder. - -1. Create `stg_f1_drivers.sql` with this file path `models/staging/formula1/stg_f1_drivers.sql`: - - ```sql - with - - source as ( - - select * from {{ source('formula1','drivers') }} - - ), - - renamed as ( - select - driverid as driver_id, - driverref as driver_ref, - number as driver_number, - code as driver_code, - forename, - surname, - dob as date_of_birth, - nationality as driver_nationality - -- omit the url - from source - ) - - select * from renamed - ``` -1. 
Create `stg_f1_lap_times.sql` with this file path `models/staging/formula1/stg_f1_lap_times.sql`: - - ```sql - with - - source as ( - - select * from {{ source('formula1','lap_times') }} - - ), - - renamed as ( - select - raceid as race_id, - driverid as driver_id, - lap, - position, - time as lap_time_formatted, - milliseconds as lap_time_milliseconds - from source - ) - - select * from renamed - ``` -1. Create `stg_f1_pit_stops.sql` with this file path `models/staging/formula1/stg_f1_pit_stops.sql`: - - ```sql - with - - source as ( - - select * from {{ source('formula1','pit_stops') }} - - ), - - renamed as ( - select - raceid as race_id, - driverid as driver_id, - stop as stop_number, - lap, - time as lap_time_formatted, - duration as pit_stop_duration_seconds, - milliseconds as pit_stop_milliseconds - from source - ) - - select * from renamed - order by pit_stop_duration_seconds desc - ``` - -1. Create ` stg_f1_races.sql` with this file path `models/staging/formula1/stg_f1_races.sql`: - - ```sql - with - - source as ( - - select * from {{ source('formula1','races') }} - - ), - - renamed as ( - select - raceid as race_id, - year as race_year, - round as race_round, - circuitid as circuit_id, - name as circuit_name, - date as race_date, - to_time(time) as race_time, - -- omit the url - fp1_date as free_practice_1_date, - fp1_time as free_practice_1_time, - fp2_date as free_practice_2_date, - fp2_time as free_practice_2_time, - fp3_date as free_practice_3_date, - fp3_time as free_practice_3_time, - quali_date as qualifying_date, - quali_time as qualifying_time, - sprint_date, - sprint_time - from source - ) - - select * from renamed - ``` -1. Create `stg_f1_results.sql` with this file path `models/staging/formula1/stg_f1_results.sql`: - - ```sql - with - - source as ( - - select * from {{ source('formula1','results') }} - - ), - - renamed as ( - select - resultid as result_id, - raceid as race_id, - driverid as driver_id, - constructorid as constructor_id, - number as driver_number, - grid, - position::int as position, - positiontext as position_text, - positionorder as position_order, - points, - laps, - time as results_time_formatted, - milliseconds as results_milliseconds, - fastestlap as fastest_lap, - rank as results_rank, - fastestlaptime as fastest_lap_time_formatted, - fastestlapspeed::decimal(6,3) as fastest_lap_speed, - statusid as status_id - from source - ) - - select * from renamed - ``` -1. Last one! Create `stg_f1_status.sql` with this file path: `models/staging/formula1/stg_f1_status.sql`: - - ```sql - with - - source as ( - - select * from {{ source('formula1','status') }} - - ), - - renamed as ( - select - statusid as status_id, - status - from source - ) - - select * from renamed - ``` - After the source and all the staging models are complete for each of the 8 tables, your staging folder should look like this: - - - -1. It’s a good time to delete our example folder since these two models are extraneous to our formula1 pipeline and `my_first_model` fails a `not_null` test that we won’t spend time investigating. dbt Cloud will warn us that this folder will be permanently deleted, and we are okay with that so select **Delete**. - - - -1. Now that the staging models are built and saved, it's time to create the models in our development schema in Snowflake. To do this we're going to enter into the command line `dbt build` to run all of the models in our project, which includes the 8 new staging models and the existing example models. 
- - Your run should complete successfully and you should see green checkmarks next to all of your models in the run results. We built our 8 staging models as views and ran 13 source tests that we configured in the `f1_sources.yml` file with not that much code, pretty cool! - - - - Let's take a quick look in Snowflake, refresh database objects, open our development schema, and confirm that the new models are there. If you can see them, then we're good to go! - - - - Before we move onto the next section, be sure to commit your new models to your Git branch. Click **Commit and push** and give your commit a message like `profile, sources, and staging setup` before moving on. - - \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md deleted file mode 100644 index 262bf0e5e52..00000000000 --- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md +++ /dev/null @@ -1,299 +0,0 @@ ---- -title: "SQL transformations" -id: "9-sql-transformations" -description: "SQL transformations" ---- - -Now that we have all our sources and staging models done, it's time to move into where dbt shines — transformation! - -We need to: - -- Create some intermediate tables to join tables that aren’t hierarchical -- Create core tables for business intelligence (BI) tool ingestion -- Answer the two questions about: - - fastest pit stops - - lap time trends about our Formula 1 data by creating aggregate models using python! - -## Intermediate models - -We need to join lots of reference tables to our results table to create a human readable dataframe. What does this mean? For example, we don’t only want to have the numeric `status_id` in our table, we want to be able to read in a row of data that a driver could not finish a race due to engine failure (`status_id=5`). - -By now, we are pretty good at creating new files in the correct directories so we won’t cover this in detail. All intermediate models should be created in the path `models/intermediate`. - -1. Create a new file called `int_lap_times_years.sql`. In this model, we are joining our lap time and race information so we can look at lap times over years. In earlier Formula 1 eras, lap times were not recorded (only final results), so we filter out records where lap times are null. - - ```sql - with lap_times as ( - - select * from {{ ref('stg_f1_lap_times') }} - - ), - - races as ( - - select * from {{ ref('stg_f1_races') }} - - ), - - expanded_lap_times_by_year as ( - select - lap_times.race_id, - driver_id, - race_year, - lap, - lap_time_milliseconds - from lap_times - left join races - on lap_times.race_id = races.race_id - where lap_time_milliseconds is not null - ) - - select * from expanded_lap_times_by_year - ``` - -2. Create a file called `in_pit_stops.sql`. Pit stops are a many-to-one (M:1) relationship with our races. We are creating a feature called `total_pit_stops_per_race` by partitioning over our `race_id` and `driver_id`, while preserving individual level pit stops for rolling average in our next section. 
- - ```sql - with stg_f1__pit_stops as - ( - select * from {{ ref('stg_f1_pit_stops') }} - ), - - pit_stops_per_race as ( - select - race_id, - driver_id, - stop_number, - lap, - lap_time_formatted, - pit_stop_duration_seconds, - pit_stop_milliseconds, - max(stop_number) over (partition by race_id,driver_id) as total_pit_stops_per_race - from stg_f1__pit_stops - ) - - select * from pit_stops_per_race - ``` - -3. Create a file called `int_results.sql`. Here we are using 4 of our tables — `races`, `drivers`, `constructors`, and `status` — to give context to our `results` table. We are now able to calculate a new feature `drivers_age_years` by bringing the `date_of_birth` and `race_year` into the same table. We are also creating a column to indicate if the driver did not finish (dnf) the race, based upon if their `position` was null called, `dnf_flag`. - - ```sql - with results as ( - - select * from {{ ref('stg_f1_results') }} - - ), - - races as ( - - select * from {{ ref('stg_f1_races') }} - - ), - - drivers as ( - - select * from {{ ref('stg_f1_drivers') }} - - ), - - constructors as ( - - select * from {{ ref('stg_f1_constructors') }} - ), - - status as ( - - select * from {{ ref('stg_f1_status') }} - ), - - int_results as ( - select - result_id, - results.race_id, - race_year, - race_round, - circuit_id, - circuit_name, - race_date, - race_time, - results.driver_id, - results.driver_number, - forename ||' '|| surname as driver, - cast(datediff('year', date_of_birth, race_date) as int) as drivers_age_years, - driver_nationality, - results.constructor_id, - constructor_name, - constructor_nationality, - grid, - position, - position_text, - position_order, - points, - laps, - results_time_formatted, - results_milliseconds, - fastest_lap, - results_rank, - fastest_lap_time_formatted, - fastest_lap_speed, - results.status_id, - status, - case when position is null then 1 else 0 end as dnf_flag - from results - left join races - on results.race_id=races.race_id - left join drivers - on results.driver_id = drivers.driver_id - left join constructors - on results.constructor_id = constructors.constructor_id - left join status - on results.status_id = status.status_id - ) - - select * from int_results - ``` -1. Create a *Markdown* file `intermediate.md` that we will go over in depth during the [Testing](/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) and [Documentation](/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation) sections. - - ```markdown - # the intent of this .md is to allow for multi-line long form explanations for our intermediate transformations - - # below are descriptions - {% docs int_results %} In this query we want to join out other important information about the race results to have a human readable table about results, races, drivers, constructors, and status. - We will have 4 left joins onto our results table. {% enddocs %} - - {% docs int_pit_stops %} There are many pit stops within one race, aka a M:1 relationship. - We want to aggregate this so we can properly join pit stop information without creating a fanout. {% enddocs %} - - {% docs int_lap_times_years %} Lap times are done per lap. We need to join them out to the race year to understand yearly lap time trends. {% enddocs %} - ``` -1. Create a *YAML* file `intermediate.yml` that we will go over in depth during the [Testing](/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) and [Documentation](/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation) sections. 
- - ```yaml - version: 2 - - models: - - name: int_results - description: '{{ doc("int_results") }}' - - name: int_pit_stops - description: '{{ doc("int_pit_stops") }}' - - name: int_lap_times_years - description: '{{ doc("int_lap_times_years") }}' - ``` - That wraps up the intermediate models we need to create our core models! - -## Core models - -1. Create a file `fct_results.sql`. This is what I like to refer to as the “mega table” — a really large denormalized table with all our context added in at row level for human readability. Importantly, we have a table `circuits` that is linked through the table `races`. When we joined `races` to `results` in `int_results.sql` we allowed our tables to make the connection from `circuits` to `results` in `fct_results.sql`. We are only taking information about pit stops at the result level so our join would not cause a [fanout](https://community.looker.com/technical-tips-tricks-1021/what-is-a-fanout-23327). - - ```sql - with int_results as ( - - select * from {{ ref('int_results') }} - - ), - - int_pit_stops as ( - select - race_id, - driver_id, - max(total_pit_stops_per_race) as total_pit_stops_per_race - from {{ ref('int_pit_stops') }} - group by 1,2 - ), - - circuits as ( - - select * from {{ ref('stg_f1_circuits') }} - ), - base_results as ( - select - result_id, - int_results.race_id, - race_year, - race_round, - int_results.circuit_id, - int_results.circuit_name, - circuit_ref, - location, - country, - latitude, - longitude, - altitude, - total_pit_stops_per_race, - race_date, - race_time, - int_results.driver_id, - driver, - driver_number, - drivers_age_years, - driver_nationality, - constructor_id, - constructor_name, - constructor_nationality, - grid, - position, - position_text, - position_order, - points, - laps, - results_time_formatted, - results_milliseconds, - fastest_lap, - results_rank, - fastest_lap_time_formatted, - fastest_lap_speed, - status_id, - status, - dnf_flag - from int_results - left join circuits - on int_results.circuit_id=circuits.circuit_id - left join int_pit_stops - on int_results.driver_id=int_pit_stops.driver_id and int_results.race_id=int_pit_stops.race_id - ) - - select * from base_results - ``` - -1. Create the file `pit_stops_joined.sql`. Our results and pit stops are at different levels of dimensionality (also called grain). Simply put, we have multiple pit stops per a result. Since we are interested in understanding information at the pit stop level with information about race year and constructor, we will create a new table `pit_stops_joined.sql` where each row is per pit stop. Our new table tees up our aggregation in Python. - - ```sql - with base_results as ( - - select * from {{ ref('fct_results') }} - - ), - - pit_stops as ( - - select * from {{ ref('int_pit_stops') }} - - ), - - pit_stops_joined as ( - - select - base_results.race_id, - race_year, - base_results.driver_id, - constructor_id, - constructor_name, - stop_number, - lap, - lap_time_formatted, - pit_stop_duration_seconds, - pit_stop_milliseconds - from base_results - left join pit_stops - on base_results.race_id=pit_stops.race_id and base_results.driver_id=pit_stops.driver_id - ) - select * from pit_stops_joined - ``` - -1. Enter in the command line and execute `dbt build` to build out our entire pipeline to up to this point. Don’t worry about “overriding” your previous models – dbt workflows are designed to be idempotent so we can run them again and expect the same results. - -1. Let’s talk about our lineage so far. It’s looking good 😎. 
We’ve shown how SQL can be used to make data type and column name changes and handle hierarchical joins really well, all while building out our automated lineage! - - - -1. Time to **Commit and push** our changes and give your commit a message like `intermediate and fact models` before moving on. diff --git a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md b/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md deleted file mode 100644 index f2fffd43994..00000000000 --- a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md +++ /dev/null @@ -1,660 +0,0 @@ ---- -title: "dbt Semantic Layer integration" -id: "sl-partner-integration-guide" -description: Learn about partner integration guidelines, roadmap, and connectivity. ---- - -# dbt Semantic Layer partner integration - -:::info Coming soon -The dbt Semantic Layer is undergoing some sophisticated changes, enabling more complex metric definitions and efficient querying. As part of these changes, the dbt_metrics package will be deprecated and replaced with MetricFlow. For more info, check out [The dbt Semantic Layer: what's next?](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/) and the [dbt_metrics deprecation](https://docs.getdbt.com/blog/deprecating-dbt-metrics) blog posts. -::: - -This guide is for dbt Semantic Layer integration partners and explains integration guidelines and connectivity.
      - -To become a formal partner, integrate with the API, or have questions/feedback — **[contact us](mailto:semantic-layer@dbtlabs.com)** for more info. - -The dbt Semantic Layer allows users to dynamically generate and query datasets in downstream tools based on their dbt governed assets, such as metrics, models, and entities. It helps organizations manage complexities such as data, tools, and teams to make more efficient and trustworthy decisions. - -The rapid growth of different tools in the modern data stack has helped data professionals address the diverse needs of different teams. The downside of this growth is the fragmentation of business logic across teams, tools, and workloads. - -To solve this, the dbt Semantic Layer provides a platform where users can confidently leverage their data from within their tools. dbt Cloud's change management capabilities ensure that any user modifications made to core business constructs, like metrics or entities, are distributed into all the tools connected to the data platform. - -The dbt Semantic Layer can be used for a variety of tools and applications of data. Here are some common use cases - -* Business intelligence (BI), reporting, and analytics, -* Data quality and monitoring, -* Governance and privacy, -* Data discovery and cataloging, -* Machine learning and data science. - -:::info Share your use case - -If you'd like to share other use cases for the dbt Semantic Layer, contact the [dbt Labs team](mailto:semantic-layer@dbtlabs.com). - -::: - - -## Product overview - -The dbt Semantic Layer product architecture includes four primary components: - -| Components | Information | Developer plans | Team plans | Enterprise plans | License | -| --- | --- | :---: | :---: | :---: | --- | -| **[dbt Project](/docs/build/metrics)** | Define models and metrics in dbt Core. | ✅ | ✅ | ✅ | Open source in dbt Core | -| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt Core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | -| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja-like query models and metrics, use macros), compiles the query into pure SQL, executes the query in the data platform, and returns the data. | ✅

      _* Available during Public Preview only_ | ✅ | ✅ | Proprietary in dbt Cloud | -| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, and metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary in dbt Cloud | - -Review the following current architecture to understand how the components work together: - - - - -## Integration guidelines - -In collaboration with dbt Labs, partners and users can build dbt Semantic Layer integrations that can import model metadata and metric definitions, query metrics, use macros, and more. - -For more details, refer to the [Integration roadmap](#integration) and [Integration best practices](#best-practices) guidance. - -**Integration roadmap ** - -Integration partners generally build and approach their roadmap in the following stages: - -| Feature | Info | Availability | -|----------|-------|:------------:| -| **Model metadata** | Import/sync model metadata (descriptions, dimensions, test, freshness, and more) via the [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api). | ✅ | -| **Metric definitions** | Import/sync metric definitions (metric calculation, dimensions, description, and more) via the [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api). | ✅ | -| **dbt Semantic Layer as a data source** | Connect to the dbt Semantic Layer as a data source (for example, the Snowflake Proxy Server). Users can execute dbt-SQL to query metrics or models and use macros.* | ✅ | -| **Query metrics** | Query the imported metrics via a metric-centric UI (for example, a user can select a metric, time grain, and dimensions of interest). | ✅ | -| **Entity definitions** | Import/sync entity definitions (descriptions, dimensions, data types, relationships, metrics, and more) and query entities via the dbt Semantic Layer. | _*Coming soon | -| **dbt Semantic Layer Connector** | A dedicated connector with the ability to query any data platform supported in dbt Cloud. (Will replace (3).) | _*Coming soon | - -_*The coming soon features are expected to launch in 2023. - -**Integration best practices ** - -To build a successful and seamless dbt Semantic Layer integration, it should express the following: - -- **Consistent**: Have a consistent user experience (UX) incorporated into existing core user workflows. -- **Trustworthy**: Treat dbt assets (metrics, models, and entities) as first-class objects and indicate that their definitions and resulting datasets come from dbt Cloud. -- **Efficient**: Provide a clear advantage over the current approach to setting up metrics and analyses, and finding dimensions/datasets in the tool. -- **Accessible**: Include a self-serve component so a data consumer can ask questions via the user interface (UI), if applicable. - - -## Use the Discovery API - -This section will explain how to connect to and query the [Discovery API](/docs/dbt-cloud-apis/discovery-api) for model and metric definitions. - -To use the dbt Semantic Layer, you must meet the [prerequisites](/docs/use-dbt-semantic-layer/dbt-semantic-layer#prerequisites). - -
-**Discovery API authorization**
-
-Refer to our Authorization documentation to learn how to authorize requests to the Discovery API.
-
-Metrics-specific queries work identically to existing Discovery API queries. This means existing integrations that query model metadata will work perfectly in the context of metrics.
-
-**Query the Discovery API**
-
-Test out the Discovery API by using the GraphQL sandbox and use this Python client as a starting point to develop.
-
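If you'd rather explore these queries from code than from the sandbox, the following is a minimal sketch of how an integration might submit the same GraphQL documents over HTTP with Python's `requests` library. The endpoint is the one the GraphQL sandbox in this guide points at; the bearer-token authorization with a dbt Cloud service token is an assumption, so confirm the exact scheme in the Authorization documentation before building on it.

```python
# Minimal sketch: querying the Discovery API programmatically.
# The endpoint URL comes from the GraphQL sandbox referenced in this guide.
# Bearer-token auth with a dbt Cloud service token is an assumption; check the
# Authorization documentation for the exact scheme.
import requests

DISCOVERY_API_URL = "https://metadata.cloud.getdbt.com/graphql"
DBT_CLOUD_SERVICE_TOKEN = "<your service token>"  # placeholder

query = """
{
  models(jobId: 181329, schema: "analytics") {
    name
    status
    compileCompletedAt
  }
}
"""

response = requests.post(
    DISCOVERY_API_URL,
    headers={"Authorization": f"Bearer {DBT_CLOUD_SERVICE_TOKEN}"},
    json={"query": query},
    timeout=30,
)
response.raise_for_status()

# The response mirrors the example payloads shown in the next section:
# a `data.models` list with one object per model.
for model in response.json()["data"]["models"]:
    print(model["name"], model["status"])
```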
      - - - -### Query models for a project - -You can query model definitions or details about a specific model for a project from a given job. - - - - - - - -This is an example of querying all models that utilize the schema`analytics` from a given job. - -``` -{ - models(jobId: 181329, schema: "analytics") { - name - status - compileCompletedAt - database - dbtVersion - runGeneratedAt - } -} -``` - - - - -``` -{ - "data": { - "models": [ - { - "name": "customers", - "status": "success", - "compileCompletedAt": "2022-12-15T06:37:24.186Z", - "database": "analytics", - "dbtVersion": "1.3.1", - "runGeneratedAt": "2022-12-15T06:37:25.187Z" - }, - { - "name": "stg_customers", - "status": "success", - "compileCompletedAt": "2022-12-15T06:37:22.509Z", - "database": "analytics", - "dbtVersion": "1.3.1", - "runGeneratedAt": "2022-12-15T06:37:25.187Z" - }, - { - "name": "stg_orders", - "status": "success", - "compileCompletedAt": "2022-12-15T06:37:22.509Z", - "database": "analytics", - "dbtVersion": "1.3.1", - "runGeneratedAt": "2022-12-15T06:37:25.187Z" - } - ] - } -} -``` - - - - -This is an example of querying details about a specific model, `model.jaffle_shop.customers`, from a given job. - -``` -{ - model(jobId: 181329, uniqueId: "model.jaffle_shop.customers") { - parentsModels { - runId - uniqueId - executionTime - } - } -} -{ - "data": { - "model": { - "parentsModels": [ - { - "runId": 105297555, - "uniqueId": "model.jaffle_shop.stg_customers", - "executionTime": 1.676571846008301 - }, - { - "runId": 105297555, - "uniqueId": "model.jaffle_shop.stg_orders", - "executionTime": 1.631831407546997 - } - ] - } - } -} -``` - - - - - - -### Query metrics for a project - -Query metrics definitions or details for a project from a given job and refer to the following resources: - -- [Metrics query](/docs/dbt-cloud-apis/discovery-schema-metrics) — Information on how to query the full list of metrics defined in a user’s project with the dbt Cloud Discovery API. -- [dbt Metrics docs](https://docs.getdbt.com/docs/build/metrics#available-properties) — Information on the available metric properties. -- [GraphQL sandbox](https://studio.apollographql.com/sandbox/explorer?endpoint=https%3A%2F%2Fmetadata.cloud.getdbt.com%2Fgraphql) — Access to test the dbt Cloud Discovery API testing environment. - - - - - - -This is an example listing metrics from a given job: - -``` -{ - metrics(jobId: 123) { - name - label - description - model - dependsOn - calculation_method - expression - timestamp - timeGrains - dimensions - window - filters - tags - meta - } -} -``` - - - - -The `metric` query supports all metric properties listed in **Listing metrics**. -The following abbreviated example is querying details about the metric `new_customers` from job `123`: - -This is an example of querying details about a specific metric `new_customers` from a given job `123`. 
- -``` -{ - metric(jobId: 123) { - label - calculation_method - timestamp - timeGrains - dimensions - } -} -``` - - - - - -``` -{ - "data": { - "metrics": [ - { - "uniqueId": "metric.claim_to_fame.total_claim_charges", - "name": "total_claim_charges", - "tags": [], - "label": "Total Claim Charges", - "calculation_method": "sum", - "expression": "total_charge_amount", - "timestamp": "created_at", - "timeGrains":[ - "day", - "week", - "month" - ], - "meta": {}, - "resourceType": "metric", - "model": { - "name": "fct_billed_patient_claims" - } - }, - { - "uniqueId": "metric.claim_to_fame.total_billed_diagnoses", - "name": "total_billed_diagnoses", - "tags": [], - "label": "Total Billed Diagnoses", - "calculation_method": "count_distinct", - "expression": "diagnosis_id", - "timestamp": "created_at", - "timeGrains":[ - "week", - "month", - "year" - ], - "meta": {}, - "resourceType": "metric", - "model": { - "name": "fct_billed_patient_claims" - }, - } - ] - } -} -``` - - - - - -``` -metrics: - - name: total_claim_charges - label: Total Claim Charges - model: ref('fct_billed_patient_claims') - calculation_method: sum - expression: total_charge_amount - timestamp: created_at - time_grains: [day, week, month, all_time] - - - - name: total_billed_diagnoses - label: Total Billed Diagnoses - model: ref('fct_billed_patient_claims') - calculation_method: count_distinct - expression: diagnosis_id - timestamp: created_at - time_grains: [day, week, month] -``` - - - - - - - - -## Query the dbt Semantic Layer - -This section explains how to connect to or query the dbt Semantic Layer Proxy Server to return model data, metric data, and so on. - -When you configure the dbt Semantic Layer, dbt Cloud provides a Proxy Server endpoint that users can connect to as though it's a Snowflake-hosted endpoint. Once the queries are submitted, dbt Cloud will: - -1. Compile dbt-sql queries into valid Snowflake SQL, -2. Execute the compiled SQL against the Snowflake data platform, -3. Return the results to the client. - -Replace the hostname in your existing data platform connection with the relevant dbt Cloud Proxy Server URL (for example, `abc123.proxy.cloud.getdbt.com`). All queries you submit through the endpoint will be compiled en route to the data platform.* - -*_Note: This approach will change with the new Semantic Layer connection in mid-2023, which will be able to query all data platforms supported in dbt Cloud through dedicated JDBC/ODBC drivers, and eventually an API._ - - - - - - - -Users can compile and execute metric queries using macros defined in the [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics). This package: - -- Generates the SQL required to accurately calculate the metric definition, -- Supplies helper macros for derived calculations (like month over month, year to date, and so on) time series operations - - -``` -select * -from {{ metrics.calculate( - metric_list=[metric('customers'), metric(‘revenue’)], - grain='week', - dimensions=['plan', 'country'], - secondary_calculations=[ - metrics.period_to_date(aggregate="sum", period="year"), - metrics.rolling(aggregate="average", interval=4, alias="avg_past_4wks") - ], - start_date='2020-01-01', - end_date="date_trunc('day', getdate())" -) }} -``` - - - - - -Model queries allow users to query models and use macros from their dbt project. 
- -``` -select cents_to_dollars('amount_cents') as amount_dollars -from {{ ref('orders') }} -``` - - - -### Entities - - -dbt Labs will introduce a new node type, **[entity](https://github.com/dbt-labs/dbt-core/issues/6379)**, when dbt Core version 1.5 launches. It introduces a new and efficient way to define metrics by reusing logic (for example, `time_grains`). - -Entities are semantic objects made up of curated dimensions from models with more metadata defined. Over time, users can standardize metric and entity definitions with packages to speed up development. - -For integrations, entities will provide information like: - -- a way to organize metrics based on the entity they reference, and -- a new consumable and dynamically generated dataset (versus finding a table in the data platform). - -This information will be available alongside the Discovery API, and entities can be directly queried through the dbt Semantic Layer. - - - -:::caution 🚧 - -Entities are a work in progress — expect continuous changes and improvements. To stay up-to-date, refer to the [entity discussions](https://github.com/dbt-labs/dbt-core/issues/6379) page. - -::: - - - - - - - -Define entities in your dbt project. - -``` -entities: ## The top-level path of the new node - - name: [Required] ## The name of the entity - model: [Required] ## The name of the model that the entity is dependent on - description: [Optional] ## The description of the entity - - dimensions: [Optional] ## The list of dimensions & properties associated with the entity. - - include: [Optional] * - - exclude: [Optional] - - name: [Required] ## The name of the dimension - column_name: [Optional] ## The name of the column in the model if not 1:1. Serves as mapping - data_type: [Optional] ## The data type of the dimension - description: [Optional] ## Description of the dimension - default_timestamp: [Optional] ## Setting datetime dimension as default for metrics - time_grains: [Optional] ## Acceptable time grains for the datetime dimension - primary_key: [Optional] ## Whether this dimension is part of the primary key -``` - - - - -Query entities via the Discovery API. - -``` -"entity.project_name.entity_name": { - "unique_id": "entity.project_name.entity_name", - "package_name": "project_name", - "original_file_path": "models/metric_definitions/ratio_metric.yml", - "name": "entity_name", - "model": "ref('model_name')", - "description": "some description", - "dimensions": { - "dimension_name": { - "name": "dimension_name", - "column_name": "column_name", - "default_timestamp": "true", - "time_grains": "[day, week, month, year]" - "primary_key": true, - "data_type": null, - "description": "TBD", - "meta": {}, - } - }, - "resource_type": "entity", - "meta": {}, - "tags": [], - "config": { - "enabled": true, - }, - "depends_on": { - "macros": [], - "nodes": [ - "model.project_name.model_name", - ] - }, - "docs": { - "show": true, - "node_color": null - }, - "refs": [ - [ - "model_name", - ] - ], - "created_at": 1669653016.522599 - }, - ``` - - - - -How to define new [metrics](/docs/build/metrics) in your dbt project. The metric definition and metadata response will change accordingly once entities are introduced, notably with metrics referencing entities instead of models and inheriting entity dimensions. 
- - ``` - metrics: - ## Always required - - name: [Required] ## The name of the metric - label: [Required] ## The human-readable name of the metric - calculation_method: [Required] ## The calculation/aggregation used for the metric - expression: [Required] ## The SQL expression being aggregated/calculated - entity: [Required] ## The entity being used as the source of the metric - - ## Always optional - description: [Optional] ## Any description about the metric - timestamp: [Optional] ## The name of the timestamp field to use - time_grains: [Optional] ## The list of time grains that are permitted - filters: [Optional] ## The filters of the metric - window: [Optional] ## The ability to make a metric cumulative over a time period - config: [Optional] ## Additional information for configuring the output - - ## Either or dimensions: - include: [Optional] ## The list of dimensions to be included. Either * or list - exclude: [Optional] ## The list of dimensions to be excluded from the inherited list - ``` - - - - - -``` -"metric.project_name.metric_name": { - "fqn": [ - "project_name", - "folder_name", - "metric_name" - ], - "unique_id": "metric.project_name.metric_name", - "package_name": "project_name", - "root_path": "file_path", - "path": "file_path", - "original_file_path": "file_path", - "name": "metric_name", - "description": "description", - "entity": "entity_name", - "label": "Human readable version", - "calculation_method": "the calc method", - "timestamp": "the timestamp field", - "time_grains": [ - "day", - "week" - ], - "expression": "a field name or sql expression", - "dimensions": [ - { - "entity_name": [ - "had_discount", - "order_country" - ] - } - ], - "window": null, - "resource_type": "metric", - "filters": [], - "meta": {}, - "tags": [], - "config": { - "enabled": true - }, - "unrendered_config": {}, - "sources": [], - "depends_on": { - "macros": [], - "nodes": [ - "entity.projet_name.entity_name", - ] - }, - "entities": [ - [ - "entity_name" - ] - ], - "metrics": ["used for derived metrics"], - "created_at": 1669653027.290001 - }, - ``` - - - - -Query an entity using dbt-SQL. Eventually, users will be able to query entities and dynamically generate datasets using a macro (like with metrics), without having to find specific tables or columns. - -``` -select * -from {{ entities.calculate( - entity_list=[...], [Required, one to start] - dimensions: [...], [Optional, default is all] - metrics: [...], [Optional, default is all at finest grain] - filters: ... - )}} - ``` - - - -### dbt Semantic Layer Connector - -In order to support more data platforms and enhance the user experience, users will be able to connect to a [dbt Cloud-supported data platform](/docs/cloud/connect-data-platform/about-connections) with the dbt Semantic Layer. - -Integration partners need to install the [Arrow FlightSQL](https://arrow.apache.org/docs/format/FlightSql.html) JDBC/ODBC driver, which will authenticate with dbt Cloud and the data platform that it queries. - - - - - -### dbt Semantic Layer API - -dbt Cloud will provide a web API that supports: - -- Compiling dbt-SQL queries to return their compiled SQL. -- Executing dbt-SQL queries and returning the queried results from the data platform. - -The API will be a viable integration point with the dbt Semantic Layer. It will be authorized by a [dbt Cloud service token](/docs/dbt-cloud-apis/service-tokens) and eventually support the invocation of dbt commands (e.g., `dbt run`, `dbt test`, etc.) in the future. 
- - -## Contact us - -### For dbt Semantic Layer support - -For partner and customer support, please email the [Support team](mailto:support@getdbt.com). Please ensure the message includes: - -- "Semantic Layer" -- The name of the partner software -- The dbt Cloud account ID of the customer, if you are a partner making the inquiry - -### For product and partnerships - -If you'd like to become a formal partner, have product feedback/questions, or are interested in integrating, email the [Product and Partnership team](mailto:semantic-layer@dbtlabs.com). - - - -## Related docs - -- [dbt Semantic Layer docs](https://docs.getdbt.com/docs/use-dbt-semantic-layer/dbt-semantic-layer) to learn about the product. -- [dbt Metrics docs](https://docs.getdbt.com/docs/building-a-dbt-project/metrics) for more information about its components. -- [dbt Semantic Layer intro blog](https://www.getdbt.com/blog/dbt-semantic-layer/) and [launch blog](https://www.getdbt.com/blog/frontiers-of-the-dbt-semantic-layer/) to learn more about the product vision and purpose. -- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations. - - diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md b/website/docs/guides/dbt-models-on-databricks.md similarity index 93% rename from website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md rename to website/docs/guides/dbt-models-on-databricks.md index b5389645258..489a3c28467 100644 --- a/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md +++ b/website/docs/guides/dbt-models-on-databricks.md @@ -1,17 +1,26 @@ --- -title: How to optimize and troubleshoot dbt models on Databricks -sidebar_label: "How to optimize and troubleshoot dbt models on Databricks" +title: Optimize and troubleshoot dbt models on Databricks +id: optimize-dbt-models-on-databricks description: "Learn more about optimizing and troubleshooting your dbt models on Databricks" +displayText: Optimizing and troubleshooting your dbt models on Databricks +hoverSnippet: Learn how to optimize and troubleshoot your dbt models on Databricks. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'databricks' +hide_table_of_contents: true +tags: ['Databricks', 'dbt Core','dbt Cloud'] +level: 'Intermediate' +recently_updated: true --- +## Introduction -Continuing our Databricks and dbt guide series from the last [guide](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project), it’s time to talk about performance optimization. In this follow-up post,  we outline simple strategies to optimize for cost, performance, and simplicity when architecting your data pipelines. We’ve encapsulated these strategies in this acronym-framework: +Building on the [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project) guide, we'd like to discuss performance optimization. In this follow-up post, we outline simple strategies to optimize for cost, performance, and simplicity when you architect data pipelines. We’ve encapsulated these strategies in this acronym-framework: - Platform Components - Patterns & Best Practices - Performance Troubleshooting -## 1. Platform Components +## Platform Components As you start to develop your dbt projects, one of the first decisions you will make is what kind of backend infrastructure to run your models against. 
Databricks offers SQL warehouses, All-Purpose Compute, and Jobs Compute, each optimized to workloads they are catered to. Our recommendation is to use Databricks SQL warehouses for all your SQL workloads. SQL warehouses are optimized for SQL workloads when compared to other compute options, additionally, they can scale both vertically to support larger workloads and horizontally to support concurrency. Also, SQL warehouses are easier to manage and provide out-of-the-box features such as query history to help audit and optimize your SQL workloads. Between Serverless, Pro, and Classic SQL Warehouse types that Databricks offers, our standard recommendation for you is to leverage Databricks serverless warehouses. You can explore features of these warehouse types in the [Compare features section](https://www.databricks.com/product/pricing/databricks-sql?_gl=1*2rsmlo*_ga*ZmExYzgzZDAtMWU0Ny00N2YyLWFhYzEtM2RhZTQzNTAyZjZi*_ga_PQSEQ3RZQC*MTY3OTYwMDg0Ni4zNTAuMS4xNjc5NjAyMDMzLjUzLjAuMA..&_ga=2.104593536.1471430337.1679342371-fa1c83d0-1e47-47f2-aac1-3dae43502f6b) on the Databricks pricing page. @@ -31,11 +40,11 @@ Another technique worth implementing is to provision separate SQL warehouses for Because of the ability of serverless warehouses to spin up in a matter of seconds, setting your auto-stop configuration to a lower threshold will not impact SLAs and end-user experience. From the SQL Workspace UI, the default value is 10 minutes and  you can set it to 5 minutes for a lower threshold with the UI. If you would like more custom settings, you can set the threshold to as low as 1 minute with the [API](https://docs.databricks.com/sql/api/sql-endpoints.html#). -## 2. Patterns & Best Practices +## Patterns & Best Practices Now that we have a solid sense of the infrastructure components, we can shift our focus to best practices and design patterns on pipeline development.  We recommend the staging/intermediate/mart approach which is analogous to the medallion architecture bronze/silver/gold approach that’s recommended by Databricks. Let’s dissect each stage further. -dbt has guidelines on how you can [structure your dbt project](/guides/best-practices/how-we-structure/1-guide-overview) which you can learn more about. +dbt has guidelines on how you can [structure your dbt project](/best-practices/how-we-structure/1-guide-overview) which you can learn more about. ### Bronze / Staging Layer: @@ -49,7 +58,7 @@ The main benefit of leveraging `COPY INTO` is that it's an incremental operation Now that we have our bronze table taken care of, we can proceed with the silver layer. -For cost and performance reasons, many customers opt to implement an incremental pipeline approach. The main benefit with this approach is that you process a lot less data when you insert new records into the silver layer, rather than re-create the table each time with all the data from the bronze layer. However it should be noted that by default, [dbt recommends using views and tables](/guides/best-practices/materializations/1-guide-overview) to start out with and then moving to incremental as you require more performance optimization. +For cost and performance reasons, many customers opt to implement an incremental pipeline approach. The main benefit with this approach is that you process a lot less data when you insert new records into the silver layer, rather than re-create the table each time with all the data from the bronze layer. 
However it should be noted that by default, [dbt recommends using views and tables](/best-practices/materializations/1-guide-overview) to start out with and then moving to incremental as you require more performance optimization. dbt has an [incremental model materialization](/reference/resource-configs/spark-configs#the-merge-strategy) to facilitate this framework. How this works at a high level is that Databricks will create a temp view with a snapshot of data and then merge that snapshot into the silver table. You can customize the time range of the snapshot to suit your specific use case by configuring the `where` conditional in your `is_incremental` logic. The most straightforward implementation is to merge data using a timestamp that’s later than the current max timestamp in the silver table, but there are certainly valid use cases for increasing the temporal range of the source snapshot. @@ -121,7 +130,7 @@ incremental_predicates = [ }} ``` -## 3. Performance Troubleshooting +## Performance Troubleshooting Performance troubleshooting refers to the process of identifying and resolving issues that impact the performance of your dbt models and overall data pipelines. By improving the speed and performance of your Lakehouse platform, you will be able to process data faster, process large and complex queries more effectively, and provide faster time to market.  Let’s go into detail the three effective strategies that you can implement. @@ -166,8 +175,8 @@ Now you might be wondering, how do you identify opportunities for performance im With the [dbt Cloud Admin API](/docs/dbt-cloud-apis/admin-cloud-api), you can  pull the dbt artifacts from your dbt Cloud run,  put the generated `manifest.json` into an S3 bucket, stage it, and model the data using the [dbt artifacts package](https://hub.getdbt.com/brooklyn-data/dbt_artifacts/latest/). That package can help you identify inefficiencies in your dbt models and pinpoint where opportunities for improvement are. -## Conclusion +### Conclusion -This concludes the second guide in our series on “Working with Databricks and dbt”, following [How to set up your Databricks and dbt Project](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project). +This builds on the content in [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project). We welcome you to try these strategies on our example open source TPC-H implementation and to provide us with thoughts/feedback as you start to incorporate these features into production. Looking forward to your feedback on [#db-databricks-and-spark](https://getdbt.slack.com/archives/CNGCW8HKL) Slack channel! diff --git a/website/docs/guides/dbt-python-snowpark.md b/website/docs/guides/dbt-python-snowpark.md new file mode 100644 index 00000000000..55e6b68c172 --- /dev/null +++ b/website/docs/guides/dbt-python-snowpark.md @@ -0,0 +1,1925 @@ +--- +title: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake" +id: "dbt-python-snowpark" +description: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake" +hoverSnippet: Learn how to leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake. 
+# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Snowflake'] +level: 'Intermediate' +recently_updated: true +--- + +## Introduction + +The focus of this workshop will be to demonstrate how we can use both *SQL and python together* in the same workflow to run *both analytics and machine learning models* on dbt Cloud. + +All code in today’s workshop can be found on [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1). + +### What you'll use during the lab + +- A [Snowflake account](https://trial.snowflake.com/) with ACCOUNTADMIN access +- A [dbt Cloud account](https://www.getdbt.com/signup/) + +### What you'll learn + +- How to build scalable data transformation pipelines using dbt, and Snowflake using SQL and Python +- How to leverage copying data into Snowflake from a public S3 bucket + +### What you need to know + +- Basic to intermediate SQL and python. +- Basic understanding of dbt fundamentals. We recommend the [dbt Fundamentals course](https://courses.getdbt.com/collections) if you're interested. +- High level machine learning process (encoding, training, testing) +- Simple ML algorithms — we will use logistic regression to keep the focus on the *workflow*, not algorithms! + +### What you'll build + +- A set of data analytics and prediction pipelines using Formula 1 data leveraging dbt and Snowflake, making use of best practices like data quality tests and code promotion between environments +- We will create insights for: + 1. Finding the lap time average and rolling average through the years (is it generally trending up or down)? + 2. Which constructor has the fastest pit stops in 2021? + 3. Predicting the position of each driver given using a decade of data (2010 - 2020) + +As inputs, we are going to leverage Formula 1 datasets hosted on a dbt Labs public S3 bucket. We will create a Snowflake Stage for our CSV files then use Snowflake’s `COPY INTO` function to copy the data in from our CSV files into tables. The Formula 1 is available on [Kaggle](https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020). The data is originally compiled from the [Ergast Developer API](http://ergast.com/mrd/). + +Overall we are going to set up the environments, build scalable pipelines in dbt, establish data tests, and promote code to production. + +## Configure Snowflake + +1. Log in to your trial Snowflake account. You can [sign up for a Snowflake Trial Account using this form](https://signup.snowflake.com/) if you don’t have one. +2. Ensure that your account is set up using **AWS** in the **US East (N. Virginia)**. We will be copying the data from a public AWS S3 bucket hosted by dbt Labs in the us-east-1 region. By ensuring our Snowflake environment setup matches our bucket region, we avoid any multi-region data copy and retrieval latency issues. + + + +3. After creating your account and verifying it from your sign-up email, Snowflake will direct you back to the UI called Snowsight. + +4. When Snowsight first opens, your window should look like the following, with you logged in as the ACCOUNTADMIN with demo worksheets open: + + + +5. Navigate to **Admin > Billing & Terms**. Click **Enable > Acknowledge & Continue** to enable Anaconda Python Packages to run in Snowflake. + + + + + +6. Finally, create a new Worksheet by selecting **+ Worksheet** in the upper right corner. 
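Before moving on, you can optionally double-check the cloud and region requirement from step 2 by running Snowflake's built-in `current_region()` function in the worksheet you just created. The exact output format can vary by account, but for this workshop it should indicate AWS us-east-1.

```sql
-- Optional sanity check for the region requirement in step 2.
-- current_region() is a built-in Snowflake function; for this workshop the
-- result should indicate AWS us-east-1 (for example, AWS_US_EAST_1).
select current_region();
```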
+ +## Connect to data source + +We need to obtain our data source by copying our Formula 1 data into Snowflake tables from a public S3 bucket that dbt Labs hosts. + +1. When a new Snowflake account is created, there should be a preconfigured warehouse in your account named `COMPUTE_WH`. +2. If for any reason your account doesn’t have this warehouse, we can create a warehouse using the following script: + + ```sql + create or replace warehouse COMPUTE_WH with warehouse_size=XSMALL + ``` + +3. Rename the worksheet to `data setup script` since we will be placing code in this worksheet to ingest the Formula 1 data. Make sure you are still logged in as the **ACCOUNTADMIN** and select the **COMPUTE_WH** warehouse. + + + +4. Copy the following code into the main body of the Snowflake worksheet. You can also find this setup script under the `setup` folder in the [Git repository](https://github.com/dbt-labs/python-snowpark-formula1/blob/main/setup/setup_script_s3_to_snowflake.sql). The script is long since it's bring in all of the data we'll need today! + + ```sql + -- create and define our formula1 database + create or replace database formula1; + use database formula1; + create or replace schema raw; + use schema raw; + + -- define our file format for reading in the csvs + create or replace file format csvformat + type = csv + field_delimiter =',' + field_optionally_enclosed_by = '"', + skip_header=1; + + -- + create or replace stage formula1_stage + file_format = csvformat + url = 's3://formula1-dbt-cloud-python-demo/formula1-kaggle-data/'; + + -- load in the 8 tables we need for our demo + -- we are first creating the table then copying our data in from s3 + -- think of this as an empty container or shell that we are then filling + create or replace table formula1.raw.circuits ( + CIRCUITID NUMBER(38,0), + CIRCUITREF VARCHAR(16777216), + NAME VARCHAR(16777216), + LOCATION VARCHAR(16777216), + COUNTRY VARCHAR(16777216), + LAT FLOAT, + LNG FLOAT, + ALT NUMBER(38,0), + URL VARCHAR(16777216) + ); + -- copy our data from public s3 bucket into our tables + copy into circuits + from @formula1_stage/circuits.csv + on_error='continue'; + + create or replace table formula1.raw.constructors ( + CONSTRUCTORID NUMBER(38,0), + CONSTRUCTORREF VARCHAR(16777216), + NAME VARCHAR(16777216), + NATIONALITY VARCHAR(16777216), + URL VARCHAR(16777216) + ); + copy into constructors + from @formula1_stage/constructors.csv + on_error='continue'; + + create or replace table formula1.raw.drivers ( + DRIVERID NUMBER(38,0), + DRIVERREF VARCHAR(16777216), + NUMBER VARCHAR(16777216), + CODE VARCHAR(16777216), + FORENAME VARCHAR(16777216), + SURNAME VARCHAR(16777216), + DOB DATE, + NATIONALITY VARCHAR(16777216), + URL VARCHAR(16777216) + ); + copy into drivers + from @formula1_stage/drivers.csv + on_error='continue'; + + create or replace table formula1.raw.lap_times ( + RACEID NUMBER(38,0), + DRIVERID NUMBER(38,0), + LAP NUMBER(38,0), + POSITION FLOAT, + TIME VARCHAR(16777216), + MILLISECONDS NUMBER(38,0) + ); + copy into lap_times + from @formula1_stage/lap_times.csv + on_error='continue'; + + create or replace table formula1.raw.pit_stops ( + RACEID NUMBER(38,0), + DRIVERID NUMBER(38,0), + STOP NUMBER(38,0), + LAP NUMBER(38,0), + TIME VARCHAR(16777216), + DURATION VARCHAR(16777216), + MILLISECONDS NUMBER(38,0) + ); + copy into pit_stops + from @formula1_stage/pit_stops.csv + on_error='continue'; + + create or replace table formula1.raw.races ( + RACEID NUMBER(38,0), + YEAR NUMBER(38,0), + ROUND NUMBER(38,0), + CIRCUITID 
NUMBER(38,0), + NAME VARCHAR(16777216), + DATE DATE, + TIME VARCHAR(16777216), + URL VARCHAR(16777216), + FP1_DATE VARCHAR(16777216), + FP1_TIME VARCHAR(16777216), + FP2_DATE VARCHAR(16777216), + FP2_TIME VARCHAR(16777216), + FP3_DATE VARCHAR(16777216), + FP3_TIME VARCHAR(16777216), + QUALI_DATE VARCHAR(16777216), + QUALI_TIME VARCHAR(16777216), + SPRINT_DATE VARCHAR(16777216), + SPRINT_TIME VARCHAR(16777216) + ); + copy into races + from @formula1_stage/races.csv + on_error='continue'; + + create or replace table formula1.raw.results ( + RESULTID NUMBER(38,0), + RACEID NUMBER(38,0), + DRIVERID NUMBER(38,0), + CONSTRUCTORID NUMBER(38,0), + NUMBER NUMBER(38,0), + GRID NUMBER(38,0), + POSITION FLOAT, + POSITIONTEXT VARCHAR(16777216), + POSITIONORDER NUMBER(38,0), + POINTS NUMBER(38,0), + LAPS NUMBER(38,0), + TIME VARCHAR(16777216), + MILLISECONDS NUMBER(38,0), + FASTESTLAP NUMBER(38,0), + RANK NUMBER(38,0), + FASTESTLAPTIME VARCHAR(16777216), + FASTESTLAPSPEED FLOAT, + STATUSID NUMBER(38,0) + ); + copy into results + from @formula1_stage/results.csv + on_error='continue'; + + create or replace table formula1.raw.status ( + STATUSID NUMBER(38,0), + STATUS VARCHAR(16777216) + ); + copy into status + from @formula1_stage/status.csv + on_error='continue'; + + ``` + +5. Ensure all the commands are selected before running the query — an easy way to do this is to use Ctrl-a to highlight all of the code in the worksheet. Select **run** (blue triangle icon). Notice how the dot next to your **COMPUTE_WH** turns from gray to green as you run the query. The **status** table is the final table of all 8 tables loaded in. + + + +6. Let’s unpack that pretty long query we ran into component parts. We ran this query to load in our 8 Formula 1 tables from a public S3 bucket. To do this, we: + - Created a new database called `formula1` and a schema called `raw` to place our raw (untransformed) data into. + - Defined our file format for our CSV files. Importantly, here we use a parameter called `field_optionally_enclosed_by =` since the string columns in our Formula 1 csv files use quotes. Quotes are used around string values to avoid parsing issues where commas `,` and new lines `/n` in data values could cause data loading errors. + - Created a stage to locate our data we are going to load in. Snowflake Stages are locations where data files are stored. Stages are used to both load and unload data to and from Snowflake locations. Here we are using an external stage, by referencing an S3 bucket. + - Created our tables for our data to be copied into. These are empty tables with the column name and data type. Think of this as creating an empty container that the data will then fill into. + - Used the `copy into` statement for each of our tables. We reference our staged location we created and upon loading errors continue to load in the rest of the data. You should not have data loading errors but if you do, those rows will be skipped and Snowflake will tell you which rows caused errors + +7. Now let's take a look at some of our cool Formula 1 data we just loaded up! + 1. Create a new worksheet by selecting the **+** then **New Worksheet**. + + 2. Navigate to **Database > Formula1 > RAW > Tables**. + 3. Query the data using the following code. There are only 76 rows in the circuits table, so we don’t need to worry about limiting the amount of data we query. + + ```sql + select * from formula1.raw.circuits + ``` + + 4. Run the query. 
From here on out, we’ll use the keyboard shortcuts Command-Enter or Control-Enter to run queries and won’t explicitly call out this step. + 5. Review the query results, you should see information about Formula 1 circuits, starting with Albert Park in Australia! + 6. Finally, ensure you have all 8 tables starting with `CIRCUITS` and ending with `STATUS`. Now we are ready to connect into dbt Cloud! + + + +## Configure dbt Cloud + +1. We are going to be using [Snowflake Partner Connect](https://docs.snowflake.com/en/user-guide/ecosystem-partner-connect.html) to set up a dbt Cloud account. Using this method will allow you to spin up a fully fledged dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [managed repository](/docs/collaborate/git/managed-repository), environments, and credentials already established. +2. Navigate out of your worksheet back by selecting **home**. +3. In Snowsight, confirm that you are using the **ACCOUNTADMIN** role. +4. Navigate to the **Admin** **> Partner Connect**. Find **dbt** either by using the search bar or navigating the **Data Integration**. Select the **dbt** tile. + +5. You should now see a new window that says **Connect to dbt**. Select **Optional Grant** and add the `FORMULA1` database. This will grant access for your new dbt user role to the FORMULA1 database. + + +6. Ensure the `FORMULA1` is present in your optional grant before clicking **Connect**.  This will create a dedicated dbt user, database, warehouse, and role for your dbt Cloud trial. + + + +7. When you see the **Your partner account has been created** window, click **Activate**. + +8. You should be redirected to a dbt Cloud registration page. Fill out the form. Make sure to save the password somewhere for login in the future. + + + +9. Select **Complete Registration**. You should now be redirected to your dbt Cloud account, complete with a connection to your Snowflake account, a deployment and a development environment, and a sample job. + +10. To help you version control your dbt project, we have connected it to a [managed repository](/docs/collaborate/git/managed-repository), which means that dbt Labs will be hosting your repository for you. This will give you access to a Git workflow without you having to create and host the repository yourself. You will not need to know Git for this workshop; dbt Cloud will help guide you through the workflow. In the future, when you’re developing your own project, [feel free to use your own repository](/docs/cloud/git/connect-github). This will allow you to learn more about features like [Slim CI](/docs/deploy/continuous-integration) builds after this workshop. + +## Change development schema name navigate the IDE + +1. First we are going to change the name of our default schema to where our dbt models will build. By default, the name is `dbt_`. We will change this to `dbt_` to create your own personal development schema. To do this, select **Profile Settings** from the gear icon in the upper right. + + + +2. Navigate to the **Credentials** menu and select **Partner Connect Trial**, which will expand the credentials menu. + + + +3. Click **Edit** and change the name of your schema from `dbt_` to `dbt_YOUR_NAME` replacing `YOUR_NAME` with your initials and name (`hwatson` is used in the lab screenshots). Be sure to click **Save** for your changes! + + +4. We now have our own personal development schema, amazing! When we run our first dbt models they will build into this schema. +5. 
Let’s open up dbt Cloud’s Integrated Development Environment (IDE) and familiarize ourselves. Choose **Develop** at the top of the UI. + +6. When the IDE is done loading, click **Initialize dbt project**. The initialization process creates a collection of files and folders necessary to run your dbt project. + + +7. After the initialization is finished, you can view the files and folders in the file tree menu. As we move through the workshop we'll be sure to touch on a few key files and folders that we'll work with to build out our project. +8. Next click **Commit and push** to commit the new files and folders from the initialize step. We always want our commit messages to be relevant to the work we're committing, so be sure to provide a message like `initialize project` and select **Commit Changes**. + + + + + +9. [Committing](https://www.atlassian.com/git/tutorials/saving-changes/git-commit) your work here will save it to the managed git repository that was created during the Partner Connect signup. This initial commit is the only commit that will be made directly to our `main` branch and from *here on out we'll be doing all of our work on a development branch*. This allows us to keep our development work separate from our production code. +10. There are a couple of key features to point out about the IDE before we get to work. It is a text editor, an SQL and Python runner, and a CLI with Git version control all baked into one package! This allows you to focus on editing your SQL and Python files, previewing the results with the SQL runner (it even runs Jinja!), and building models at the command line without having to move between different applications. The Git workflow in dbt Cloud allows both Git beginners and experts alike to be able to easily version control all of their work with a couple clicks. + + + +11. Let's run our first dbt models! Two example models are included in your dbt project in the `models/examples` folder that we can use to illustrate how to run dbt at the command line. Type `dbt run` into the command line and click **Enter** on your keyboard. When the run bar expands you'll be able to see the results of the run, where you should see the run complete successfully. + + + +12. The run results allow you to see the code that dbt compiles and sends to Snowflake for execution. To view the logs for this run, select one of the model tabs using the  **>** icon and then **Details**. If you scroll down a bit you'll be able to see the compiled code and how dbt interacts with Snowflake. Given that this run took place in our development environment, the models were created in your development schema. + + + +13. Now let's switch over to Snowflake to confirm that the objects were actually created. Click on the three dots **…** above your database objects and then **Refresh**. Expand the **PC_DBT_DB** database and you should see your development schema. Select the schema, then **Tables**  and **Views**. Now you should be able to see `MY_FIRST_DBT_MODEL` as a table and `MY_SECOND_DBT_MODEL` as a view. + + +## Create branch and set up project configs + +In this step, we’ll need to create a development branch and set up project level configurations. + +1. To get started with development for our project, we'll need to create a new Git branch for our work. Select **create branch** and name your development branch. We'll call our branch `snowpark_python_workshop` then click **Submit**. +2. The first piece of development we'll do on the project is to update the `dbt_project.yml` file. 
Every dbt project requires a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. The [dbt_project.yml](/reference/dbt_project.yml) file also contains important information that tells dbt how to operate on your project. +3. Select the `dbt_project.yml` file from the file tree to open it and replace all of the existing contents with the following code below. When you're done, save the file by clicking **save**. You can also use the Command-S or Control-S shortcut from here on out. + + ```yaml + # Name your project! Project names should contain only lowercase characters + # and underscores. A good package name should reflect your organization's + # name or the intended use of these models + name: 'snowflake_dbt_python_formula1' + version: '1.3.0' + require-dbt-version: '>=1.3.0' + config-version: 2 + + # This setting configures which "profile" dbt uses for this project. + profile: 'default' + + # These configurations specify where dbt should look for different types of files. + # The `model-paths` config, for example, states that models in this project can be + # found in the "models/" directory. You probably won't need to change these! + model-paths: ["models"] + analysis-paths: ["analyses"] + test-paths: ["tests"] + seed-paths: ["seeds"] + macro-paths: ["macros"] + snapshot-paths: ["snapshots"] + + target-path: "target" # directory which will store compiled SQL files + clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + + models: + snowflake_dbt_python_formula1: + staging: + + +docs: + node_color: "CadetBlue" + marts: + +materialized: table + aggregates: + +docs: + node_color: "Maroon" + +tags: "bi" + + core: + +docs: + node_color: "#800080" + intermediate: + +docs: + node_color: "MediumSlateBlue" + ml: + prep: + +docs: + node_color: "Indigo" + train_predict: + +docs: + node_color: "#36454f" + + ``` + +4. The key configurations to point out in the file with relation to the work that we're going to do are in the `models` section. + - `require-dbt-version` — Tells dbt which version of dbt to use for your project. We are requiring 1.3.0 and any newer version to run python models and node colors. + - `materialized` — Tells dbt how to materialize models when compiling the code before it pushes it down to Snowflake. All models in the `marts` folder will be built as tables. + - `tags` — Applies tags at a directory level to all models. All models in the `aggregates` folder will be tagged as `bi` (abbreviation for business intelligence). + - `docs` — Specifies the `node_color` either by the plain color name or a hex value. +5. [Materializations](/docs/build/materializations) are strategies for persisting dbt models in a warehouse, with `tables` and `views` being the most commonly utilized types. By default, all dbt models are materialized as views and other materialization types can be configured in the `dbt_project.yml` file or in a model itself. It’s very important to note *Python models can only be materialized as tables or incremental models.* Since all our Python models exist under `marts`, the following portion of our `dbt_project.yml` ensures no errors will occur when we run our Python models. Starting with [dbt version 1.4](/docs/dbt-versions/core-upgrade/upgrading-to-v1.4#updates-to-python-models), Python files will automatically get materialized as tables even if not explicitly specified. 
+ + ```yaml + marts:     + +materialized: table + ``` + +## Create folders and organize files + +dbt Labs has developed a [project structure guide](/best-practices/how-we-structure/1-guide-overview/) that contains a number of recommendations for how to build the folder structure for your project. Do check out that guide if you want to learn more. Right now we are going to create some folders to organize our files: + +- Sources — This is our Formula 1 dataset and it will be defined in a source YAML file. +- Staging models — These models have a 1:1 with their source table. +- Intermediate — This is where we will be joining some Formula staging models. +- Marts models — Here is where we perform our major transformations. It contains these subfolders: + - aggregates + - core + - ml + +1. In your file tree, use your cursor and hover over the `models` subdirectory, click the three dots **…** that appear to the right of the folder name, then select **Create Folder**. We're going to add two new folders to the file path, `staging` and `formula1` (in that order) by typing `staging/formula1` into the file path. + + + + + - If you click into your `models` directory now, you should see the new `staging` folder nested within `models` and the `formula1` folder nested within `staging`. +2. Create two additional folders the same as the last step. Within the `models` subdirectory, create new directories `marts/core`. + +3. We will need to create a few more folders and subfolders using the UI. After you create all the necessary folders, your folder tree should look like this when it's all done: + + + +Remember you can always reference the entire project in [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1) to view the complete folder and file strucutre. + +## Create source and staging models + +In this section, we are going to create our source and staging models. + +Sources allow us to create a dependency between our source database object and our staging models which will help us when we look at later. Also, if your source changes database or schema, you only have to update it in your `f1_sources.yml` file rather than updating all of the models it might be used in. + +Staging models are the base of our project, where we bring all the individual components we're going to use to build our more complex and useful models into the project. + +Since we want to focus on dbt and Python in this workshop, check out our [sources](/docs/build/sources) and [staging](/best-practices/how-we-structure/2-staging) docs if you want to learn more (or take our [dbt Fundamentals](https://courses.getdbt.com/collections) course which covers all of our core functionality). + +### 1. Create sources + +We're going to be using each of our 8 Formula 1 tables from our `formula1` database under the `raw`  schema for our transformations and we want to create those tables as sources in our project. + +1. Create a new file called `f1_sources.yml` with the following file path: `models/staging/formula1/f1_sources.yml`. +2. Then, paste the following code into the file before saving it: + +```yaml +version: 2 + +sources: + - name: formula1 + description: formula 1 datasets with normalized tables + database: formula1 + schema: raw + tables: + - name: circuits + description: One record per circuit, which is the specific race course. + columns: + - name: circuitid + tests: + - unique + - not_null + - name: constructors + description: One record per constructor. 
Constructors are the teams that build their formula 1 cars. + columns: + - name: constructorid + tests: + - unique + - not_null + - name: drivers + description: One record per driver. This table gives details about the driver. + columns: + - name: driverid + tests: + - unique + - not_null + - name: lap_times + description: One row per lap in each race. Lap times started being recorded in this dataset in 1984 and joined through driver_id. + - name: pit_stops + description: One row per pit stop. Pit stops do not have their own id column, the combination of the race_id and driver_id identify the pit stop. + columns: + - name: stop + tests: + - accepted_values: + values: [1,2,3,4,5,6,7,8] + quote: false + - name: races + description: One race per row. Importantly this table contains the race year to understand trends. + columns: + - name: raceid + tests: + - unique + - not_null + - name: results + columns: + - name: resultid + tests: + - unique + - not_null + description: One row per result. The main table that we join out for grid and position variables. + - name: status + description: One status per row. The status contextualizes whether the race was finished or what issues arose e.g. collisions, engine, etc. + columns: + - name: statusid + tests: + - unique + - not_null +``` + +### 2. Create staging models + +The next step is to set up the staging models for each of the 8 source tables. Given the one-to-one relationship between staging models and their corresponding source tables, we'll build 8 staging models here. We know it’s a lot and in the future, we will seek to update the workshop to make this step less repetitive and more efficient. This step is also a good representation of the real world of data, where you have multiple hierarchical tables that you will need to join together! + +1. Let's go in alphabetical order to easily keep track of all our staging models! Create a new file called `stg_f1_circuits.sql` with this file path `models/staging/formula1/stg_f1_circuits.sql`. Then, paste the following code into the file before saving it: + + ```sql + with + + source as ( + + select * from {{ source('formula1','circuits') }} + + ), + + renamed as ( + select + circuitid as circuit_id, + circuitref as circuit_ref, + name as circuit_name, + location, + country, + lat as latitude, + lng as longitude, + alt as altitude + -- omit the url + from source + ) + select * from renamed + ``` + + All we're doing here is pulling the source data into the model using the `source` function, renaming some columns, and omitting the column `url` with a commented note since we don’t need it for our analysis. + +1. Create `stg_f1_constructors.sql` with this file path `models/staging/formula1/stg_f1_constructors.sql`. Paste the following code into it before saving the file: + + ```sql + with + + source as ( + + select * from {{ source('formula1','constructors') }} + + ), + + renamed as ( + select + constructorid as constructor_id, + constructorref as constructor_ref, + name as constructor_name, + nationality as constructor_nationality + -- omit the url + from source + ) + + select * from renamed + ``` + + We have 6 other stages models to create. We can do this by creating new files, then copy and paste the code into our `staging` folder. + +1. 
Create `stg_f1_drivers.sql` with this file path `models/staging/formula1/stg_f1_drivers.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','drivers') }} + + ), + + renamed as ( + select + driverid as driver_id, + driverref as driver_ref, + number as driver_number, + code as driver_code, + forename, + surname, + dob as date_of_birth, + nationality as driver_nationality + -- omit the url + from source + ) + + select * from renamed + ``` + +1. Create `stg_f1_lap_times.sql` with this file path `models/staging/formula1/stg_f1_lap_times.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','lap_times') }} + + ), + + renamed as ( + select + raceid as race_id, + driverid as driver_id, + lap, + position, + time as lap_time_formatted, + milliseconds as lap_time_milliseconds + from source + ) + + select * from renamed + ``` + +1. Create `stg_f1_pit_stops.sql` with this file path `models/staging/formula1/stg_f1_pit_stops.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','pit_stops') }} + + ), + + renamed as ( + select + raceid as race_id, + driverid as driver_id, + stop as stop_number, + lap, + time as lap_time_formatted, + duration as pit_stop_duration_seconds, + milliseconds as pit_stop_milliseconds + from source + ) + + select * from renamed + order by pit_stop_duration_seconds desc + ``` + +1. Create `stg_f1_races.sql` with this file path `models/staging/formula1/stg_f1_races.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','races') }} + + ), + + renamed as ( + select + raceid as race_id, + year as race_year, + round as race_round, + circuitid as circuit_id, + name as circuit_name, + date as race_date, + to_time(time) as race_time, + -- omit the url + fp1_date as free_practice_1_date, + fp1_time as free_practice_1_time, + fp2_date as free_practice_2_date, + fp2_time as free_practice_2_time, + fp3_date as free_practice_3_date, + fp3_time as free_practice_3_time, + quali_date as qualifying_date, + quali_time as qualifying_time, + sprint_date, + sprint_time + from source + ) + + select * from renamed + ``` + +1. Create `stg_f1_results.sql` with this file path `models/staging/formula1/stg_f1_results.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','results') }} + + ), + + renamed as ( + select + resultid as result_id, + raceid as race_id, + driverid as driver_id, + constructorid as constructor_id, + number as driver_number, + grid, + position::int as position, + positiontext as position_text, + positionorder as position_order, + points, + laps, + time as results_time_formatted, + milliseconds as results_milliseconds, + fastestlap as fastest_lap, + rank as results_rank, + fastestlaptime as fastest_lap_time_formatted, + fastestlapspeed::decimal(6,3) as fastest_lap_speed, + statusid as status_id + from source + ) + + select * from renamed + ``` + +1. Last one! Create `stg_f1_status.sql` with this file path: `models/staging/formula1/stg_f1_status.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','status') }} + + ), + + renamed as ( + select + statusid as status_id, + status + from source + ) + + select * from renamed + ``` + + After the source and all the staging models are complete for each of the 8 tables, your staging folder should look like this: + + + +1. 
It’s a good time to delete our example folder since these two models are extraneous to our formula1 pipeline and `my_first_model` fails a `not_null` test that we won’t spend time investigating. dbt Cloud will warn us that this folder will be permanently deleted, and we are okay with that so select **Delete**.
+
+
+
+1. Now that the staging models are built and saved, it's time to create the models in our development schema in Snowflake. To do this we're going to enter `dbt build` into the command line to run all of the models in our project, which includes our 8 new staging models.
+
+    Your run should complete successfully and you should see green checkmarks next to all of your models in the run results. We built our 8 staging models as views and ran 13 source tests that we configured in the `f1_sources.yml` file with not that much code, pretty cool!
+
+
+
+    Let's take a quick look in Snowflake, refresh database objects, open our development schema, and confirm that the new models are there. If you can see them, then we're good to go!
+
+
+
+    Before we move onto the next section, be sure to commit your new models to your Git branch. Click **Commit and push** and give your commit a message like `profile, sources, and staging setup` before moving on.
+
+## Transform SQL
+
+Now that we have all our sources and staging models done, it's time to move into where dbt shines — transformation!
+
+We need to:
+
+- Create some intermediate tables to join tables that aren’t hierarchical
+- Create core tables for business intelligence (BI) tool ingestion
+- Answer two questions about our Formula 1 data by creating aggregate models using Python:
+    - fastest pit stops
+    - lap time trends over the years
+
+### Intermediate models
+
+We need to join lots of reference tables to our results table to create a human-readable dataframe. What does this mean? For example, we don’t want only the numeric `status_id` in our table; we want to be able to read from a row of data that a driver could not finish a race due to engine failure (`status_id=5`).
+
+By now, we are pretty good at creating new files in the correct directories so we won’t cover this in detail. All intermediate models should be created in the path `models/intermediate`.
+
+1. Create a new file called `int_lap_times_years.sql`. In this model, we are joining our lap time and race information so we can look at lap times over years. In earlier Formula 1 eras, lap times were not recorded (only final results), so we filter out records where lap times are null.
+
+    ```sql
+    with lap_times as (
+
+        select * from {{ ref('stg_f1_lap_times') }}
+
+    ),
+
+    races as (
+
+        select * from {{ ref('stg_f1_races') }}
+
+    ),
+
+    expanded_lap_times_by_year as (
+        select
+            lap_times.race_id,
+            driver_id,
+            race_year,
+            lap,
+            lap_time_milliseconds
+        from lap_times
+        left join races
+            on lap_times.race_id = races.race_id
+        where lap_time_milliseconds is not null
+    )
+
+    select * from expanded_lap_times_by_year
+    ```
+
+2. Create a file called `int_pit_stops.sql`. Pit stops have a many-to-one (M:1) relationship with our races. We are creating a feature called `total_pit_stops_per_race` by partitioning over our `race_id` and `driver_id`, while preserving individual-level pit stops for the rolling average in our next section.
+ + ```sql + with stg_f1__pit_stops as + ( + select * from {{ ref('stg_f1_pit_stops') }} + ), + + pit_stops_per_race as ( + select + race_id, + driver_id, + stop_number, + lap, + lap_time_formatted, + pit_stop_duration_seconds, + pit_stop_milliseconds, + max(stop_number) over (partition by race_id,driver_id) as total_pit_stops_per_race + from stg_f1__pit_stops + ) + + select * from pit_stops_per_race + ``` + +3. Create a file called `int_results.sql`. Here we are using 4 of our tables — `races`, `drivers`, `constructors`, and `status` — to give context to our `results` table. We are now able to calculate a new feature `drivers_age_years` by bringing the `date_of_birth` and `race_year` into the same table. We are also creating a column to indicate if the driver did not finish (dnf) the race, based upon if their `position` was null called, `dnf_flag`. + + ```sql + with results as ( + + select * from {{ ref('stg_f1_results') }} + + ), + + races as ( + + select * from {{ ref('stg_f1_races') }} + + ), + + drivers as ( + + select * from {{ ref('stg_f1_drivers') }} + + ), + + constructors as ( + + select * from {{ ref('stg_f1_constructors') }} + ), + + status as ( + + select * from {{ ref('stg_f1_status') }} + ), + + int_results as ( + select + result_id, + results.race_id, + race_year, + race_round, + circuit_id, + circuit_name, + race_date, + race_time, + results.driver_id, + results.driver_number, + forename ||' '|| surname as driver, + cast(datediff('year', date_of_birth, race_date) as int) as drivers_age_years, + driver_nationality, + results.constructor_id, + constructor_name, + constructor_nationality, + grid, + position, + position_text, + position_order, + points, + laps, + results_time_formatted, + results_milliseconds, + fastest_lap, + results_rank, + fastest_lap_time_formatted, + fastest_lap_speed, + results.status_id, + status, + case when position is null then 1 else 0 end as dnf_flag + from results + left join races + on results.race_id=races.race_id + left join drivers + on results.driver_id = drivers.driver_id + left join constructors + on results.constructor_id = constructors.constructor_id + left join status + on results.status_id = status.status_id + ) + + select * from int_results + ``` + +1. Create a *Markdown* file `intermediate.md` that we will go over in depth in the Test and Documentation sections of the [Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake](/guides/dbt-python-snowpark) guide. + + ```markdown + # the intent of this .md is to allow for multi-line long form explanations for our intermediate transformations + + # below are descriptions + {% docs int_results %} In this query we want to join out other important information about the race results to have a human readable table about results, races, drivers, constructors, and status. + We will have 4 left joins onto our results table. {% enddocs %} + + {% docs int_pit_stops %} There are many pit stops within one race, aka a M:1 relationship. + We want to aggregate this so we can properly join pit stop information without creating a fanout. {% enddocs %} + + {% docs int_lap_times_years %} Lap times are done per lap. We need to join them out to the race year to understand yearly lap time trends. {% enddocs %} + ``` + +1. Create a *YAML* file `intermediate.yml` that we will go over in depth during the Test and Document sections of the [Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake](/guides/dbt-python-snowpark) guide. 
+ + ```yaml + version: 2 + + models: + - name: int_results + description: '{{ doc("int_results") }}' + - name: int_pit_stops + description: '{{ doc("int_pit_stops") }}' + - name: int_lap_times_years + description: '{{ doc("int_lap_times_years") }}' + ``` + + That wraps up the intermediate models we need to create our core models! + +### Core models + +1. Create a file `fct_results.sql`. This is what I like to refer to as the “mega table” — a really large denormalized table with all our context added in at row level for human readability. Importantly, we have a table `circuits` that is linked through the table `races`. When we joined `races` to `results` in `int_results.sql` we allowed our tables to make the connection from `circuits` to `results` in `fct_results.sql`. We are only taking information about pit stops at the result level so our join would not cause a [fanout](https://community.looker.com/technical-tips-tricks-1021/what-is-a-fanout-23327). + + ```sql + with int_results as ( + + select * from {{ ref('int_results') }} + + ), + + int_pit_stops as ( + select + race_id, + driver_id, + max(total_pit_stops_per_race) as total_pit_stops_per_race + from {{ ref('int_pit_stops') }} + group by 1,2 + ), + + circuits as ( + + select * from {{ ref('stg_f1_circuits') }} + ), + base_results as ( + select + result_id, + int_results.race_id, + race_year, + race_round, + int_results.circuit_id, + int_results.circuit_name, + circuit_ref, + location, + country, + latitude, + longitude, + altitude, + total_pit_stops_per_race, + race_date, + race_time, + int_results.driver_id, + driver, + driver_number, + drivers_age_years, + driver_nationality, + constructor_id, + constructor_name, + constructor_nationality, + grid, + position, + position_text, + position_order, + points, + laps, + results_time_formatted, + results_milliseconds, + fastest_lap, + results_rank, + fastest_lap_time_formatted, + fastest_lap_speed, + status_id, + status, + dnf_flag + from int_results + left join circuits + on int_results.circuit_id=circuits.circuit_id + left join int_pit_stops + on int_results.driver_id=int_pit_stops.driver_id and int_results.race_id=int_pit_stops.race_id + ) + + select * from base_results + ``` + +1. Create the file `pit_stops_joined.sql`. Our results and pit stops are at different levels of dimensionality (also called grain). Simply put, we have multiple pit stops per a result. Since we are interested in understanding information at the pit stop level with information about race year and constructor, we will create a new table `pit_stops_joined.sql` where each row is per pit stop. Our new table tees up our aggregation in Python. + + ```sql + with base_results as ( + + select * from {{ ref('fct_results') }} + + ), + + pit_stops as ( + + select * from {{ ref('int_pit_stops') }} + + ), + + pit_stops_joined as ( + + select + base_results.race_id, + race_year, + base_results.driver_id, + constructor_id, + constructor_name, + stop_number, + lap, + lap_time_formatted, + pit_stop_duration_seconds, + pit_stop_milliseconds + from base_results + left join pit_stops + on base_results.race_id=pit_stops.race_id and base_results.driver_id=pit_stops.driver_id + ) + select * from pit_stops_joined + ``` + +1. Enter in the command line and execute `dbt build` to build out our entire pipeline to up to this point. Don’t worry about “overriding” your previous models – dbt workflows are designed to be idempotent so we can run them again and expect the same results. + +1. Let’s talk about our lineage so far. It’s looking good 😎. 
We’ve shown how SQL can be used to make data type and column name changes, and handle hierarchical joins really well; all while building out our automated lineage!
+
+
+
+1. Time to **Commit and push** our changes and give your commit a message like `intermediate and fact models` before moving on.
+
+## Running dbt Python models
+
+Up until now, SQL has been driving the project (car pun intended) for data cleaning and hierarchical joining. Now it’s time for Python to take the wheel (car pun still intended) for the rest of our lab! For more information about running Python models on dbt, check out our [docs](/docs/build/python-models). To learn more about how dbt Python models work under the hood, check out [Snowpark for Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html), which makes running dbt Python models possible.
+
+There are quite a few differences between SQL and Python in terms of the dbt syntax and DDL, so we’ll be breaking our code and model runs down further for our Python models.
+
+### Pit stop analysis
+
+First, we want to find out: which constructor had the fastest pit stops in 2021? (A constructor is a Formula 1 team that builds or “constructs” the car.)
+
+1. Create a new file called `fastest_pit_stops_by_constructor.py` in our `aggregates` folder (this is the first time we are using the `.py` extension!).
+2. Copy the following code into the file:
+
+    ```python
+    import numpy as np
+    import pandas as pd
+
+    def model(dbt, session):
+        # dbt configuration
+        dbt.config(packages=["pandas","numpy"])
+
+        # get upstream data
+        pit_stops_joined = dbt.ref("pit_stops_joined").to_pandas()
+
+        # provide year so we do not hardcode dates
+        year=2021
+
+        # describe the data
+        pit_stops_joined["PIT_STOP_SECONDS"] = pit_stops_joined["PIT_STOP_MILLISECONDS"]/1000
+        fastest_pit_stops = pit_stops_joined[(pit_stops_joined["RACE_YEAR"]==year)].groupby(by="CONSTRUCTOR_NAME")["PIT_STOP_SECONDS"].describe().sort_values(by='mean')
+        fastest_pit_stops.reset_index(inplace=True)
+        fastest_pit_stops.columns = fastest_pit_stops.columns.str.upper()
+
+        return fastest_pit_stops.round(2)
+    ```
+
+3. Let’s break down what this code is doing step by step:
+    - First, we are importing the Python libraries that we are using. A *library* is a reusable chunk of code that someone else wrote that you may want to include in your programs/projects. We are using `numpy` and `pandas` in this Python model. This is similar to a dbt *package*, but our Python libraries do *not* persist across the entire project.
+    - Defining a function called `model` with the parameters `dbt` and `session`. The parameter `dbt` is a class compiled by dbt, which enables you to run your Python code in the context of your dbt project and DAG. The parameter `session` is a class representing your Snowflake connection to the Python backend. The `model` function *must return a single DataFrame*. You can see that all of the data transformation happens within the body of the `model` function that the `return` statement is tied to.
+    - Then, within our dbt model, we are passing in a configuration of which packages we need using `dbt.config(packages=["pandas","numpy"])`.
+    - Use the `.ref()` function to retrieve the data frame `pit_stops_joined` that we created in our last step using SQL. We cast this to a pandas dataframe (by default it's a Snowpark dataframe).
+    - Create a variable named `year` so we aren’t passing a hardcoded value.
+    - Generate a new column called `PIT_STOP_SECONDS` by dividing the value of `PIT_STOP_MILLISECONDS` by 1000.
+    - Create our final data frame `fastest_pit_stops` that holds the records where the year is equal to our year variable (2021 in this case), then group the data frame by `CONSTRUCTOR_NAME` and use the `describe()` and `sort_values()` methods to summarize each constructor’s pit stops and sort by the mean in ascending order. This makes the first row in the new aggregated data frame the team with the fastest pit stops over an entire competition year.
+    - Next, we reset the index of the `fastest_pit_stops` data frame. The `reset_index()` method allows you to reset the index back to the default 0, 1, 2, etc. indexes. By default, this method will keep the "old" indexes in a column named "index"; to avoid this, use the drop parameter. Think of this as keeping your data “flat and square” as opposed to “tiered”. If you are new to Python, now might be a good time to [learn about indexes for 5 minutes](https://towardsdatascience.com/the-basics-of-indexing-and-slicing-python-lists-2d12c90a94cf) since it's the foundation of how Python retrieves, slices, and dices data. The `inplace` argument means we override the existing data frame permanently. Not to fear! This is what we want to do to avoid dealing with multi-indexed dataframes!
+    - Convert our Python column names to all uppercase using `.upper()`, so Snowflake recognizes them.
+    - Finally, we return our dataframe with all columns rounded to 2 decimal places using the `round()` method.
+4. Zooming out a bit, what are we doing differently here in Python compared to our typical SQL code?
+    - Method chaining is a technique in which multiple methods are called on an object in a single statement, with each method call modifying the result of the previous one. The methods are called in a chain, with the output of one method being used as the input for the next one. The technique is used to simplify the code and make it more readable by eliminating the need for intermediate variables to store intermediate results.
+    - The way you see method chaining in Python is the syntax `.().()`. For example, `.describe().sort_values(by='mean')`, where the `.describe()` method is chained to `.sort_values()`.
+    - The `.describe()` method is used to generate various summary statistics of the dataset. It's used on pandas dataframes, and it gives a quick and easy way to get summary statistics of your dataset without writing multiple lines of code.
+    - The `.sort_values()` method is used to sort a pandas dataframe or a series by one or multiple columns. The method sorts the data by the specified column(s) in ascending or descending order. It is the pandas equivalent to `order by` in SQL.
+
+    We won’t go into as much depth for our subsequent scripts, but we will continue to explain at a high level what new libraries, functions, and methods are doing.
+
+5. Build the model using the UI, which will **execute**:
+
+    ```bash
+    dbt run --select fastest_pit_stops_by_constructor
+    ```
+
+    in the command bar.
+
+    Let’s look at some details of our first Python model to see what our model executed. There are two major differences we can see when running a Python model compared to a SQL model:
+
+    - Our Python model was executed as a stored procedure. Snowflake needs a way to know that it's meant to execute this code in a Python runtime, instead of interpreting it in a SQL runtime. We do this by creating a Python stored proc, called by a SQL command.
+    - The `snowflake-snowpark-python` library has been picked up to execute our Python code.
Even though this wasn’t explicitly stated this is picked up by the dbt class object because we need our Snowpark package to run Python! + + Python models take a bit longer to run than SQL models, however we could always speed this up by using [Snowpark-optimized Warehouses](https://docs.snowflake.com/en/user-guide/warehouses-snowpark-optimized.html) if we wanted to. Our data is sufficiently small, so we won’t worry about creating a separate warehouse for Python versus SQL files today. + + + The rest of our **Details** output gives us information about how dbt and Snowpark for Python are working together to define class objects and apply a specific set of methods to run our models. + + So which constructor had the fastest pit stops in 2021? Let’s look at our data to find out! + +6. We can't preview Python models directly, so let’s create a new file using the **+** button or the Control-n shortcut to create a new scratchpad. +7. Reference our Python model: + + ```sql + select * from {{ ref('fastest_pit_stops_by_constructor') }} + ``` + + and preview the output: + + + Not only did Red Bull have the fastest average pit stops by nearly 40 seconds, they also had the smallest standard deviation, meaning they are both fastest and most consistent teams in pit stops. By using the `.describe()` method we were able to avoid verbose SQL requiring us to create a line of code per column and repetitively use the `PERCENTILE_COUNT()` function. + + Now we want to find the lap time average and rolling average through the years (is it generally trending up or down)? + +8. Create a new file called `lap_times_moving_avg.py` in our `aggregates` folder. +9. Copy the following code into the file: + + ```python + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas"]) + + # get upstream data + lap_times = dbt.ref("int_lap_times_years").to_pandas() + + # describe the data + lap_times["LAP_TIME_SECONDS"] = lap_times["LAP_TIME_MILLISECONDS"]/1000 + lap_time_trends = lap_times.groupby(by="RACE_YEAR")["LAP_TIME_SECONDS"].mean().to_frame() + lap_time_trends.reset_index(inplace=True) + lap_time_trends["LAP_MOVING_AVG_5_YEARS"] = lap_time_trends["LAP_TIME_SECONDS"].rolling(5).mean() + lap_time_trends.columns = lap_time_trends.columns.str.upper() + + return lap_time_trends.round(1) + ``` + +10. Breaking down our code a bit: + - We’re only using the `pandas` library for this model and casting it to a pandas data frame `.to_pandas()`. + - Generate a new column called `LAP_TIMES_SECONDS` by dividing the value of `LAP_TIME_MILLISECONDS` by 1000. + - Create the final dataframe. Get the lap time per year. Calculate the mean series and convert to a data frame. + - Reset the index. + - Calculate the rolling 5 year mean. + - Round our numeric columns to one decimal place. +11. Now, run this model by using the UI **Run model** or + + ```bash + dbt run --select lap_times_moving_avg + ``` + + in the command bar. + +12. Once again previewing the output of our data using the same steps for our `fastest_pit_stops_by_constructor` model. + + + We can see that it looks like lap times are getting consistently faster over time. Then in 2010 we see an increase occur! Using outside subject matter context, we know that significant rule changes were introduced to Formula 1 in 2010 and 2011 causing slower lap times. + +13. Now is a good time to checkpoint and commit our work to Git. Click **Commit and push** and give your commit a message like `aggregate python models` before moving on. 
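+
+If the pandas patterns we’ve been leaning on — method chaining, `.describe()`, `.sort_values()`, `.rolling()`, and `reset_index()` — are new to you, here is a tiny standalone sketch you can run anywhere pandas is installed. It uses made-up numbers rather than our Formula 1 data, purely to show what each call returns:
+
+```python
+import pandas as pd
+
+# toy data, not the Formula 1 dataset
+df = pd.DataFrame({
+    "team": ["a", "a", "a", "b", "b", "b"],
+    "stop_seconds": [2.4, 2.6, 2.5, 3.1, 2.9, 3.0],
+})
+
+# method chaining: describe() summarizes each group, sort_values() orders by the mean
+summary = df.groupby("team")["stop_seconds"].describe().sort_values(by="mean")
+summary.reset_index(inplace=True)  # move "team" out of the index into a regular column
+
+# rolling(): a moving average over a window of 3 rows
+df["rolling_avg_3"] = df["stop_seconds"].rolling(3).mean()
+
+print(summary)
+print(df)
+```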
+ +### The dbt model, .source(), .ref() and .config() functions + +Let’s take a step back before starting machine learning to both review and go more in-depth at the methods that make running dbt python models possible. If you want to know more outside of this lab’s explanation read the documentation [here](/docs/build/python-models?version=1.3). + +- dbt model(dbt, session). For starters, each Python model lives in a .py file in your models/ folder. It defines a function named `model()`, which takes two parameters: + - dbt — A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG. + - session — A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames and to write DataFrames back to tables. In PySpark, by convention, the SparkSession is named spark, and available globally. For consistency across platforms, we always pass it into the model function as an explicit argument called session. +- The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame. +- `.source()` and `.ref()` functions. Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. If you want to read directly from a raw source table, use `dbt.source()`. We saw this in our earlier section using SQL with the source function. These functions have the same execution, but with different syntax. Use the `dbt.ref()` method within a Python model to read data from other models (SQL or Python). These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. +- `.config()`. Just like SQL models, there are three ways to configure Python models: + - In a dedicated `.yml` file, within the `models/` directory + - Within the model's `.py` file, using the `dbt.config()` method + - Calling the `dbt.config()` method will set configurations for your model within your `.py` file, similar to the `{{ config() }} macro` in `.sql` model files: + + ```python + def model(dbt, session): + + # setting configuration + dbt.config(materialized="table") + ``` + - There's a limit to how complex you can get with the `dbt.config()` method. It accepts only literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `.config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the config property in a [YAML file](/reference/resource-properties/config). Learn more about configurations [here](/reference/model-configs). + +## Prepare for machine learning: cleaning, encoding, and splits + +Now that we’ve gained insights and business intelligence about Formula 1 at a descriptive level, we want to extend our capabilities into prediction. We’re going to take the scenario where we censor the data. This means that we will pretend that we will train a model using earlier data and apply it to future data. In practice, this means we’ll take data from 2010-2019 to train our model and then predict 2020 data. + +In this section, we’ll be preparing our data to predict the final race position of a driver. 
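+
+One quick aside before the prep work: the machine learning models we’re about to build set options like `packages`, `materialized`, and `tags` through `dbt.config()`. If you ever outgrow what `dbt.config()` accepts (literal values only), the YAML route described in the previous section is the fallback. A minimal sketch, using a hypothetical model and file name, might look like this:
+
+```yaml
+# models/ml/ml_models.yml — hypothetical file name for illustration
+version: 2
+
+models:
+  - name: my_python_model   # hypothetical model name
+    config:
+      materialized: table
+      tags: ["train"]
+      packages: ["pandas", "scikit-learn"]
+```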
+ +At a high level we’ll be: + +- Creating new prediction features and filtering our dataset to active drivers +- Encoding our data (algorithms like numbers) and simplifying our target variable called `position` +- Splitting our dataset into training, testing, and validation + +### ML data prep + +1. To keep our project organized, we’ll need to create two new subfolders in our `ml` directory. Under the `ml` folder, make the subfolders `prep` and `train_predict`. +2. Create a new file under `ml/prep` called `ml_data_prep`. Copy the following code into the file and **Save**. + + ```python + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas"]) + + # get upstream data + fct_results = dbt.ref("fct_results").to_pandas() + + # provide years so we do not hardcode dates in filter command + start_year=2010 + end_year=2020 + + # describe the data for a full decade + data = fct_results.loc[fct_results['RACE_YEAR'].between(start_year, end_year)] + + # convert string to an integer + data['POSITION'] = data['POSITION'].astype(float) + + # we cannot have nulls if we want to use total pit stops + data['TOTAL_PIT_STOPS_PER_RACE'] = data['TOTAL_PIT_STOPS_PER_RACE'].fillna(0) + + # some of the constructors changed their name over the year so replacing old names with current name + mapping = {'Force India': 'Racing Point', 'Sauber': 'Alfa Romeo', 'Lotus F1': 'Renault', 'Toro Rosso': 'AlphaTauri'} + data['CONSTRUCTOR_NAME'].replace(mapping, inplace=True) + + # create confidence metrics for drivers and constructors + dnf_by_driver = data.groupby('DRIVER').sum()['DNF_FLAG'] + driver_race_entered = data.groupby('DRIVER').count()['DNF_FLAG'] + driver_dnf_ratio = (dnf_by_driver/driver_race_entered) + driver_confidence = 1-driver_dnf_ratio + driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence)) + + dnf_by_constructor = data.groupby('CONSTRUCTOR_NAME').sum()['DNF_FLAG'] + constructor_race_entered = data.groupby('CONSTRUCTOR_NAME').count()['DNF_FLAG'] + constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered) + constructor_relaiblity = 1-constructor_dnf_ratio + constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity)) + + data['DRIVER_CONFIDENCE'] = data['DRIVER'].apply(lambda x:driver_confidence_dict[x]) + data['CONSTRUCTOR_RELAIBLITY'] = data['CONSTRUCTOR_NAME'].apply(lambda x:constructor_relaiblity_dict[x]) + + #removing retired drivers and constructors + active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes', + 'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull', + 'Haas F1 Team'] + active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz', + 'Valtteri Bottas', 'Lance Stroll', 'George Russell', + 'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen', + 'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat', + 'Max Verstappen', 'Pierre Gasly', 'Alexander Albon', + 'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi', + 'Romain Grosjean','Nicholas Latifi'] + + # create flags for active drivers and constructors so we can filter downstream + data['ACTIVE_DRIVER'] = data['DRIVER'].apply(lambda x: int(x in active_drivers)) + data['ACTIVE_CONSTRUCTOR'] = data['CONSTRUCTOR_NAME'].apply(lambda x: int(x in active_constructors)) + + return data + ``` + +3. As usual, let’s break down what we are doing in this Python model: + - We’re first referencing our upstream `fct_results` table and casting it to a pandas dataframe. 
+ - Filtering on years 2010-2020 since we’ll need to clean all our data we are using for prediction (both training and testing). + - Filling in empty data for `total_pit_stops` and making a mapping active constructors and drivers to avoid erroneous predictions + - ⚠️ You might be wondering why we didn’t do this upstream in our `fct_results` table! The reason for this is that we want our machine learning cleanup to reflect the year 2020 for our predictions and give us an up-to-date team name. However, for business intelligence purposes we can keep the historical data at that point in time. Instead of thinking of one table as “one source of truth” we are creating different datasets fit for purpose: one for historical descriptions and reporting and another for relevant predictions. + - Create new confidence features for drivers and constructors + - Generate flags for the constructors and drivers that were active in 2020 +4. Execute the following in the command bar: + + ```bash + dbt run --select ml_data_prep + ``` + +5. There are more aspects we could consider for this project, such as normalizing the driver confidence by the number of races entered. Including this would help account for a driver’s history and consider whether they are a new or long-time driver. We’re going to keep it simple for now, but these are some of the ways we can expand and improve our machine learning dbt projects. Breaking down our machine learning prep model: + - Lambda functions — We use some lambda functions to transform our data without having to create a fully-fledged function using the `def` notation. So what exactly are lambda functions? + - In Python, a lambda function is a small, anonymous function defined using the keyword "lambda". Lambda functions are used to perform a quick operation, such as a mathematical calculation or a transformation on a list of elements. They are often used in conjunction with higher-order functions, such as `apply`, `map`, `filter`, and `reduce`. + - `.apply()` method — We used `.apply()` to pass our functions into our lambda expressions to the columns and perform this multiple times in our code. Let’s explain apply a little more: + - The `.apply()` function in the pandas library is used to apply a function to a specified axis of a DataFrame or a Series. In our case the function we used was our lambda function! + - The `.apply()` function takes two arguments: the first is the function to be applied, and the second is the axis along which the function should be applied. The axis can be specified as 0 for rows or 1 for columns. We are using the default value of 0 so we aren’t explicitly writing it in the code. This means that the function will be applied to each *row* of the DataFrame or Series. +6. Let’s look at the preview of our clean dataframe after running our `ml_data_prep` model: + + +### Covariate encoding + +In this next part, we’ll be performing covariate encoding. Breaking down this phrase a bit, a *covariate* is a variable that is relevant to the outcome of a study or experiment, and *encoding* refers to the process of converting data (such as text or categorical variables) into a numerical format that can be used as input for a model. This is necessary because most machine learning algorithms can only work with numerical data. Algorithms don’t speak languages, have eyes to see images, etc. so we encode our data into numbers so algorithms can perform tasks by using calculations they otherwise couldn’t. + +🧠 We’ll think about this as : “algorithms like numbers”. + +1. 
Create a new file under `ml/prep` called `covariate_encoding` copy the code below and save. + + ```python + import pandas as pd + import numpy as np + from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder + from sklearn.linear_model import LogisticRegression + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas","numpy","scikit-learn"]) + + # get upstream data + data = dbt.ref("ml_data_prep").to_pandas() + + # list out covariates we want to use in addition to outcome variable we are modeling - position + covariates = data[['RACE_YEAR','CIRCUIT_NAME','GRID','CONSTRUCTOR_NAME','DRIVER','DRIVERS_AGE_YEARS','DRIVER_CONFIDENCE','CONSTRUCTOR_RELAIBLITY','TOTAL_PIT_STOPS_PER_RACE','ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR', 'POSITION']] + + # filter covariates on active drivers and constructors + # use fil_cov as short for "filtered_covariates" + fil_cov = covariates[(covariates['ACTIVE_DRIVER']==1)&(covariates['ACTIVE_CONSTRUCTOR']==1)] + + # Encode categorical variables using LabelEncoder + # TODO: we'll update this to both ohe in the future for non-ordinal variables! + le = LabelEncoder() + fil_cov['CIRCUIT_NAME'] = le.fit_transform(fil_cov['CIRCUIT_NAME']) + fil_cov['CONSTRUCTOR_NAME'] = le.fit_transform(fil_cov['CONSTRUCTOR_NAME']) + fil_cov['DRIVER'] = le.fit_transform(fil_cov['DRIVER']) + fil_cov['TOTAL_PIT_STOPS_PER_RACE'] = le.fit_transform(fil_cov['TOTAL_PIT_STOPS_PER_RACE']) + + # Simply target variable "position" to represent 3 meaningful categories in Formula1 + # 1. Podium position 2. Points for team 3. Nothing - no podium or points! + def position_index(x): + if x<4: + return 1 + if x>10: + return 3 + else : + return 2 + + # we are dropping the columns that we filtered on in addition to our training variable + encoded_data = fil_cov.drop(['ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR'],1) + encoded_data['POSITION_LABEL']= encoded_data['POSITION'].apply(lambda x: position_index(x)) + encoded_data_grouped_target = encoded_data.drop(['POSITION'],1) + + return encoded_data_grouped_target + ``` + +2. Execute the following in the command bar: + + ```bash + dbt run --select covariate_encoding + ``` + +3. In this code, we are using a ton of functions from libraries! This is really cool, because we can utilize code other people have developed and bring it into our project simply by using the `import` function. [Scikit-learn](https://scikit-learn.org/stable/), “sklearn” for short, is an extremely popular data science library. Sklearn contains a wide range of machine learning techniques, including supervised and unsupervised learning algorithms, feature scaling and imputation, as well as tools model evaluation and selection. We’ll be using Sklearn for both preparing our covariates and creating models (our next section). +4. Our dataset is pretty small data so we are good to use pandas and `sklearn`. If you have larger data for your own project in mind, consider `dask` or `category_encoders`. +5. Breaking it down a bit more: + - We’re selecting a subset of variables that will be used as predictors for a driver’s position. + - Filter the dataset to only include rows using the active driver and constructor flags we created in the last step. + - The next step is to use the `LabelEncoder` from scikit-learn to convert the categorical variables `CIRCUIT_NAME`, `CONSTRUCTOR_NAME`, `DRIVER`, and `TOTAL_PIT_STOPS_PER_RACE` into numerical values. + - Create a new variable called `POSITION_LABEL`, which is a derived from our position variable. 
+ - 💭 Why are we changing our position variable? There are 20 total positions in Formula 1 and we are grouping them together to simplify the classification and improve performance. We also want to demonstrate you can create a new function within your dbt model! + - Our new `position_label` variable has meaning: + - In Formula1 if you are in: + - Top 3 you get a “podium” position + - Top 10 you gain points that add to your overall season total + - Below top 10 you get no points! + - We are mapping our original variable position to `position_label` to the corresponding places above to 1,2, and 3 respectively. + - Drop the active driver and constructor flags since they were filter criteria and additionally drop our original position variable. + +### Splitting into training and testing datasets + +Now that we’ve cleaned and encoded our data, we are going to further split in by time. In this step, we will create dataframes to use for training and prediction. We’ll be creating two dataframes 1) using data from 2010-2019 for training, and 2) data from 2020 for new prediction inferences. We’ll create variables called `start_year` and `end_year` so we aren’t filtering on hardcasted values (and can more easily swap them out in the future if we want to retrain our model on different timeframes). + +1. Create a file called `train_test_dataset` copy and save the following code: + + ```python + import pandas as pd + + def model(dbt, session): + + # dbt configuration + dbt.config(packages=["pandas"], tags="train") + + # get upstream data + encoding = dbt.ref("covariate_encoding").to_pandas() + + # provide years so we do not hardcode dates in filter command + start_year=2010 + end_year=2019 + + # describe the data for a full decade + train_test_dataset = encoding.loc[encoding['RACE_YEAR'].between(start_year, end_year)] + + return train_test_dataset + ``` + +2. Create a file called `hold_out_dataset_for_prediction` copy and save the following code below. Now we’ll have a dataset with only the year 2020 that we’ll keep as a hold out set that we are going to use similar to a deployment use case. + + ```python + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas"], tags="predict") + + # get upstream data + encoding = dbt.ref("covariate_encoding").to_pandas() + + # variable for year instead of hardcoding it + year=2020 + + # filter the data based on the specified year + hold_out_dataset = encoding.loc[encoding['RACE_YEAR'] == year] + + return hold_out_dataset + ``` + +3. Execute the following in the command bar: + + ```bash + dbt run --select train_test_dataset hold_out_dataset_for_prediction + ``` + + To run our temporal data split models, we can use this syntax in the command line to run them both at once. Make sure you use a *space* [syntax](/reference/node-selection/syntax) between the model names to indicate you want to run both! +4. **Commit and push** our changes to keep saving our work as we go using `ml data prep and splits` before moving on. + +👏 Now that we’ve finished our machine learning prep work we can move onto the fun part — training and prediction! + + +## Training a model to predict in machine learning + +We’re ready to start training a model to predict the driver’s position. Now is a good time to pause and take a step back and say, usually in ML projects you’ll try multiple algorithms during development and use an evaluation method such as cross validation to determine which algorithm to use. 
You can definitely do this in your dbt project, but for the content of this lab we’ll have decided on using a logistic regression to predict position (we actually tried some other algorithms using cross validation outside of this lab such as k-nearest neighbors and a support vector classifier but that didn’t perform as well as the logistic regression and a decision tree that overfit). + +There are 3 areas to break down as we go since we are working at the intersection all within one model file: + +1. Machine Learning +2. Snowflake and Snowpark +3. dbt Python models + +If you haven’t seen code like this before or use joblib files to save machine learning models, we’ll be going over them at a high level and you can explore the links for more technical in-depth along the way! Because Snowflake and dbt have abstracted away a lot of the nitty gritty about serialization and storing our model object to be called again, we won’t go into too much detail here. There’s *a lot* going on here so take it at your pace! + +### Training and saving a machine learning model + +1. Project organization remains key, so let’s make a new subfolder called `train_predict` under the `ml` folder. +2. Now create a new file called `train_test_position` and copy and save the following code: + + ```python + import snowflake.snowpark.functions as F + from sklearn.model_selection import train_test_split + import pandas as pd + from sklearn.metrics import confusion_matrix, balanced_accuracy_score + import io + from sklearn.linear_model import LogisticRegression + from joblib import dump, load + import joblib + import logging + import sys + from joblib import dump, load + + logger = logging.getLogger("mylog") + + def save_file(session, model, path, dest_filename): + input_stream = io.BytesIO() + joblib.dump(model, input_stream) + session._conn.upload_stream(input_stream, path, dest_filename) + return "successfully created file: " + path + + def model(dbt, session): + dbt.config( + packages = ['numpy','scikit-learn','pandas','numpy','joblib','cachetools'], + materialized = "table", + tags = "train" + ) + # Create a stage in Snowflake to save our model file + session.sql('create or replace stage MODELSTAGE').collect() + + #session._use_scoped_temp_objects = False + version = "1.0" + logger.info('Model training version: ' + version) + + # read in our training and testing upstream dataset + test_train_df = dbt.ref("train_test_dataset") + + # cast snowpark df to pandas df + test_train_pd_df = test_train_df.to_pandas() + target_col = "POSITION_LABEL" + + # split out covariate predictors, x, from our target column position_label, y. + split_X = test_train_pd_df.drop([target_col], axis=1) + split_y = test_train_pd_df[target_col] + + # Split out our training and test data into proportions + X_train, X_test, y_train, y_test = train_test_split(split_X, split_y, train_size=0.7, random_state=42) + train = [X_train, y_train] + test = [X_test, y_test] + # now we are only training our one model to deploy + # we are keeping the focus on the workflows and not algorithms for this lab! 
+ model = LogisticRegression() + + # fit the preprocessing pipeline and the model together + model.fit(X_train, y_train) + y_pred = model.predict_proba(X_test)[:,1] + predictions = [round(value) for value in y_pred] + balanced_accuracy = balanced_accuracy_score(y_test, predictions) + + # Save the model to a stage + save_file(session, model, "@MODELSTAGE/driver_position_"+version, "driver_position_"+version+".joblib" ) + logger.info('Model artifact:' + "@MODELSTAGE/driver_position_"+version+".joblib") + + # Take our pandas training and testing dataframes and put them back into snowpark dataframes + snowpark_train_df = session.write_pandas(pd.concat(train, axis=1, join='inner'), "train_table", auto_create_table=True, create_temp_table=True) + snowpark_test_df = session.write_pandas(pd.concat(test, axis=1, join='inner'), "test_table", auto_create_table=True, create_temp_table=True) + + # Union our training and testing data together and add a column indicating train vs test rows + return snowpark_train_df.with_column("DATASET_TYPE", F.lit("train")).union(snowpark_test_df.with_column("DATASET_TYPE", F.lit("test"))) + ``` + +3. Execute the following in the command bar: + + ```bash + dbt run --select train_test_position + ``` + +4. Breaking down our Python script here: + - We’re importing some helpful libraries. + - Defining a function called `save_file()` that takes four parameters: `session`, `model`, `path` and `dest_filename` that will save our logistic regression model file. + - `session` — an object representing a connection to Snowflake. + - `model` — an object that needs to be saved. In this case, it's a Python object that is a scikit-learn that can be serialized with joblib. + - `path` — a string representing the directory or bucket location where the file should be saved. + - `dest_filename` — a string representing the desired name of the file. + - Creating our dbt model + - Within this model we are creating a stage called `MODELSTAGE` to place our logistic regression `joblib` model file. This is really important since we need a place to keep our model to reuse and want to ensure it's there. When using Snowpark commands, it's common to see the `.collect()` method to ensure the action is performed. Think of the session as our “start” and collect as our “end” when [working with Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) (you can use other ending methods other than collect). + - Using `.ref()` to connect into our `train_test_dataset` model. + - Now we see the machine learning part of our analysis: + - Create new dataframes for our prediction features from our target variable `position_label`. + - Split our dataset into 70% training (and 30% testing), train_size=0.7 with a `random_state` specified to have repeatable results. + - Specify our model is a logistic regression. + - Fit our model. In a logistic regression this means finding the coefficients that will give the least classification error. + - Round our predictions to the nearest integer since logistic regression creates a probability between for each class and calculate a balanced accuracy to account for imbalances in the target variable. + - Right now our model is only in memory, so we need to use our nifty function `save_file` to save our model file to our Snowflake stage. We save our model as a joblib file so Snowpark can easily call this model object back to create predictions. We really don’t need to know much else as a data practitioner unless we want to. 
It’s worth noting that joblib files can’t be queried directly by SQL. To do that, we would need to transform the joblib file into a SQL-queryable format such as JSON or CSV (out of scope for this workshop).
+    - Finally, we want to return our dataframe with a new column indicating which rows were used for training and which will be used for testing.
+5. Viewing our output of this model:
+
+
+
+6. Let’s pop back over to Snowflake and check that our logistic regression model has been stored in our `MODELSTAGE` using the command:
+
+    ```sql
+    list @modelstage
+    ```
+
+
+
+7. To investigate the commands run as part of the `train_test_position` script, navigate to the Snowflake query history under **Activity > Query History**. We can view the portions of the query that we wrote, such as `create or replace stage MODELSTAGE`, but we also see additional queries that Snowflake uses to interpret the Python code.
+
+
+### Predicting on new data
+
+1. Create a new file called `predict_position` and copy and save the following code:
+
+    ```python
+    import logging
+    import joblib
+    import pandas as pd
+    import os
+    from snowflake.snowpark import types as T
+
+    DB_STAGE = 'MODELSTAGE'
+    version = '1.0'
+    # The name of the model file
+    model_file_path = 'driver_position_'+version
+    model_file_packaged = 'driver_position_'+version+'.joblib'
+
+    # This is a local directory, used for storing the various artifacts locally
+    LOCAL_TEMP_DIR = f'/tmp/driver_position'
+    DOWNLOAD_DIR = os.path.join(LOCAL_TEMP_DIR, 'download')
+    TARGET_MODEL_DIR_PATH = os.path.join(LOCAL_TEMP_DIR, 'ml_model')
+    TARGET_LIB_PATH = os.path.join(LOCAL_TEMP_DIR, 'lib')
+
+    # The feature columns that were used during model training
+    # and that will be used during prediction
+    FEATURE_COLS = [
+            "RACE_YEAR"
+            ,"CIRCUIT_NAME"
+            ,"GRID"
+            ,"CONSTRUCTOR_NAME"
+            ,"DRIVER"
+            ,"DRIVERS_AGE_YEARS"
+            ,"DRIVER_CONFIDENCE"
+            ,"CONSTRUCTOR_RELAIBLITY"
+            ,"TOTAL_PIT_STOPS_PER_RACE"]
+
+    def register_udf_for_prediction(p_predictor ,p_session ,p_dbt):
+
+        # The prediction udf
+
+        def predict_position(p_df: T.PandasDataFrame[int, int, int, int,
+                                            int, int, int, int, int]) -> T.PandasSeries[int]:
+            # Snowpark currently does not set the column name in the input dataframe
+            # The default col names are like 0,1,2,... Hence we need to reset the column
+            # names to the features that we initially used for training.
+            p_df.columns = [*FEATURE_COLS]
+
+            # Perform prediction. this returns an array object
+            pred_array = p_predictor.predict(p_df)
+            # Convert to series
+            df_predicted = pd.Series(pred_array)
+            return df_predicted
+
+        # The list of packages that will be used by UDF
+        udf_packages = p_dbt.config.get('packages')
+
+        predict_position_udf = p_session.udf.register(
+            predict_position
+            ,name=f'predict_position'
+            ,packages = udf_packages
+        )
+        return predict_position_udf
+
+    def download_models_and_libs_from_stage(p_session):
+        p_session.file.get(f'@{DB_STAGE}/{model_file_path}/{model_file_packaged}', DOWNLOAD_DIR)
+
+    def load_model(p_session):
+        # Load the model and initialize the predictor
+        model_fl_path = os.path.join(DOWNLOAD_DIR, model_file_packaged)
+        predictor = joblib.load(model_fl_path)
+        return predictor
+
+    # -------------------------------
+    def model(dbt, session):
+        dbt.config(
+            packages = ['snowflake-snowpark-python' ,'scipy','scikit-learn' ,'pandas' ,'numpy'],
+            materialized = "table",
+            tags = "predict"
+        )
+        session._use_scoped_temp_objects = False
+        download_models_and_libs_from_stage(session)
+        predictor = load_model(session)
+        predict_position_udf = register_udf_for_prediction(predictor, session ,dbt)
+
+        # Retrieve the data, and perform the prediction
+        hold_out_df = (dbt.ref("hold_out_dataset_for_prediction")
+                            .select(*FEATURE_COLS)
+                        )
+
+        # Perform prediction.
+        new_predictions_df = hold_out_df.withColumn("position_predicted"
+                                                    ,predict_position_udf(*FEATURE_COLS)
+                                                    )
+
+        return new_predictions_df
+    ```
+
+2. Execute the following in the command bar:
+
+    ```bash
+    dbt run --select predict_position
+    ```
+
+3. **Commit and push** our changes to keep saving our work as we go using the commit message `logistic regression model training and application` before moving on.
+4. At a high level in this script, we are:
+    - Retrieving our staged logistic regression model
+    - Loading the model in
+    - Placing the model within a user defined function (UDF) to make inline predictions of our drivers’ positions
+5. At a more detailed level:
+    - Import our libraries.
+    - Create variables to reference back to the `MODELSTAGE` we just created and stored our model to.
+    - The temporary file paths we created might look intimidating, but all we’re doing here is programmatically using an initial file path and adding to it to create the following directories:
+        - LOCAL_TEMP_DIR ➡️ /tmp/driver_position
+        - DOWNLOAD_DIR ➡️ /tmp/driver_position/download
+        - TARGET_MODEL_DIR_PATH ➡️ /tmp/driver_position/ml_model
+        - TARGET_LIB_PATH ➡️ /tmp/driver_position/lib
+    - Provide a list of the feature columns that we used for model training and that will now be used on new data for prediction.
+    - Next, we are creating our main function `register_udf_for_prediction(p_predictor ,p_session ,p_dbt):`. This function is used to register a user-defined function (UDF) that performs the machine learning prediction. It takes three parameters: `p_predictor` is an instance of the machine learning model, `p_session` is an instance of the Snowflake session, and `p_dbt` is an instance of the dbt library. The function creates a UDF named `predict_position`, which takes a pandas dataframe with the input features and returns a pandas series with the predictions.
+    - ⚠️ Pay close attention to the whitespace here. We are using a function within a function for this script.
+ - We have 2 simple functions that are programmatically retrieving our file paths to first get our stored model out of our `MODELSTAGE` and downloaded into the session `download_models_and_libs_from_stage` and then to load the contents of our model in (parameters) in `load_model` to use for prediction. + - Take the model we loaded in and call it `predictor` and wrap it in a UDF. + - Return our dataframe with both the features used to predict and the new label. + +🧠 Another way to read this script is from the bottom up. This can help us progressively see what is going into our final dbt model and work backwards to see how the other functions are being referenced. + +6. Let’s take a look at our predicted position alongside our feature variables. Open a new scratchpad and use the following query. I chose to order by the prediction of who would obtain a podium position: + + ```sql + select * from {{ ref('predict_position') }} order by position_predicted + ``` + +7. We can see that we created predictions in our final dataset, we are ready to move on to testing! + +## Test your data models + +We have now completed building all the models for today’s lab, but how do we know if they meet our assertions? Put another way, how do we know the quality of our data models are any good? This brings us to testing! + +We test data models for mainly two reasons: + +- Ensure that our source data is clean on ingestion before we start data modeling/transformation (aka avoid garbage in, garbage out problem). +- Make sure we don’t introduce bugs in the transformation code we wrote (stop ourselves from creating bad joins/fanouts). + +Testing in dbt comes in two flavors: [generic](/docs/build/tests#generic-tests) and [singular](/docs/build/tests#singular-tests). + +You define them in a test block (similar to a macro) and once defined, you can reference them by name in your `.yml` files (applying them to models, columns, sources, snapshots, and seeds). + +You might be wondering: *what about testing Python models?* + +Since the output of our Python models are tables, we can test SQL and Python models the same way! We don’t have to worry about any syntax differences when testing SQL versus Python data models. This means we use `.yml` and `.sql` files to test our entities (tables, views, etc.). Under the hood, dbt is running an SQL query on our tables to see if they meet assertions. If no rows are returned, dbt will surface a passed test. Conversely, if a test results in returned rows, it will fail or warn depending on the configuration (more on that later). + +### Generic tests + +1. To implement generic out-of-the-box tests dbt comes with, we can use YAML files to specify information about our models. To add generic tests to our aggregates model, create a file called `aggregates.yml`, copy the code block below into the file, and save. + + + ```yaml + version: 2 + + models: + - name: fastest_pit_stops_by_constructor + description: Use the python .describe() method to retrieve summary statistics table about pit stops by constructor. Sort by average stop time ascending so the first row returns the fastest constructor. + columns: + - name: constructor_name + description: team that makes the car + tests: + - unique + + - name: lap_times_moving_avg + description: Use the python .rolling() method to calculate the 5 year rolling average of pit stop times alongside the average for each year. 
+ columns: + - name: race_year + description: year of the race + tests: + - relationships: + to: ref('int_lap_times_years') + field: race_year + ``` + +2. Let’s unpack the code we have here. We have both our aggregates models with the model name to know the object we are referencing and the description of the model that we’ll populate in our documentation. At the column level (a level below our model), we are providing the column name followed by our tests. We want to ensure our `constructor_name` is unique since we used a pandas `groupby` on `constructor_name` in the model `fastest_pit_stops_by_constructor`. Next, we want to ensure our `race_year` has referential integrity from the model we selected from `int_lap_times_years` into our subsequent `lap_times_moving_avg` model. +3. Finally, if we want to see how tests were deployed on sources and SQL models, we can look at other files in our project such as the `f1_sources.yml` we created in our Sources and staging section. + +### Using macros for testing + +1. Under your `macros` folder, create a new file and name it `test_all_values_gte_zero.sql`. Copy the code block below and save the file. For clarity, “gte” is an abbreviation for greater than or equal to. + + + ```sql + {% macro test_all_values_gte_zero(table, column) %} + + select * from {{ ref(table) }} where {{ column }} < 0 + + {% endmacro %} + ``` + +2. Macros in Jinja are pieces of code that can be reused multiple times in our SQL models — they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models. +3. We use the `{% macro %}` to indicate the start of the macro and `{% endmacro %}` for the end. The text after the beginning of the macro block is the name we are giving the macro to later call it. In this case, our macro is called `test_all_values_gte_zero`. Macros take in *arguments* to pass through, in this case the `table` and the `column`. In the body of the macro, we see an SQL statement that is using the `ref` function to dynamically select the table and then the column. You can always view macros without having to run them by using `dbt run-operation`. You can learn more [here](https://docs.getdbt.com/reference/commands/run-operation). +4. Great, now we want to reference this macro as a test! Let’s create a new test file called `macro_pit_stops_mean_is_positive.sql` in our `tests` folder. + + + +5. Copy the following code into the file and save: + + ```sql + {{ + config( + enabled=true, + severity='warn', + tags = ['bi'] + ) + }} + + {{ test_all_values_gte_zero('fastest_pit_stops_by_constructor', 'mean') }} + ``` + +6. In our testing file, we are applying some configurations to the test including `enabled`, which is an optional configuration for disabling models, seeds, snapshots, and tests. Our severity is set to `warn` instead of `error`, which means our pipeline will still continue to run. We have tagged our test with `bi` since we are applying this test to one of our bi models. + +Then, in our final line, we are calling the `test_all_values_gte_zero` macro that takes in our table and column arguments and inputting our table `'fastest_pit_stops_by_constructor'` and the column `'mean'`. + +### Custom singular tests to validate Python models + +The simplest way to define a test is by writing the exact SQL that will return failing records. We call these "singular" tests, because they're one-off assertions usable for a single purpose. 
+ +These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your test-paths config). You can use Jinja in SQL models (including ref and source) in the test definition, just like you can when creating models. Each `.sql` file contains one select statement, and it defines one test. + +Let’s add a custom test that asserts that the moving average of the lap time over the last 5 years is greater than zero (it’s impossible to have time less than 0!). If this is not the case, it is reasonable to assume the data has been corrupted. + +1. Create a file `lap_times_moving_avg_assert_positive_or_null.sql` under the `tests` folder. + + +2. Copy the following code and save the file: + + ```sql + {{ + config( + enabled=true, + severity='error', + tags = ['bi'] + ) + }} + + with lap_times_moving_avg as ( select * from {{ ref('lap_times_moving_avg') }} ) + + select * + from lap_times_moving_avg + where lap_moving_avg_5_years < 0 and lap_moving_avg_5_years is not null + ``` + +### Putting all our tests together + +1. Time to run our tests! Altogether, we have created 4 tests for our 2 Python models: + - `fastest_pit_stops_by_constructor` + - Unique `constructor_name` + - Mean pit stop times are greater than or equal to 0 (no negative time values) + - `lap_times_moving_avg` + - Referential test on `race_year` + - Lap times are greater than 0 or null (to allow for the first leading values in a rolling calculation) +2. To run the tests on both our models, we can use this syntax in the command line to run them both at once, similar to how we did our data splits earlier. + Execute the following in the command bar: + + ```bash + dbt test --select fastest_pit_stops_by_constructor lap_times_moving_avg + ``` + + + +3. All 4 of our tests passed (yay for clean data)! To understand the SQL being run against each of our tables, we can click into the details of the test. +4. Navigating into the **Details** of the `unique_fastest_pit_stops_by_constructor_name` test, we can see that each `constructor_name` should only have one row. + + +## Document your dbt project + +When it comes to documentation, dbt brings together both column and model level descriptions that you can provide as well as details from your Snowflake information schema in a static site for consumption by other data team members and stakeholders. + +We are going to revisit 2 areas of our project to understand our documentation: + +- `intermediate.md` file +- `dbt_project.yml` file + +To start, let’s look back at our `intermediate.md` file. We can see that we provided multi-line descriptions for the models in our intermediate models using [docs blocks](/docs/collaborate/documentation#using-docs-blocks). Then we reference these docs blocks in our `.yml` file. Building descriptions with doc blocks in Markdown files gives you the ability to format your descriptions with Markdown and is particularly helpful when building long descriptions, either at the column or model level. In our `dbt_project.yml`, we added `node_colors` at folder levels. + +1. To see all these pieces come together, execute this in the command bar: + + ```bash + dbt docs generate + ``` + + This will generate the documentation for your project. Click the book button, as shown in the screenshot below, to access the docs. + + +2. Go to our project area and view `int_results`. View the description that we created in our doc block. + + +3. View the mini-lineage that looks at the model we are currently selected on (`int_results` in this case). + + +4. 
In our `dbt_project.yml`, we configured `node_colors` depending on the file directory. Starting in dbt v1.3, we can see how our lineage in our docs looks. By color coding your project, it can help you cluster together similar models or steps and more easily troubleshoot. + + + +## Deploy your code + +Before we jump into deploying our code, let's have a quick primer on environments. Up to this point, all of the work we've done in the dbt Cloud IDE has been in our development environment, with code committed to a feature branch and the models we've built created in our development schema in Snowflake as defined in our Development environment connection. Doing this work on a feature branch, allows us to separate our code from what other coworkers are building and code that is already deemed production ready. Building models in a development schema in Snowflake allows us to separate the database objects we might still be modifying and testing from the database objects running production dashboards or other downstream dependencies. Together, the combination of a Git branch and Snowflake database objects form our environment. + +Now that we've completed testing and documenting our work, we're ready to deploy our code from our development environment to our production environment and this involves two steps: + +- Promoting code from our feature branch to the production branch in our repository. + - Generally, the production branch is going to be named your main branch and there's a review process to go through before merging code to the main branch of a repository. Here we are going to merge without review for ease of this workshop. +- Deploying code to our production environment. + - Once our code is merged to the main branch, we'll need to run dbt in our production environment to build all of our models and run all of our tests. This will allow us to build production-ready objects into our production environment in Snowflake. Luckily for us, the Partner Connect flow has already created our deployment environment and job to facilitate this step. + +1. Before getting started, let's make sure that we've committed all of our work to our feature branch. If you still have work to commit, you'll be able to select the **Commit and push**, provide a message, and then select **Commit** again. +2. Once all of your work is committed, the git workflow button will now appear as **Merge to main**. Select **Merge to main** and the merge process will automatically run in the background. + + +3. When it's completed, you should see the git button read **Create branch** and the branch you're currently looking at will become **main**. +4. Now that all of our development work has been merged to the main branch, we can build our deployment job. Given that our production environment and production job were created automatically for us through Partner Connect, all we need to do here is update some default configurations to meet our needs. +5. In the menu, select **Deploy** **> Environments** + + +6. You should see two environments listed and you'll want to select the **Deployment** environment then **Settings** to modify it. +7. Before making any changes, let's touch on what is defined within this environment. The Snowflake connection shows the credentials that dbt Cloud is using for this environment and in our case they are the same as what was created for us through Partner Connect. Our deployment job will build in our `PC_DBT_DB` database and use the default Partner Connect role and warehouse to do so. 
The deployment credentials section also uses the info that was created in our Partner Connect job to create the credential connection. However, it is using the same default schema that we've been using as the schema for our development environment. +8. Let's update the schema to create a new schema specifically for our production environment. Click **Edit** to allow you to modify the existing field values. Navigate to **Deployment Credentials >** **schema.** +9. Update the schema name to **production**. Remember to select **Save** after you've made the change. + +10. By updating the schema for our production environment to **production**, it ensures that our deployment job for this environment will build our dbt models in the **production** schema within the `PC_DBT_DB` database as defined in the Snowflake Connection section. +11. Now let's switch over to our production job. Click on the deploy tab again and then select **Jobs**. You should see an existing and preconfigured **Partner Connect Trial Job**. Similar to the environment, click on the job, then select **Settings** to modify it. Let's take a look at the job to understand it before making changes. + + - The Environment section is what connects this job with the environment we want it to run in. This job is already defaulted to use the Deployment environment that we just updated and the rest of the settings we can keep as is. + - The Execution settings section gives us the option to generate docs, run source freshness, and defer to a previous run state. For the purposes of our lab, we're going to keep these settings as is as well and stick with just generating docs. + - The Commands section is where we specify exactly which commands we want to run during this job, and we also want to keep this as is. We want our seed to be uploaded first, then run our models, and finally test them. The order of this is important as well, considering that we need our seed to be created before we can run our incremental model, and we need our models to be created before we can test them. + - Finally, we have the Triggers section, where we have a number of different options for scheduling our job. Given that our data isn't updating regularly here and we're running this job manually for now, we're also going to leave this section alone. + + So, what are we changing then? Just the name! Click **Edit** to allow you to make changes. Then update the name of the job to **Production Job** to denote this as our production deployment job. After that's done, click **Save**. +12. Now let's go to run our job. Clicking on the job name in the path at the top of the screen will take you back to the job run history page where you'll be able to click **Run run** to kick off the job. If you encounter any job failures, try running the job again before further troubleshooting. + + + +13. Let's go over to Snowflake to confirm that everything built as expected in our production schema. Refresh the database objects in your Snowflake account and you should see the production schema now within our default Partner Connect database. If you click into the schema and everything ran successfully, you should be able to see all of the models we developed. + + +### Conclusion + +Fantastic! You’ve finished the workshop! We hope you feel empowered in using both SQL and Python in your dbt Cloud workflows with Snowflake. Having a reliable pipeline to surface both analytics and machine learning is crucial to creating tangible business value from your data. 
+ +For more help and information join our [dbt community Slack](https://www.getdbt.com/community/) which contains more than 50,000 data practitioners today. We have a dedicated slack channel #db-snowflake to Snowflake related content. Happy dbt'ing! diff --git a/website/docs/guides/best-practices/debugging-errors.md b/website/docs/guides/debug-errors.md similarity index 90% rename from website/docs/guides/best-practices/debugging-errors.md rename to website/docs/guides/debug-errors.md index 39670820ddd..febfb6ac422 100644 --- a/website/docs/guides/best-practices/debugging-errors.md +++ b/website/docs/guides/debug-errors.md @@ -1,13 +1,18 @@ --- -title: "Debugging errors" -id: "debugging-errors" +title: "Debug errors" +id: "debug-errors" description: Learn about errors and the art of debugging them. displayText: Debugging errors hoverSnippet: Learn about errors and the art of debugging those errors. +icon: 'guides' +hide_table_of_contents: true +tags: ['Troubleshooting', 'dbt Core', 'dbt Cloud'] +level: 'Beginner' +recently_updated: true --- - ## General process of debugging + Learning how to debug is a skill, and one that will make you great at your role! 1. Read the error message — when writing the code behind dbt, we try our best to make error messages as useful as we can. The error message dbt produces will normally contain the type of error (more on these error types below), and the file where the error occurred. 2. Inspect the file that was known to cause the issue, and see if there's an immediate fix. @@ -17,7 +22,7 @@ Learning how to debug is a skill, and one that will make you great at your role! - The `target/run` directory contains the SQL dbt executes to build your models. - The `logs/dbt.log` file contains all the queries that dbt runs, and additional logging. Recent errors will be at the bottom of the file. - **dbt Cloud users**: Use the above, or the `Details` tab in the command output. - - **dbt CLI users**: Note that your code editor _may_ be hiding these files from the tree [VSCode help](https://stackoverflow.com/questions/42891463/how-can-i-show-ignored-files-in-visual-studio-code)). + - **dbt Core users**: Note that your code editor _may_ be hiding these files from the tree [VSCode help](https://stackoverflow.com/questions/42891463/how-can-i-show-ignored-files-in-visual-studio-code)). 5. If you are really stuck, try [asking for help](/community/resources/getting-help). Before doing so, take the time to write your question well so that others can diagnose the problem quickly. @@ -184,7 +189,7 @@ hello: world # this is not allowed ## Compilation Errors -_Note: if you're using the dbt Cloud IDE to work on your dbt project, this error often shows as a red bar in your command prompt as you work on your dbt project. For dbt CLI users, these won't get picked up until you run `dbt run` or `dbt compile`._ +_Note: if you're using the dbt Cloud IDE to work on your dbt project, this error often shows as a red bar in your command prompt as you work on your dbt project. 
For dbt Core users, these won't get picked up until you run `dbt run` or `dbt compile`._ ### Invalid `ref` function @@ -228,7 +233,7 @@ To fix this: - Use the error message to find your mistake To prevent this: -- _(dbt CLI users only)_ Use snippets to auto-complete pieces of Jinja ([atom-dbt package](https://github.com/dbt-labs/atom-dbt), [vscode-dbt extestion](https://marketplace.visualstudio.com/items?itemName=bastienboutonnet.vscode-dbt)) +- _(dbt Core users only)_ Use snippets to auto-complete pieces of Jinja ([atom-dbt package](https://github.com/dbt-labs/atom-dbt), [vscode-dbt extestion](https://marketplace.visualstudio.com/items?itemName=bastienboutonnet.vscode-dbt)) @@ -280,7 +285,7 @@ To fix this: - Find the mistake and fix it To prevent this: -- (dbt CLI users) Turn on indentation guides in your code editor to help you inspect your files +- (dbt Core users) Turn on indentation guides in your code editor to help you inspect your files - Use a YAML validator ([example](http://www.yamllint.com/)) to debug any issues @@ -341,10 +346,10 @@ Database Error in model customers (models/customers.sql) 90% of the time, there's a mistake in the SQL of your model. To fix this: 1. Open the offending file: - **dbt Cloud:** Open the model (in this case `models/customers.sql` as per the error message) - - **dbt CLI:** Open the model as above. Also open the compiled SQL (in this case `target/run/jaffle_shop/models/customers.sql` as per the error message) — it can be useful to show these side-by-side in your code editor. + - **dbt Core:** Open the model as above. Also open the compiled SQL (in this case `target/run/jaffle_shop/models/customers.sql` as per the error message) — it can be useful to show these side-by-side in your code editor. 2. Try to re-execute the SQL to isolate the error: - **dbt Cloud:** Use the `Preview` button from the model file - - **dbt CLI:** Copy and paste the compiled query into a query runner (e.g. the Snowflake UI, or a desktop app like DataGrip / TablePlus) and execute it + - **dbt Core:** Copy and paste the compiled query into a query runner (e.g. the Snowflake UI, or a desktop app like DataGrip / TablePlus) and execute it 3. Fix the mistake. 4. Rerun the failed model. @@ -356,7 +361,7 @@ In some cases, these errors might occur as a result of queries that dbt runs "be In these cases, you should check out the logs — this contains _all_ the queries dbt has run. - **dbt Cloud**: Use the `Details` in the command output to see logs, or check the `logs/dbt.log` file -- **dbt CLI**: Open the `logs/dbt.log` file. +- **dbt Core**: Open the `logs/dbt.log` file. :::tip Isolating errors in the logs If you're hitting a strange `Database Error`, it can be a good idea to clean out your logs by opening the file, and deleting the contents. Then, re-execute `dbt run` for _just_ the problematic model. The logs will _just_ have the output you're looking for. @@ -379,6 +384,6 @@ Using the `Preview` button is useful when developing models and you want to visu We’ve all been there. dbt uses the last-saved version of a file when you execute a command. In most code editors, and in the dbt Cloud IDE, a dot next to a filename indicates that a file has unsaved changes. Make sure you hit `cmd + s` (or equivalent) before running any dbt commands — over time it becomes muscle memory. 
### Editing compiled files -_(More likely for dbt CLI users)_ +_(More likely for dbt Core users)_ If you just opened a SQL file in the `target/` directory to help debug an issue, it's not uncommon to accidentally edit that file! To avoid this, try changing your code editor settings to grey out any files in the `target/` directory — the visual cue will help avoid the issue. diff --git a/website/docs/guides/legacy/debugging-schema-names.md b/website/docs/guides/debug-schema-names.md similarity index 81% rename from website/docs/guides/legacy/debugging-schema-names.md rename to website/docs/guides/debug-schema-names.md index 6c869b5f8af..c7bf1a195b1 100644 --- a/website/docs/guides/legacy/debugging-schema-names.md +++ b/website/docs/guides/debug-schema-names.md @@ -1,7 +1,19 @@ --- -title: Debugging schema names +title: Debug schema names +id: debug-schema-names +description: Learn how to debug schema names when models build under unexpected schemas. +displayText: Debug schema names +hoverSnippet: Learn how to debug schema names in dbt. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['dbt Core','Troubleshooting'] +level: 'Advanced' +recently_updated: true --- +## Introduction + If a model uses the [`schema` config](/reference/resource-properties/schema) but builds under an unexpected schema, here are some steps for debugging the issue. :::info @@ -12,10 +24,10 @@ You can also follow along via this video: -### 1. Search for a macro named `generate_schema_name` +## Search for a macro named `generate_schema_name` Do a file search to check if you have a macro named `generate_schema_name` in the `macros` directory of your project. -#### I do not have a macro named `generate_schema_name` in my project +### You do not have a macro named `generate_schema_name` in your project This means that you are using dbt's default implementation of the macro, as defined [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47C1-L60) ```sql @@ -37,15 +49,14 @@ This means that you are using dbt's default implementation of the macro, as defi Note that this logic is designed so that two dbt users won't accidentally overwrite each other's work by writing to the same schema. -#### I have a `generate_schema_name` macro in my project that calls another macro +### You have a `generate_schema_name` macro in a project that calls another macro If your `generate_schema_name` macro looks like so: ```sql {% macro generate_schema_name(custom_schema_name, node) -%} {{ generate_schema_name_for_env(custom_schema_name, node) }} {%- endmacro %} ``` -Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/include/global_project/macros/etc/get_custom_schema.sql#L43-L56). - +Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47-L60). 
```sql {% macro generate_schema_name_for_env(custom_schema_name, node) -%} @@ -62,22 +73,22 @@ Your project is switching out the `generate_schema_name` macro for another macro {%- endmacro %} ``` -#### I have a `generate_schema_name` macro with custom logic +### You have a `generate_schema_name` macro with custom logic If this is the case — it might be a great idea to reach out to the person who added this macro to your project, as they will have context here — you can use [GitHub's blame feature](https://docs.github.com/en/free-pro-team@latest/github/managing-files-in-a-repository/tracking-changes-in-a-file) to do this. In all cases take a moment to read through the Jinja to see if you can follow the logic. -### 2. Confirm your `schema` config +## Confirm your `schema` config Check if you are using the [`schema` config](/reference/resource-properties/schema) in your model, either via a `{{ config() }}` block, or from `dbt_project.yml`. In both cases, dbt passes this value as the `custom_schema_name` parameter of the `generate_schema_name` macro. -### 3. Confirm your target values +## Confirm your target values Most `generate_schema_name` macros incorporate logic from the [`target` variable](/reference/dbt-jinja-functions/target), in particular `target.schema` and `target.name`. Use the docs [here](/reference/dbt-jinja-functions/target) to help you find the values of each key in this dictionary. -### 4. Put the two together +## Put the two together Now, re-read through the logic of your `generate_schema_name` macro, and mentally plug in your `customer_schema_name` and `target` values. @@ -87,7 +98,7 @@ You should find that the schema dbt is constructing for your model matches the o Note that snapshots do not follow this behavior, check out the docs on [target_schema](/reference/resource-configs/target_schema) instead. ::: -### 5. Adjust as necessary +## Adjust as necessary Now that you understand how a model's schema is being generated, you can adjust as necessary: - You can adjust the logic in your `generate_schema_name` macro (or add this macro to your project if you don't yet have one and adjust from there) diff --git a/website/docs/guides/dremio-lakehouse.md b/website/docs/guides/dremio-lakehouse.md new file mode 100644 index 00000000000..1c59c04d175 --- /dev/null +++ b/website/docs/guides/dremio-lakehouse.md @@ -0,0 +1,196 @@ +--- +title: Build a data lakehouse with dbt Core and Dremio Cloud +id: build-dremio-lakehouse +description: Learn how to build a data lakehouse with dbt Core and Dremio Cloud. +displayText: Build a data lakehouse with dbt Core and Dremio Cloud +hoverSnippet: Learn how to build a data lakehouse with dbt Core and Dremio Cloud +# time_to_complete: '30 minutes' commenting out until we test +platform: 'dbt-core' +icon: 'guides' +hide_table_of_contents: true +tags: ['Dremio', 'dbt Core'] +level: 'Intermediate' +recently_updated: true +--- +## Introduction + +This guide will demonstrate how to build a data lakehouse with dbt Core 1.5 or new and Dremio Cloud. You can simplify and optimize your data infrastructure with dbt's robust transformation framework and Dremio’s open and easy data lakehouse. The integrated solution empowers companies to establish a strong data and analytics foundation, fostering self-service analytics and enhancing business insights while simplifying operations by eliminating the necessity to write complex Extract, Transform, and Load (ETL) pipelines. + +### Prerequisites + +* You must have a [Dremio Cloud](https://docs.dremio.com/cloud/) account. 
+* You must have Python 3 installed. +* You must have dbt Core v1.5 or newer [installed](/docs/core/installation). +* You must have the Dremio adapter 1.5.0 or newer [installed and configured](/docs/core/connect-data-platform/dremio-setup) for Dremio Cloud. +* You must have basic working knowledge of Git and the command line interface (CLI). + +## Validate your environment + +Validate your environment by running the following commands in your CLI and verifying the results: + +```shell + +$ python3 --version +Python 3.11.4 # Must be Python 3 + +``` + +```shell + +$ dbt --version +Core: + - installed: 1.5.0 # Must be 1.5 or newer + - latest: 1.6.3 - Update available! + + Your version of dbt-core is out of date! + You can find instructions for upgrading here: + https://docs.getdbt.com/docs/installation + +Plugins: + - dremio: 1.5.0 - Up to date! # Must be 1.5 or newer + +``` + +## Getting started + +1. Clone the Dremio dbt Core sample project from the [GitHub repo](https://github.com/dremio-brock/DremioDBTSample/tree/master/dremioSamples). + +2. In your integrated development environment (IDE), open the relation.py file in the Dremio adapter directory: + `$HOME/Library/Python/3.9/lib/python/site-packages/dbt/adapters/dremio/relation.py` + +3. Find and update lines 51 and 52 to match the following syntax: + +```python + +PATTERN = re.compile(r"""((?:[^."']|"[^"]*"|'[^']*')+)""") +return ".".join(PATTERN.split(identifier)[1::2]) + +``` + +The complete selection should look like this: + +```python +def quoted_by_component(self, identifier, componentName): + if componentName == ComponentName.Schema: + PATTERN = re.compile(r"""((?:[^."']|"[^"]*"|'[^']*')+)""") + return ".".join(PATTERN.split(identifier)[1::2]) + else: + return self.quoted(identifier) + +``` + +You need to update this pattern because the plugin doesn’t support schema names in Dremio containing dots and spaces. + +## Build your pipeline + +1. Create a `profiles.yml` file in the `$HOME/.dbt/profiles.yml` path and add the following configs: + +```yaml + +dremioSamples: + outputs: + cloud_dev: + dremio_space: dev + dremio_space_folder: no_schema + object_storage_path: dev + object_storage_source: $scratch + pat: + cloud_host: api.dremio.cloud + cloud_project_id: + threads: 1 + type: dremio + use_ssl: true + user: + target: dev + + ``` + + 2. Execute the transformation pipeline: + + ```shell + + $ dbt run -t cloud_dev + + ``` + + If the above configurations have been implemented, the output will look something like this: + +```shell + +17:24:16 Running with dbt=1.5.0 +17:24:17 Found 5 models, 0 tests, 0 snapshots, 0 analyses, 348 macros, 0 operations, 0 seed files, 2 sources, 0 exposures, 0 metrics, 0 groups +17:24:17 +17:24:29 Concurrency: 1 threads (target='cloud_dev') +17:24:29 +17:24:29 1 of 5 START sql view model Preparation.trips .................................. [RUN] +17:24:31 1 of 5 OK created sql view model Preparation. trips ............................. [OK in 2.61s] +17:24:31 2 of 5 START sql view model Preparation.weather ................................ [RUN] +17:24:34 2 of 5 OK created sql view model Preparation.weather ........................... [OK in 2.15s] +17:24:34 3 of 5 START sql view model Business.Transportation.nyc_trips .................. [RUN] +17:24:36 3 of 5 OK created sql view model Business.Transportation.nyc_trips ............. [OK in 2.18s] +17:24:36 4 of 5 START sql view model Business.Weather.nyc_weather ....................... 
[RUN] +17:24:38 4 of 5 OK created sql view model Business.Weather.nyc_weather .................. [OK in 2.09s] +17:24:38 5 of 5 START sql view model Application.nyc_trips_with_weather ................. [RUN] +17:24:41 5 of 5 OK created sql view model Application.nyc_trips_with_weather ............ [OK in 2.74s] +17:24:41 +17:24:41 Finished running 5 view models in 0 hours 0 minutes and 24.03 seconds (24.03s). +17:24:41 +17:24:41 Completed successfully +17:24:41 +17:24:41 Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5 + +``` + +Now that you have a running environment and a completed job, you can view the data in Dremio and expand your code. This is a snapshot of the project structure in an IDE: + + + +## About the schema.yml + +The `schema.yml` file defines Dremio sources and models to be used and what data models are in scope. In this guide's sample project, there are two data sources: + +1. The `NYC-weather.csv` stored in the **Samples** database and +2. The `sample_data` from the **Samples** database. + +The models correspond to the weather and trip data respectively and will be joined for analysis. + +The sources can be found by navigating to the **Object Storage** section of the Dremio Cloud UI. + + + +## About the models + +**Preparation** — `preparation_trips.sql` and `preparation_weather.sql` are building views on top of the trips and weather data. + +**Business** — `business_transportation_nyc_trips.sql` applies some level of transformation on the `preparation_trips.sql` view. `Business_weather_nyc.sql` applies no transformation to the `preparation_weather.sql` view. + +**Application** — `application_nyc_trips_with_weather.sql` joins the output from the Business models. This is what your business users will consume. + +## The Job output + +When you run the dbt job, it will create a **dev** space folder that contains all the data assets created. This is what you will see in the Dremio Cloud UI. Spaces in Dremio are a way to organize data assets that map to business units or data products. + + + +Open the **Application** folder and you will see the output of the simple transformation we did using dbt. + + + +## Query the data + +Now that you have run the job and completed the transformation, it's time to query your data. Click on the `nyc_trips_with_weather` view. That will take you to the SQL Runner page. Click **Show SQL Pane** in the upper right corner of the page. + +Run the following query: + +```sql + +SELECT vendor_id, + AVG(tip_amount) +FROM dev.application."nyc_trips_with_weather" +GROUP BY vendor_id + +``` + + + +This completes the integration setup, and the data is ready for business consumption. 
\ No newline at end of file diff --git a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md b/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md similarity index 94% rename from website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md rename to website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md index 692106655ac..30221332355 100644 --- a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md +++ b/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md @@ -4,19 +4,27 @@ id: how-to-use-databricks-workflows-to-run-dbt-cloud-jobs description: Learn how to use Databricks workflows to run dbt Cloud jobs displayText: "Use Databricks workflows to run dbt Cloud jobs" hoverSnippet: Learn how to use Databricks workflows to run dbt Cloud jobs +# time_to_complete: '30 minutes' commenting out until we test +icon: 'databricks' +hide_table_of_contents: true +tags: ['Databricks', 'dbt Core','dbt Cloud','Orchestration'] +level: 'Intermediate' +recently_updated: true --- +## Introduction + Using Databricks workflows to call the dbt Cloud job API can be useful for several reasons: 1. **Integration with other ETL processes** — If you're already running other ETL processes in Databricks, you can use a Databricks workflow to trigger a dbt Cloud job after those processes are done. -2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/dbt-cloud-job). +2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/deploy-jobs). 3. [**Separation of concerns —**](https://en.wikipedia.org/wiki/Separation_of_concerns) Detailed logs for dbt jobs in the dbt Cloud environment can lead to more modularity and efficient debugging. By doing so, it becomes easier to isolate bugs quickly while still being able to see the overall status in Databricks. 4. **Custom job triggering —** Use a Databricks workflow to trigger dbt Cloud jobs based on custom conditions or logic that aren't natively supported by dbt Cloud's scheduling feature. This can give you more flexibility in terms of when and how your dbt Cloud jobs run. -## Prerequisites +### Prerequisites - Active [Teams or Enterprise dbt Cloud account](https://www.getdbt.com/pricing/) -- You must have a configured and existing [dbt Cloud job](/docs/deploy/dbt-cloud-job) +- You must have a configured and existing [dbt Cloud deploy job](/docs/deploy/deploy-jobs) - Active Databricks account with access to [Data Science and Engineering workspace](https://docs.databricks.com/workspace-index.html) and [Manage secrets](https://docs.databricks.com/security/secrets/index.html) - [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) - **Note**: You only need to set up your authentication. Once you have set up your Host and Token and are able to run `databricks workspace ls /Users/`, you can proceed with the rest of this guide. 
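If you haven't configured CLI authentication yet, a minimal setup could look like the following. This is a sketch that assumes the legacy `databricks configure --token` flow; newer versions of the Databricks CLI may use a different authentication command.

```shell
# Prompts for your Databricks workspace host URL and a personal access token
# (assumption: legacy Databricks CLI token-based flow)
databricks configure --token

# Verify that authentication works before continuing with this guide
databricks workspace ls /Users/
```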
@@ -29,7 +37,7 @@ To use Databricks workflows for running dbt Cloud jobs, you need to perform the - [Create a Databricks Python notebook](#create-a-databricks-python-notebook) - [Configure the workflows to run the dbt Cloud jobs](#configure-the-workflows-to-run-the-dbt-cloud-jobs) -### Set up a Databricks secret scope +## Set up a Databricks secret scope 1. Retrieve **[User API Token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens#user-api-tokens) **or **[Service Account Token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#generating-service-account-tokens) **from dbt Cloud 2. Set up a **Databricks secret scope**, which is used to securely store your dbt Cloud API key. @@ -47,7 +55,7 @@ databricks secrets put --scope --key --s 5. Replace **``** with the actual API key value that you copied from dbt Cloud in step 1. -### Create a Databricks Python notebook +## Create a Databricks Python notebook 1. [Create a **Databricks Python notebook**](https://docs.databricks.com/notebooks/notebooks-manage.html), which executes a Python script that calls the dbt Cloud job API. @@ -165,7 +173,7 @@ DbtJobRunStatus.SUCCESS You can cancel the job from dbt Cloud if necessary. ::: -### Configure the workflows to run the dbt Cloud jobs +## Configure the workflows to run the dbt Cloud jobs You can set up workflows directly from the notebook OR by adding this notebook to one of your existing workflows: @@ -206,6 +214,4 @@ You can set up workflows directly from the notebook OR by adding this notebook t Multiple Workflow tasks can be set up using the same notebook by configuring the `job_id` parameter to point to different dbt Cloud jobs. -## Closing - Using Databricks workflows to access the dbt Cloud job API can improve integration of your data pipeline processes and enable scheduling of more complex workflows. diff --git a/website/docs/guides/legacy/creating-date-partitioned-tables.md b/website/docs/guides/legacy/creating-date-partitioned-tables.md deleted file mode 100644 index 8c461dbe4a8..00000000000 --- a/website/docs/guides/legacy/creating-date-partitioned-tables.md +++ /dev/null @@ -1,117 +0,0 @@ ---- -title: "BigQuery: Creating date-partitioned tables" -id: "creating-date-partitioned-tables" ---- - - -:::caution Deprecated - -The functionality described below was introduced in dbt Core v0.10 (March 2018). In v1.0 (December 2021), it was deprecated in favor of [column-based partitioning](/reference/resource-configs/bigquery-configs#partition-clause) and [incremental modeling](/docs/build/incremental-models). - -::: - -dbt supports the creation of [date partitioned tables](https://cloud.google.com/bigquery/docs/partitioned-tables) in BigQuery. - -To configure a dbt model as a date partitioned , use the `materialized='table'` model configuration in conjunction with a list of `partitions`. dbt will execute your model query once for each specified partition. For example: - - - -```sql -{{ - config( - materialized='table', - partitions=[20180101, 20180102], - verbose=True - ) -}} - -/* - -dbt will interpolate each `partition` wherever it finds [DBT__PARTITION_DATE] -in your model code. This model will create a single table with two partitions: - 1. 20180101 - 2. 20180102 - -These partitions will be created by running the following query against -each of the following date-sharded tables: - - 1. `snowplow`.`events_20180101` - 2. 
`snowplow`.`events_20180102` - -*/ - -select * -from `snowplow`.`events_[DBT__PARTITION_DATE]` -``` - - - -To make this model more dynamic, we can use the `dbt.partition_range` macro to generate a list of 8-digit dates in a specified range. Further, dbt provides a handy macro, `date_sharded_table`, for getting a date-sharded by its prefix for a given date. Together, this looks like: - - - -```sql -{{ - config( - materialized='table', - partitions=dbt.partition_range('20180101, 20180201'), - verbose=True - ) -}} - --- This model creates a date-partitioned table. There will be one --- partition for each day between 20180101 and 20180201, inclusive. --- The `date_sharded_table` macro below is sugar around [DBT__PARTITION_DATE] - -select * -from `snowplow`.`{{ date_sharded_table('events_') }}` -``` - - - -Finally, it's frequently desirable to only update a date partitioned table for the last day of received data. This can be implemented using the above configurations in conjunction with a clever macro and some [command line variables](/docs/build/project-variables). - -First, the macro: - - - -```sql -{% macro yesterday() %} - - {% set today = modules.datetime.date.today() %} - {% set one_day = modules.datetime.timedelta(days=1) %} - {% set yesterday = (today - one_day) %} - - {{ return(yesterday.strftime("%Y%m%d")) }} - -{% endmacro %} -``` - - - -Next, use it in the model: - - - -```sql -{{ - config( - materialized='table', - partitions=dbt.partition_range(var('dates', default=yesterday())), - verbose=True - ) -}} - -select * -from `snowplow`.`{{ date_sharded_table('events_') }}` -``` - - - -If a `dates` variable is provided (eg. on the command line with `--vars`), then dbt will create the partitions for that date range. Otherwise, dbt will create a partition for `yesterday`, overwriting it if it already exists. - -Here's an example of running this model for the first 3 days of 2018 as a part of a backfill: - -``` -dbt run --select partitioned_yesterday --vars 'dates: "20180101, 20180103"' -``` diff --git a/website/docs/guides/legacy/videos.md b/website/docs/guides/legacy/videos.md deleted file mode 100644 index 863029ff6d9..00000000000 --- a/website/docs/guides/legacy/videos.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: "Videos 🎥" -id: "videos" ---- - -Check out some cool videos about using and deploying dbt! - -## dbt tutorial (February, 2017) - - - -## dbt docs demo with GitLab (September, 2018) - diff --git a/website/docs/guides/manual-install-qs.md b/website/docs/guides/manual-install-qs.md new file mode 100644 index 00000000000..61796fe008a --- /dev/null +++ b/website/docs/guides/manual-install-qs.md @@ -0,0 +1,468 @@ +--- +title: "Quickstart for dbt Core from a manual install" +id: manual-install +description: "Connecting your warehouse to dbt Core using the CLI." +level: 'Beginner' +platform: 'dbt-core' +icon: 'fa-light fa-square-terminal' +tags: ['dbt Core','Quickstart'] +hide_table_of_contents: true +--- +## Introduction + +When you use dbt Core to work with dbt, you will be editing files locally using a code editor, and running projects using a command line interface (CLI). If you'd rather edit files and run projects using the web-based Integrated Development Environment (IDE), you should refer to the [dbt Cloud quickstarts](/guides). You can also develop and run dbt commands using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) — a dbt Cloud powered command line. + +### Prerequisites + +* To use dbt Core, it's important that you know some basics of the Terminal. 
In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily. +* Install dbt Core using the [installation instructions](/docs/core/installation) for your operating system. +* Complete [Setting up (in BigQuery)](/guides/bigquery?step=2) and [Loading data (BigQuery)](/guides/bigquery?step=3). +* [Create a GitHub account](https://github.com/join) if you don't already have one. + +### Create a starter project + +After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models. + +## Create a repository + +The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join). + +1. [Create a new GitHub repository](https://github.com/new) named `dbt-tutorial`. +2. Select **Public** so the repository can be shared with others. You can always make it private later. +3. Leave the default values for all other settings. +4. Click **Create repository**. +5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes). + +## Create a project + +Learn how to use a series of commands using the command line of the Terminal to create your project. dbt Core includes an `init` command that helps scaffold a dbt project. + +To create your dbt project: + +1. Make sure you have dbt Core installed and check the version using the `dbt --version` command: + +```shell +dbt --version +``` + +2. Initiate the `jaffle_shop` project using the `init` command: + +```shell +dbt init jaffle_shop +``` + +3. Navigate into your project's directory: + +```shell +cd jaffle_shop +``` + +4. Use `pwd` to confirm that you are in the right spot: + +```shell +$ pwd +> Users/BBaggins/dbt-tutorial/jaffle_shop +``` + +5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command. + +
      + +
      + +6. Update the following values in the `dbt_project.yml` file: + + + +```yaml +name: jaffle_shop # Change from the default, `my_new_project` + +... + +profile: jaffle_shop # Change from the default profile name, `default` + +... + +models: + jaffle_shop: # Change from `my_new_project` to match the previous value for `name:` + ... +``` + + + +## Connect to BigQuery + +When developing locally, dbt connects to your using a [profile](/docs/core/connect-data-platform/connection-profiles), which is a YAML file with all the connection details to your warehouse. + +1. Create a file in the `~/.dbt/` directory named `profiles.yml`. +2. Move your BigQuery keyfile into this directory. +3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted. + + + +```yaml +jaffle_shop: # this needs to match the profile in your dbt_project.yml file + target: dev + outputs: + dev: + type: bigquery + method: service-account + keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile + project: grand-highway-265418 # Replace this with your project id + dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo + threads: 1 + timeout_seconds: 300 + location: US + priority: interactive +``` + + + +4. Run the `debug` command from your project to confirm that you can successfully connect: + +```shell +$ dbt debug +> Connection test: OK connection ok +``` + +
      + +
      + +### FAQs + + + + + + + +## Perform your first dbt run + +Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order. + +1. Enter the `run` command to build example models: + +```shell +dbt run +``` + +You should have an output that looks like this: + +
      + +
      + +## Commit your changes + +Commit your changes so that the repository contains the latest code. + +1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository). + +```shell +git init +git branch -M main +git add . +git commit -m "Create a dbt project" +git remote add origin https://github.com/USERNAME/dbt-tutorial.git +git push -u origin main +``` + +2. Return to your GitHub repository to verify your new files have been added. + +### Build your first models + +Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! +In the next steps, you will take a sample query and turn it into a model in your dbt project. + +## Checkout a new git branch + +Check out a new git branch to work on new code: + +1. Create a new branch by using the `checkout` command and passing the `-b` flag: + +```shell +$ git checkout -b add-customers-model +> Switched to a new branch `add-customer-model` +``` + +## Build your first model + + +1. Open your project in your favorite code editor. +2. Create a new SQL file in the `models` directory, named `models/customers.sql`. +3. Paste the following query into the `models/customers.sql` file. + + + +4. From the command line, enter `dbt run`. +
      + +
      + +When you return to the BigQuery console, you can `select` from this model. + +### FAQs + + + + + + + +## Change the way your model is materialized + + + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. + + + +
      + + + +```sql +select + id as customer_id, + first_name, + last_name + +from `dbt-tutorial`.jaffle_shop.customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from `dbt-tutorial`.jaffle_shop.orders +``` + + + +
      + +
      + + + +```sql +select + id as customer_id, + first_name, + last_name + +from jaffle_shop_customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from jaffle_shop_orders +``` + + + +
      + +
      + + + +```sql +select + id as customer_id, + first_name, + last_name + +from jaffle_shop.customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from jaffle_shop.orders +``` + + + +
      + +
      + + + +```sql +select + id as customer_id, + first_name, + last_name + +from raw.jaffle_shop.customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from raw.jaffle_shop.orders +``` + + + +
      + +
      + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + +```sql +with customers as ( + + select * from {{ ref('stg_customers') }} + +), + +orders as ( + + select * from {{ ref('stg_orders') }} + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final + +``` + + + +4. Execute `dbt run`. + +This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +### FAQs {#faq-2} + + + + + +### Next steps + + + +You can also explore: + +* The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL. +* The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the select statements that are running and the python logging happening when dbt runs. + +## Add tests to your models + + + +## Document your models + + + +3. Run `dbt docs serve` command to launch the documentation in a local website. + +#### FAQs + + + + + +#### Next steps + + + +## Commit updated changes + +You need to commit the changes you made to the project so that the repository has your latest code. + +1. Add all your changes to git: `git add -A` +2. Commit your changes: `git commit -m "Add customers model, tests, docs"` +3. Push your changes to your repository: `git push` +4. Navigate to your repository, and open a pull request to merge the code into your master branch. + +## Schedule a job + +We recommend using dbt Cloud as the easiest and most reliable way to [deploy jobs](/docs/deploy/deployments) and automate your dbt project in production. + +For more info on how to get started, refer to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs). + + + +For more information about using dbt Core to schedule a job, refer [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post. diff --git a/website/docs/guides/microsoft-fabric-qs.md b/website/docs/guides/microsoft-fabric-qs.md new file mode 100644 index 00000000000..c7c53a2aac7 --- /dev/null +++ b/website/docs/guides/microsoft-fabric-qs.md @@ -0,0 +1,314 @@ +--- +title: "Quickstart for dbt Cloud and Microsoft Fabric" +id: "microsoft-fabric" +level: 'Beginner' +icon: 'fabric' +hide_table_of_contents: true +tags: ['dbt Cloud','Quickstart'] +recently_updated: true +--- +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with Microsoft Fabric. It will show you how to: + +- Load the Jaffle Shop sample data (provided by dbt Labs) into your Microsoft Fabric warehouse. +- Connect dbt Cloud to Microsoft Fabric. +- Turn a sample query into a model in your dbt project. A model in dbt is a SELECT statement. +- Add tests to your models. +- Document your models. +- Schedule a job to run. 
+ +:::tip Public preview + +A public preview of Microsoft Fabric in dbt Cloud is now available! + +::: + +### Prerequisites +- You have a [dbt Cloud](https://www.getdbt.com/signup/) account. +- You have started the Microsoft Fabric (Preview) trial. For details, refer to [Microsoft Fabric (Preview) trial](https://learn.microsoft.com/en-us/fabric/get-started/fabric-trial) in the Microsoft docs. +- As a Microsoft admin, you’ve enabled service principal authentication. For details, refer to [Enable service principal authentication](https://learn.microsoft.com/en-us/fabric/admin/metadata-scanning-enable-read-only-apis) in the Microsoft docs. dbt Cloud needs these authentication credentials to connect to Microsoft Fabric. + +### Related content +- [dbt Courses](https://courses.getdbt.com/collections) +- [About continuous integration jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + +## Load data into your Microsoft Fabric warehouse + +1. Log in to your [Microsoft Fabric](http://app.fabric.microsoft.com) account. +2. On the home page, select the **Synapse Data Warehouse** tile. + + + +3. From **Workspaces** on the left sidebar, navigate to your organization’s workspace. Or, you can create a new workspace; refer to [Create a workspace](https://learn.microsoft.com/en-us/fabric/get-started/create-workspaces) in the Microsoft docs for more details. +4. Choose your warehouse from the table. Or, you can create a new warehouse; refer to [Create a warehouse](https://learn.microsoft.com/en-us/fabric/data-warehouse/tutorial-create-warehouse) in the Microsoft docs for more details. +5. Open the SQL editor by selecting **New SQL query** from the top bar. +6. Copy these statements into the SQL editor to load the Jaffle Shop example data: + + ```sql + DROP TABLE dbo.customers; + + CREATE TABLE dbo.customers + ( + [ID] [int], + [FIRST_NAME] [varchar] (8000), + [LAST_NAME] [varchar] (8000) + ); + + COPY INTO [dbo].[customers] + FROM 'https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/jaffle_shop_customers.parquet' + WITH ( + FILE_TYPE = 'PARQUET' + ); + + DROP TABLE dbo.orders; + + CREATE TABLE dbo.orders + ( + [ID] [int], + [USER_ID] [int], + -- [ORDER_DATE] [int], + [ORDER_DATE] [date], + [STATUS] [varchar] (8000) + ); + + COPY INTO [dbo].[orders] + FROM 'https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/jaffle_shop_orders.parquet' + WITH ( + FILE_TYPE = 'PARQUET' + ); + + DROP TABLE dbo.payments; + + CREATE TABLE dbo.payments + ( + [ID] [int], + [ORDERID] [int], + [PAYMENTMETHOD] [varchar] (8000), + [STATUS] [varchar] (8000), + [AMOUNT] [int], + [CREATED] [date] + ); + + COPY INTO [dbo].[payments] + FROM 'https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/stripe_payments.parquet' + WITH ( + FILE_TYPE = 'PARQUET' + ); + ``` + + + +## Connect dbt Cloud to Microsoft Fabric + +1. Create a new project in dbt Cloud. From **Account settings** (using the gear menu in the top right corner), click **+ New Project**. +2. Enter a project name and click **Continue**. +3. Choose **Fabric** as your connection and click **Next**. +4. In the **Configure your environment** section, enter the **Settings** for your new project: +5. Enter the **Development credentials** for your new project: + - **Authentication** — Choose **Service Principal** from the dropdown. 
+ - **Tenant ID** — Use the service principal’s **Directory (tenant) id** as the value. + - **Client ID** — Use the service principal’s **application (client) ID id** as the value. + - **Client secret** — Use the service principal’s **client secret** (not the **client secret id**) as the value. +6. Click **Test connection**. This verifies that dbt Cloud can access your Microsoft Fabric account. +7. Click **Next** when the test succeeds. If it failed, you might need to check your Microsoft service principal. + +## Set up a dbt Cloud managed repository + + +## Initialize your dbt project​ and start developing +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize dbt project**. This builds out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` and click **Commit**. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. You can try this out now: + - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model +1. Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode. +1. Click the **...** next to the `models` directory, then select **Create file**. +1. Name the file `customers.sql`, then click **Create**. +1. Copy the following query into the file and click **Save**. + + + + ```sql + with customers as ( + + select + ID as customer_id, + FIRST_NAME as first_name, + LAST_NAME as last_name + + from dbo.customers + ), + + orders as ( + + select + ID as order_id, + USER_ID as customer_id, + ORDER_DATE as order_date, + STATUS as status + + from dbo.orders + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by customer_id + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders on customers.customer_id = customer_orders.customer_id + ) + + select * from final + ``` + + +1. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +Later, you can connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. + +#### FAQs + + + + + + + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. 
+ + + + ```sql + select + ID as customer_id, + FIRST_NAME as first_name, + LAST_NAME as last_name + + from dbo.customers + ``` + + + + + + ```sql + select + ID as order_id, + USER_ID as customer_id, + ORDER_DATE as order_date, + STATUS as status + + from dbo.orders + ``` + + + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by customer_id + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders on customers.customer_id = customer_orders.customer_id + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +#### FAQs {#faq-2} + + + + + + + + \ No newline at end of file diff --git a/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md b/website/docs/guides/migrate-from-spark-to-databricks.md similarity index 76% rename from website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md rename to website/docs/guides/migrate-from-spark-to-databricks.md index f5549c58416..8fb02ae79d7 100644 --- a/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md +++ b/website/docs/guides/migrate-from-spark-to-databricks.md @@ -1,18 +1,34 @@ --- -title: "Migrating from dbt-spark to dbt-databricks" -id: "migrating-from-spark-to-databricks" +title: "Migrate from dbt-spark to dbt-databricks" +id: "migrate-from-spark-to-databricks" +description: Learn how to migrate from dbt-spark to dbt-databricks. +displayText: Migrate from Spark to Databricks +hoverSnippet: Learn how to migrate from dbt-spark to dbt-databricks. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Migration', 'dbt Core','dbt Cloud'] +level: 'Intermediate' +recently_updated: true --- -You can [migrate your projects](#migrate-your-dbt-projects) from using the `dbt-spark` adapter to using the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks). In collaboration with dbt Labs, Databricks built this adapter using dbt-spark as the foundation and added some critical improvements. With it, you get an easier set up — requiring only three inputs for authentication — and more features such as support for [Unity Catalog](https://www.databricks.com/product/unity-catalog). +## Introduction -## Simpler authentication +You can migrate your projects from using the `dbt-spark` adapter to using the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks). In collaboration with dbt Labs, Databricks built this adapter using dbt-spark as the foundation and added some critical improvements. 
With it, you get an easier set up — requiring only three inputs for authentication — and more features such as support for [Unity Catalog](https://www.databricks.com/product/unity-catalog). + +### Prerequisites + +- Your project must be compatible with dbt 1.0 or greater. Refer to [Upgrading to v1.0](/docs/dbt-versions/core-upgrade/upgrading-to-v1.0) for details. For the latest version of dbt, refer to [Upgrading to v1.7](/docs/dbt-versions/core-upgrade/upgrading-to-v1.7). +- For dbt Cloud, you need administrative (admin) privileges to migrate dbt projects. + +### Simpler authentication Previously, you had to provide a `cluster` or `endpoint` ID which was hard to parse from the `http_path` that you were given. Now, it doesn't matter if you're using a cluster or an SQL endpoint because the [dbt-databricks setup](/docs/core/connect-data-platform/databricks-setup) requires the _same_ inputs for both. All you need to provide is: - hostname of the Databricks workspace - HTTP path of the Databricks SQL warehouse or cluster - appropriate credentials -## Better defaults +### Better defaults The `dbt-databricks` adapter provides better defaults than `dbt-spark` does. The defaults help optimize your workflow so you can get the fast performance and cost-effectiveness of Databricks. They are: @@ -24,24 +40,14 @@ With dbt-spark, however, the default for `incremental_strategy` is `append`. If For more information on defaults, see [Caveats](/docs/core/connect-data-platform/databricks-setup#caveats). -## Pure Python +### Pure Python If you use dbt Core, you no longer have to download an independent driver to interact with Databricks. The connection information is all embedded in a pure-Python library called `databricks-sql-connector`. -## Migrate your dbt projects - -In both dbt Core and dbt Cloud, you can migrate your projects to the Databricks-specific adapter from the generic Apache Spark adapter. - -### Prerequisites - -- Your project must be compatible with dbt 1.0 or greater. Refer to [Upgrading to v1.0](/guides/migration/versions/upgrading-to-v1.0) for details. For the latest version of dbt, refer to [Upgrading to v1.3](/guides/migration/versions/upgrading-to-v1.3). -- For dbt Cloud, you need administrative (admin) privileges to migrate dbt projects. - - - +## Migrate your dbt projects in dbt Cloud - +You can migrate your projects to the Databricks-specific adapter from the generic Apache Spark adapter. If you're using dbt Core, then skip to Step 4. The migration to the `dbt-databricks` adapter from `dbt-spark` shouldn't cause any downtime for production jobs. dbt Labs recommends that you schedule the connection change when usage of the IDE is light to avoid disrupting your team. @@ -60,7 +66,7 @@ To update your Databricks connection in dbt Cloud: Everyone in your organization who uses dbt Cloud must refresh the IDE before starting work again. It should refresh in less than a minute. -#### About your credentials +## Configure your credentials When you update the Databricks connection in dbt Cloud, your team will not lose their credentials. This makes migrating easier since it only requires you to delete the Databricks connection and re-add the cluster or endpoint information. @@ -70,9 +76,7 @@ These credentials will not get lost when there's a successful connection to Data - The personal access tokens your team added in their dbt Cloud profile so they can develop in the IDE for a given project. 
- The access token you added for each deployment environment so dbt Cloud can connect to Databricks during production jobs.
+
-
-
+## Migrate dbt projects in dbt Core
To migrate your dbt Core projects to the `dbt-databricks` adapter from `dbt-spark`, you:
1. Install the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks) in your environment
@@ -80,13 +84,8 @@ To migrate your dbt Core projects to the `dbt-databricks` adapter from `dbt-spar
Anyone who's using your project must also make these changes in their environment.
-
-
-
-
-### Examples
+## Try these examples
You can use the following examples of the `profiles.yml` file to see the authentication setup with `dbt-spark` compared to the simpler setup with `dbt-databricks` when connecting to an SQL endpoint. A cluster example would look similar.
diff --git a/website/docs/guides/migrate-from-stored-procedures.md b/website/docs/guides/migrate-from-stored-procedures.md
new file mode 100644
index 00000000000..c894bce9873
--- /dev/null
+++ b/website/docs/guides/migrate-from-stored-procedures.md
@@ -0,0 +1,377 @@
+---
+title: Migrate from DDL, DML, and stored procedures
+id: migrate-from-stored-procedures
+description: Learn how to transform from a historical codebase of mixed DDL and DML statements to dbt models, including tips and patterns for the shift from a procedural to a declarative approach in defining datasets.
+displayText: Migrate from DDL, DML, and stored procedures
+hoverSnippet: Learn how to transform from a historical codebase of mixed DDL and DML statements to dbt models
+# time_to_complete: '30 minutes' commenting out until we test
+platform: 'dbt-core'
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Migration', 'dbt Core']
+level: 'Beginner'
+recently_updated: true
+---
+
+## Introduction
+
+One of the more common situations that new dbt adopters encounter is a historical codebase of transformations written as a hodgepodge of DDL and DML statements, or stored procedures. Going from DML statements to dbt models is often a challenging hump for new users to get over, because the process involves a significant paradigm shift between a procedural flow of building a dataset (e.g. a series of DDL and DML statements) to a declarative approach to defining a dataset (e.g. how dbt uses SELECT statements to express data models). This guide aims to provide tips, tricks, and common patterns for converting DML statements to dbt models.
+
+### Preparing to migrate
+
+Before getting into the meat of conversion, it’s worth noting that DML statements will not always illustrate a comprehensive set of columns and column types that an original table might contain. Without knowing the DDL to create the table, it’s impossible to know precisely if your conversion effort is apples-to-apples, but you can generally get close.
+
+If your data warehouse supports `SHOW CREATE TABLE`, that can be a quick way to get a comprehensive set of columns you’ll want to recreate. If you don’t have the DDL, but are working on a substantial stored procedure, one approach that can work is to pull column lists out of any DML statements that modify the table, and build up a full set of the columns that appear.
+
+As for ensuring that you have the right column types, since models materialized by dbt generally use `CREATE TABLE AS SELECT` or `CREATE VIEW AS SELECT` as the driver for object creation, tables can end up with unintended column types if the queries aren’t explicit.
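+
+A minimal sketch of what being explicit can look like, assuming a hypothetical staging model and source (the file, source, and column names below are illustrative and not taken from any real procedure):
+
+```sql
+-- models/staging/stg_orders_typed.sql (hypothetical file name)
+-- Cast each column to the type you intend so the created table
+-- doesn't inherit whatever type the warehouse would otherwise infer.
+SELECT
+    CAST(id AS integer) AS order_id,
+    CAST(order_total AS numeric(18, 2)) AS order_total,
+    CAST(ordered_at AS timestamp) AS ordered_at
+
+-- Assumes a source named 'jaffle_shop' is declared in a sources .yml file
+FROM {{ source('jaffle_shop', 'raw_orders') }}
+```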
+If you care about `INT` versus `DECIMAL` versus `NUMERIC`, for example, it’s generally going to be best to be explicit in this way. The good news is that this is easy with dbt: you just cast the column to the type you intend.
+
+We also generally recommend that column renaming and type casting happen as close to the source tables as possible, typically in a layer of staging transformations, which helps ensure that future dbt modelers will know where to look for those transformations! See [How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) for more guidance on overall project structure.
+
+### Operations we need to map
+
+There are four primary DML statements that you are likely to have to convert to dbt operations while migrating a procedure:
+
+- `INSERT`
+- `UPDATE`
+- `DELETE`
+- `MERGE`
+
+Each of these can be addressed using various techniques in dbt. Handling `MERGE`s is a bit more involved than the rest, but can be handled effectively via dbt. The first three, however, are fairly simple to convert.
+
+## Map INSERTs
+
+An `INSERT` statement is functionally the same as using dbt to `SELECT` from an existing source or other dbt model. If you are faced with an `INSERT`-`SELECT` statement, the easiest way to convert the statement is to just create a new dbt model, and pull the `SELECT` portion of the `INSERT` statement out of the procedure and into the model. That’s basically it!
+
+To really break it down, let’s consider a simple example:
+
+```sql
+INSERT INTO returned_orders (order_id, order_date, total_return)
+
+SELECT order_id, order_date, total FROM orders WHERE type = 'return'
+```
+
+Converting this with a first pass to a [dbt model](/guides/bigquery?step=8) (in a file called returned_orders.sql) might look something like:
+
+```sql
+SELECT
+    order_id as order_id,
+    order_date as order_date,
+    total as total_return
+
+FROM {{ ref('orders') }}
+
+WHERE type = 'return'
+```
+
+Functionally, this would create a model (which could be materialized as a table or view depending on needs) called `returned_orders` that contains three columns (`order_id`, `order_date`, and `total_return`), filtered on the type column. It achieves the same end as the `INSERT`, just in a declarative fashion, using dbt.
+
+### **A note on `FROM` clauses**
+
+In dbt, using a hard-coded table or view name in a `FROM` clause is one of the most serious mistakes new users make. dbt uses the ref and source macros to discover the ordering that transformations need to execute in, and if you don’t use them, you’ll be unable to benefit from dbt’s built-in lineage generation and pipeline execution. In the sample code throughout the remainder of this article, we’ll use ref statements in the dbt-converted versions of SQL statements, but it is an exercise for the reader to ensure that those models exist in their dbt projects.
+
+### **Sequential `INSERT`s to an existing table can be `UNION ALL`’ed together**
+
+Since dbt models effectively perform a single `CREATE TABLE AS SELECT` (or if you break it down into steps, `CREATE`, then an `INSERT`), you may run into complexities if there are multiple `INSERT` statements in your transformation that all insert data into the same table. Fortunately, this is a simple thing to handle in dbt. Effectively, the logic is performing a `UNION ALL` between the `INSERT` queries.
+If I have a transformation flow that looks something like (ignore the contrived nature of the scenario):
+
+```sql
+CREATE TABLE all_customers
+
+INSERT INTO all_customers SELECT * FROM us_customers
+
+INSERT INTO all_customers SELECT * FROM eu_customers
+```
+
+The dbt-ified version of this would end up looking something like:
+
+```sql
+SELECT * FROM {{ ref('us_customers') }}
+
+UNION ALL
+
+SELECT * FROM {{ ref('eu_customers') }}
+```
+
+The logic is functionally equivalent. So if there’s another statement that `INSERT`s into a model that I’ve already created, I can just add that logic into a second `SELECT` statement that is just `UNION ALL`'ed with the first. Easy!
+
+## Map UPDATEs
+
+`UPDATE`s start to increase the complexity of your transformations, but fortunately, they’re pretty darn simple to migrate, as well. The thought process that you go through when translating an `UPDATE` is quite similar to how an `INSERT` works, but the logic for the `SELECT` list in the dbt model is primarily sourced from the content in the `SET` section of the `UPDATE` statement. Let’s look at a simple example:
+
+```sql
+UPDATE orders
+
+SET type = 'return'
+
+WHERE total < 0
+```
+
+The way to look at this is similar to an `INSERT`-`SELECT` statement. The table being updated is the model you want to modify, and since this is an `UPDATE`, that model has likely already been created, and you can either:
+
+- add to it with subsequent transformations
+- create an intermediate model that builds off of the original model – perhaps naming it something like `int_[entity]_[verb].sql`.
+
+The `SELECT` list should contain all of the columns for the table, but for the specific columns being updated by the DML, you’ll use the computation on the right side of the equals sign as the `SELECT`ed value. Then, you can use the target column name on the left of the equals sign as the column alias.
+
+If I were building an intermediate transformation, the above query would translate to something along the lines of:
+
+```sql
+SELECT
+    CASE
+        WHEN total < 0 THEN 'return'
+        ELSE type
+    END AS type,
+
+    order_id,
+    order_date
+
+FROM {{ ref('stg_orders') }}
+```
+
+Since the `UPDATE` statement doesn’t modify every value of the type column, we use a `CASE` statement to apply the condition from the `UPDATE`’s `WHERE` clause. We still want to select all of the columns that should end up in the target table. If we left one of the columns out, it wouldn’t be passed through to the target table at all due to dbt’s declarative approach.
+
+Sometimes, you may not be sure what all the columns are in a table, or in the situation as above, you’re only modifying a small number of columns relative to the total number of columns in the table. It can be cumbersome to list out every column in the table, but fortunately dbt contains some useful utility macros that can help list out the full column list of a table.
+
+Another way I could have written the model a bit more dynamically might be:
+
+```sql
+SELECT
+    {{ dbt_utils.star(from=ref('stg_orders'), except=['type']) }},
+    CASE
+        WHEN total < 0 THEN 'return'
+        ELSE type
+    END AS type
+
+FROM {{ ref('stg_orders') }}
+```
+
+The `dbt_utils.star()` macro will print out the full list of columns in the table, but skip the ones I’ve listed in the except list, which allows me to perform the same logic while writing fewer lines of code. This is a simple example of using dbt macros to simplify and shorten your code, and dbt can get a lot more sophisticated as you learn more techniques.
+Read more about the [dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and the [star macro](https://github.com/dbt-labs/dbt-utils/tree/0.8.6/#star-source).
+
+## Map DELETEs
+
+One of the biggest differences between a procedural transformation and how dbt models data is that dbt, in general, will never destroy data. While there are ways to execute hard `DELETE`s in dbt that are outside of the scope of this article, the general best practice for handling deleted data is to just use soft deletes, and filter out soft-deleted data in a final transformation.
+
+Let’s consider a simple example query:
+
+```sql
+DELETE FROM stg_orders WHERE order_status IS NULL
+```
+
+In a dbt model, you’ll need to first identify the records that should be deleted and then filter them out. There are really two primary ways you might translate this query:
+
+```sql
+SELECT * FROM {{ ref('stg_orders') }} WHERE order_status IS NOT NULL
+```
+
+This first approach just inverts the logic of the DELETE to describe the set of records that should remain, instead of the set of records that should be removed. This ties back to the way dbt declaratively describes datasets. You reference the data that should be in a dataset, and the table or view gets created with that set of data.
+
+Another way you could achieve this is by marking the deleted records, and then filtering them out. For example:
+
+```sql
+WITH
+
+soft_deletes AS (
+
+    SELECT
+        *,
+        CASE
+            WHEN order_status IS NULL THEN true
+            ELSE false
+        END AS to_delete
+
+    FROM {{ ref('stg_orders') }}
+
+)
+
+SELECT * FROM soft_deletes WHERE to_delete = false
+```
+
+This approach flags all of the deleted records, and the final `SELECT` filters out any deleted data, so the resulting table contains only the remaining records. It’s a lot more verbose than just inverting the `DELETE` logic, but for complex `DELETE` logic, this ends up being a very effective way of performing the `DELETE` that retains historical context.
+
+It’s worth calling out that while this doesn’t enable a hard delete, hard deletes can be executed in a number of ways, the most common being to execute a dbt [macro](/docs/build/jinja-macros) as a [run-operation](https://docs.getdbt.com/reference/commands/run-operation), or by using a [post-hook](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook/) to perform a `DELETE` statement after the records to-be-deleted have been marked. These are advanced approaches outside the scope of this guide.
+
+
+## Map MERGEs
+dbt has a concept called [materialization](/docs/build/materializations), which determines how a model is physically or logically represented in the warehouse. `INSERT`s, `UPDATE`s, and `DELETE`s will typically be accomplished using table or view materializations. For incremental workloads accomplished via commands like `MERGE` or `UPSERT`, dbt has a particular materialization called [incremental](/docs/build/incremental-models). The incremental materialization is specifically used to handle incremental loads and updates to a table without recreating the entire table from scratch on every run.
+
+### Step 1: Map the MERGE like an INSERT/UPDATE to start
+
+Before we get into the exact details of how to implement an incremental materialization, let’s talk about logic conversion. Extracting the logic of the `MERGE` and handling it as you would an `INSERT` or an `UPDATE` is the easiest way to get started migrating a `MERGE` command.
+
+To see how the logic conversion works, we’ll start with an example `MERGE`.
+In this scenario, imagine a ride sharing app where rides are loaded into a details table daily, and tips may be updated at some later date, and need to be kept up-to-date:
+
+```sql
+MERGE INTO ride_details USING (
+    SELECT
+        ride_id,
+        subtotal,
+        tip
+
+    FROM rides_to_load
+) AS rtl
+
+ON ride_details.ride_id = rtl.ride_id
+
+WHEN MATCHED THEN UPDATE
+
+SET ride_details.tip = rtl.tip
+
+WHEN NOT MATCHED THEN INSERT (ride_id, subtotal, tip)
+VALUES (rtl.ride_id, rtl.subtotal, NVL(rtl.tip, 0));
+```
+
+The content of the `USING` clause is a useful piece of code because that can easily be placed in a CTE as a starting point for handling the match statement. I find that the easiest way to break this apart is to treat each match statement as a separate CTE that builds on the previous match statements.
+
+We can ignore the `ON` clause for now, as that will only come into play once we get to a point where we’re ready to turn this into an incremental.
+
+As with `UPDATE`s and `INSERT`s, you can use the `SELECT` list and aliases to name columns appropriately for the target table, and `UNION` together `INSERT` statements (taking care to use `UNION`, rather than `UNION ALL`, to avoid duplicates).
+
+The `MERGE` would end up translating to something like this:
+
+```sql
+WITH
+
+using_clause AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        tip
+
+    FROM {{ ref('rides_to_load') }}
+
+),
+
+updates AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        tip
+
+    FROM using_clause
+
+),
+
+inserts AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        NVL(tip, 0) AS tip
+
+    FROM using_clause
+
+)
+
+SELECT *
+
+FROM updates
+
+UNION
+
+SELECT * FROM inserts
+```
+
+To be clear, this transformation isn’t complete. The logic here is similar to the `MERGE`, but will not actually do the same thing, since the updates and inserts CTEs are both selecting from the same source query. We’ll need to ensure we grab the separate sets of data as we transition to the incremental materialization.
+
+One important caveat is that dbt does not natively support `DELETE` as a `MATCH` action. If you have a line in your `MERGE` statement that uses `WHEN MATCHED THEN DELETE`, you’ll want to treat it like an update and add a soft-delete flag, which is then filtered out in a follow-on transformation.
+
+### Step 2: Convert to incremental materialization
+
+As mentioned above, incremental materializations are a little special in that when the target table does not exist, the materialization functions in nearly the same way as a standard table materialization, and executes a `CREATE TABLE AS SELECT` statement. If the target table does exist, however, the materialization instead executes a `MERGE` statement.
+
+Since a `MERGE` requires a `JOIN` condition between the `USING` clause and the target table, we need a way to specify how dbt determines whether or not a record triggers a match. That particular piece of information is specified in the dbt model configuration.
+
+We can add the following `config()` block to the top of our model to specify how it should build incrementally:
+
+```sql
+{{
+    config(
+        materialized='incremental',
+        unique_key='ride_id',
+        incremental_strategy='merge'
+    )
+}}
+```
+
+The three configuration fields in this example are the most important ones.
+
+- Setting `materialized='incremental'` tells dbt to apply UPSERT logic to the target table.
+- The `unique_key` should be a primary key of the target table. This is used to match records with the existing table.
+- `incremental_strategy` here is set to `merge`, which updates any existing rows in the target table whose `unique_key` matches a row in the incoming batch of data. There are [various incremental strategies](/docs/build/incremental-models#about-incremental_strategy) for different situations and warehouses.
+
+The bulk of the work in converting a model to an incremental materialization comes in determining how the logic should change for incremental loads versus full backfills or initial loads. dbt offers a special macro, `is_incremental()`, which evaluates false for initial loads or for backfills (called full refreshes in dbt parlance), but true for incremental loads.
+
+This macro can be used to augment the model code to adjust how data is loaded for subsequent loads. How that logic should be added will depend a little bit on how data is received. Some common ways might be:
+
+1. The source table is truncated ahead of incremental loads, and only contains the data to be loaded in that increment.
+2. The source table contains all historical data, and there is a load timestamp column that identifies new data to be loaded.
+
+In the first case, the work is essentially done already. Since the source table always contains only the new data to be loaded, the query doesn’t have to change for incremental loads. The second case, however, requires the use of the `is_incremental()` macro to correctly handle the logic.
+
+Taking the converted `MERGE` statement that we’d put together previously, we’d augment it to add this additional logic:
+
+```sql
+WITH
+
+using_clause AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        tip,
+        max(load_timestamp) as load_timestamp
+
+    FROM {{ ref('rides_to_load') }}
+
+
+    {% if is_incremental() %}
+
+    WHERE load_timestamp > (SELECT max(load_timestamp) FROM {{ this }})
+
+    {% endif %}
+
+),
+
+updates AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        tip,
+        load_timestamp
+
+    FROM using_clause
+
+    {% if is_incremental() %}
+
+    WHERE ride_id IN (SELECT ride_id FROM {{ this }})
+
+    {% endif %}
+
+),
+
+inserts AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        NVL(tip, 0) AS tip,
+        load_timestamp
+
+    FROM using_clause
+
+    WHERE ride_id NOT IN (SELECT ride_id FROM updates)
+
+)
+
+SELECT * FROM updates UNION SELECT * FROM inserts
+```
+
+There are a couple of important concepts to understand here:
+
+1. The code in the `is_incremental()` conditional block only executes for incremental executions of this model code. If the target table doesn’t exist, or if the `--full-refresh` option is used, that code will not execute.
+2. `{{ this }}` is a special keyword in dbt that, when used in a Jinja block, self-refers to the model for which the code is executing. So if you have a model in a file called `my_incremental_model.sql`, `{{ this }}` will refer to `my_incremental_model` (fully qualified with database and schema name if necessary). By using that keyword, we can leverage the current state of the target table to inform the source query.
+
+
+## Migrate stored procedures
+
+The techniques shared above are useful ways to get started converting the individual DML statements that are often found in stored procedures. Using these types of patterns, legacy procedural code can be rapidly transitioned to dbt models that are much more readable, maintainable, and benefit from software engineering best practices like DRY principles. Additionally, once transformations are rewritten as dbt models, it becomes much easier to test the transformations to ensure that the data being used downstream is high-quality and trustworthy.
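+
+To make that last point concrete, here is a minimal, hypothetical sketch of a singular dbt test. A singular test is just a SQL file saved under your project’s `tests/` directory that selects the rows which would violate an assumption; dbt reports a failure if the query returns any rows. The file name and assertion below are illustrative, building on the `returned_orders` example from earlier in this guide:
+
+```sql
+-- tests/assert_returned_orders_have_order_ids.sql (hypothetical file name)
+-- Fails if any returned order is missing its key.
+SELECT
+    order_id
+
+FROM {{ ref('returned_orders') }}
+
+WHERE order_id IS NULL
+```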
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md deleted file mode 100644 index aae8b373b2c..00000000000 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: Migrating from DDL, DML, and stored procedures -id: 1-migrating-from-stored-procedures ---- - -One of the more common situations that new dbt adopters encounter is a historical codebase of transformations written as a hodgepodge of DDL and DML statements, or stored procedures. Going from DML statements to dbt models is often a challenging hump for new users to get over, because the process involves a significant paradigm shift between a procedural flow of building a dataset (e.g. a series of DDL and DML statements) to a declarative approach to defining a dataset (e.g. how dbt uses SELECT statements to express data models). This guide aims to provide tips, tricks, and common patterns for converting DML statements to dbt models. - -## Preparing to migrate - -Before getting into the meat of conversion, it’s worth noting that DML statements will not always illustrate a comprehensive set of columns and column types that an original table might contain. Without knowing the DDL to create the table, it’s impossible to know precisely if your conversion effort is apples-to-apples, but you can generally get close. - -If your supports `SHOW CREATE TABLE`, that can be a quick way to get a comprehensive set of columns you’ll want to recreate. If you don’t have the DDL, but are working on a substantial stored procedure, one approach that can work is to pull column lists out of any DML statements that modify the table, and build up a full set of the columns that appear. - -As for ensuring that you have the right column types, since models materialized by dbt generally use `CREATE TABLE AS SELECT` or `CREATE VIEW AS SELECT` as the driver for object creation, tables can end up with unintended column types if the queries aren’t explicit. For example, if you care about `INT` versus `DECIMAL` versus `NUMERIC`, it’s generally going to be best to be explicit. The good news is that this is easy with dbt: you just cast the column to the type you intend. - -We also generally recommend that column renaming and type casting happen as close to the source tables as possible, typically in a layer of staging transformations, which helps ensure that future dbt modelers will know where to look for those transformations! See [How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) for more guidance on overall project structure. - -### Operations we need to map - -There are four primary DML statements that you are likely to have to convert to dbt operations while migrating a procedure: - -- `INSERT` -- `UPDATE` -- `DELETE` -- `MERGE` - -Each of these can be addressed using various techniques in dbt. Handling `MERGE`s is a bit more involved than the rest, but can be handled effectively via dbt. The first three, however, are fairly simple to convert. 
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md deleted file mode 100644 index d8f31a0f14a..00000000000 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: Inserts -id: 2-inserts ---- - -An `INSERT` statement is functionally the same as using dbt to `SELECT` from an existing source or other dbt model. If you are faced with an `INSERT`-`SELECT` statement, the easiest way to convert the statement is to just create a new dbt model, and pull the `SELECT` portion of the `INSERT` statement out of the procedure and into the model. That’s basically it! - -To really break it down, let’s consider a simple example: - -```sql -INSERT INTO returned_orders (order_id, order_date, total_return) - -SELECT order_id, order_date, total FROM orders WHERE type = 'return' -``` - -Converting this with a first pass to a [dbt model](/quickstarts/bigquery?step=8) (in a file called returned_orders.sql) might look something like: - -```sql -SELECT - order_id as order_id, - order_date as order_date, - total as total_return - -FROM {{ ref('orders') }} - -WHERE type = 'return' -``` - -Functionally, this would create a model (which could be materialized as a table or view depending on needs) called `returned_orders` that contains three columns: `order_id`, `order_date`, `total_return`) predicated on the type column. It achieves the same end as the `INSERT`, just in a declarative fashion, using dbt. - -## **A note on `FROM` clauses** - -In dbt, using a hard-coded table or view name in a `FROM` clause is one of the most serious mistakes new users make. dbt uses the ref and source macros to discover the ordering that transformations need to execute in, and if you don’t use them, you’ll be unable to benefit from dbt’s built-in lineage generation and pipeline execution. In the sample code throughout the remainder of this article, we’ll use ref statements in the dbt-converted versions of SQL statements, but it is an exercise for the reader to ensure that those models exist in their dbt projects. - -## **Sequential `INSERT`s to an existing table can be `UNION ALL`’ed together** - -Since dbt models effectively perform a single `CREATE TABLE AS SELECT` (or if you break it down into steps, `CREATE`, then an `INSERT`), you may run into complexities if there are multiple `INSERT` statements in your transformation that all insert data into the same table. Fortunately, this is a simple thing to handle in dbt. Effectively, the logic is performing a `UNION ALL` between the `INSERT` queries. If I have a transformation flow that looks something like (ignore the contrived nature of the scenario): - -```sql -CREATE TABLE all_customers - -INSERT INTO all_customers SELECT * FROM us_customers - -INSERT INTO all_customers SELECT * FROM eu_customers -``` - -The dbt-ified version of this would end up looking something like: - -```sql -SELECT * FROM {{ ref('us_customers') }} - -UNION ALL - -SELECT * FROM {{ ref('eu_customers') }} -``` - -The logic is functionally equivalent. So if there’s another statement that `INSERT`s into a model that I’ve already created, I can just add that logic into a second `SELECT` statement that is just `UNION ALL`'ed with the first. Easy! 
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/3-mapping-updates.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/3-mapping-updates.md deleted file mode 100644 index b6f0874fb6b..00000000000 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/3-mapping-updates.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Updates -id: 3-updates ---- - -`UPDATE`s start to increase the complexity of your transformations, but fortunately, they’re pretty darn simple to migrate, as well. The thought process that you go through when translating an `UPDATE` is quite similar to how an `INSERT` works, but the logic for the `SELECT` list in the dbt model is primarily sourced from the content in the `SET` section of the `UPDATE` statement. Let’s look at a simple example: - -```sql -UPDATE orders - -SET type = 'return' - -WHERE total < 0 -``` - -The way to look at this is similar to an `INSERT`-`SELECT` statement. The table being updated is the model you want to modify, and since this is an `UPDATE`, that model has likely already been created, and you can either: - -- add to it with subsequent transformations -- create an intermediate model that builds off of the original model – perhaps naming it something like `int_[entity]_[verb].sql`. - -The `SELECT` list should contain all of the columns for the table, but for the specific columns being updated by the DML, you’ll use the computation on the right side of the equals sign as the `SELECT`ed value. Then, you can use the target column name on the left of the equals sign as the column alias. - -If I were building an intermediate transformation from the above query would translate to something along the lines of: - -```sql -SELECT - CASE - WHEN total < 0 THEN 'return' - ELSE type - END AS type, - - order_id, - order_date - -FROM {{ ref('stg_orders') }} -``` - -Since the `UPDATE` statement doesn’t modify every value of the type column, we use a `CASE` statement to apply the contents’ `WHERE` clause. We still want to select all of the columns that should end up in the target table. If we left one of the columns out, it wouldn’t be passed through to the target table at all due to dbt’s declarative approach. - -Sometimes, you may not be sure what all the columns are in a table, or in the situation as above, you’re only modifying a small number of columns relative to the total number of columns in the table. It can be cumbersome to list out every column in the table, but fortunately dbt contains some useful utility macros that can help list out the full column list of a table. - -Another way I could have written the model a bit more dynamically might be: - -```sql -SELECT - {{ dbt_utils.star(from=ref('stg_orders'), except=['type']) }}, - CASE - WHEN total < 0 THEN 'return' - ELSE type - END AS type, - -FROM {{ ref('stg_orders') }} -``` - -The `dbt_utils.star()` macro will print out the full list of columns in the table, but skip the ones I’ve listed in the except list, which allows me to perform the same logic while writing fewer lines of code. This is a simple example of using dbt macros to simplify and shorten your code, and dbt can get a lot more sophisticated as you learn more techniques. Read more about the [dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and the [star macro](https://github.com/dbt-labs/dbt-utils/tree/0.8.6/#star-source). 
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md deleted file mode 100644 index 1a8c6435d42..00000000000 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: Deletes -id: 4-deletes ---- - -One of the biggest differences between a procedural transformation and how dbt models data is that dbt, in general, will never destroy data. While there are ways to execute hard `DELETE`s in dbt that are outside of the scope of this article, the general best practice for handling deleted data is to just use soft deletes, and filter out soft-deleted data in a final transformation. - -Let’s consider a simple example query: - -```sql -DELETE FROM stg_orders WHERE order_status IS NULL -``` - -In a dbt model, you’ll need to first identify the records that should be deleted and then filter them out. There are really two primary ways you might translate this query: - -```sql -SELECT * FROM {{ ref('stg_orders') }} WHERE order_status IS NOT NULL -``` - -This first approach just inverts the logic of the DELETE to describe the set of records that should remain, instead of the set of records that should be removed. This ties back to the way dbt declaratively describes datasets. You reference the data that should be in a dataset, and the table or view gets created with that set of data. - -Another way you could achieve this is by marking the deleted records, and then filtering them out. For example: - -```sql -WITH - -soft_deletes AS ( - - SELECT - *, - CASE - WHEN order_status IS NULL THEN true - ELSE false - END AS to_delete - - FROM {{ ref('stg_orders') }} - -) - -SELECT * FROM soft_deletes WHERE to_delete = false -``` - -This approach flags all of the deleted records, and the final `SELECT` filters out any deleted data, so the resulting table contains only the remaining records. It’s a lot more verbose than just inverting the `DELETE` logic, but for complex `DELETE` logic, this ends up being a very effective way of performing the `DELETE` that retains historical context. - -It’s worth calling out that while this doesn’t enable a hard delete, hard deletes can be executed a number of ways, the most common being to execute a dbt [macros](/docs/build/jinja-macros) via as a [run-operation](https://docs.getdbt.com/reference/commands/run-operation), or by using a [post-hook](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook/) to perform a `DELETE` statement after the records to-be-deleted have been marked. These are advanced approaches outside the scope of this guide. diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md deleted file mode 100644 index d059ab9a258..00000000000 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md +++ /dev/null @@ -1,184 +0,0 @@ ---- -title: Merges -id: 5-merges ---- - -dbt has a concept called [materialization](/docs/build/materializations), which determines how a model is physically or logically represented in the warehouse. `INSERT`s, `UPDATE`s, and `DELETE`s will typically be accomplished using table or view materializations. 
For incremental workloads accomplished via commands like `MERGE` or `UPSERT`, dbt has a particular materialization called [incremental](/docs/build/incremental-models). The incremental materialization is specifically used to handle incremental loads and updates to a table without recreating the entire table from scratch on every run. - -## Step 1: Map the MERGE like an INSERT/UPDATE to start - -Before we get into the exact details of how to implement an incremental materialization, let’s talk about logic conversion. Extracting the logic of the `MERGE` and handling it as you would an `INSERT` or an `UPDATE` is the easiest way to get started migrating a `MERGE` command. . - -To see how the logic conversion works, we’ll start with an example `MERGE`. In this scenario, imagine a ride sharing app where rides are loaded into a details table daily, and tips may be updated at some later date, and need to be kept up-to-date: - -```sql -MERGE INTO ride_details USING ( - SELECT - ride_id, - subtotal, - tip - - FROM rides_to_load AS rtl - - ON ride_details.ride_id = rtl.ride_id - - WHEN MATCHED THEN UPDATE - - SET ride_details.tip = rtl.tip - - WHEN NOT MATCHED THEN INSERT (ride_id, subtotal, tip) - VALUES (rtl.ride_id, rtl.subtotal, NVL(rtl.tip, 0, rtl.tip) -); -``` - -The content of the `USING` clause is a useful piece of code because that can easily be placed in a CTE as a starting point for handling the match statement. I find that the easiest way to break this apart is to treat each match statement as a separate CTE that builds on the previous match statements. - -We can ignore the `ON` clause for now, as that will only come into play once we get to a point where we’re ready to turn this into an incremental. - -As with `UPDATE`s and `INSERT`s, you can use the `SELECT` list and aliases to name columns appropriately for the target table, and `UNION` together `INSERT` statements (taking care to use `UNION`, rather than `UNION ALL` to avoid duplicates). - -The `MERGE` would end up translating to something like this: - -```sql -WITH - -using_clause AS ( - - SELECT - ride_id, - subtotal, - tip - - FROM {{ ref('rides_to_load') }} - -), - -updates AS ( - - SELECT - ride_id, - subtotal, - tip - - FROM using_clause - -), - -inserts AS ( - - SELECT - ride_id, - subtotal, - NVL(tip, 0, tip) - - FROM using_clause - -) - -SELECT * - -FROM updates - -UNION inserts -``` - -To be clear, this transformation isn’t complete. The logic here is similar to the `MERGE`, but will not actually do the same thing, since the updates and inserts CTEs are both selecting from the same source query. We’ll need to ensure we grab the separate sets of data as we transition to the incremental materialization. - -One important caveat is that dbt does not natively support `DELETE` as a `MATCH` action. If you have a line in your `MERGE` statement that uses `WHEN MATCHED THEN DELETE`, you’ll want to treat it like an update and add a soft-delete flag, which is then filtered out in a follow-on transformation. - -### Step 2: Convert to incremental materialization - -As mentioned above, incremental materializations are a little special in that when the target table does not exist, the materialization functions in nearly the same way as a standard table materialization, and executes a `CREATE TABLE AS SELECT` statement. If the target table does exist, however, the materialization instead executes a `MERGE` statement. 
- -Since a `MERGE` requires a `JOIN` condition between the `USING` clause and the target table, we need a way to specify how dbt determines whether or not a record triggers a match or not. That particular piece of information is specified in the dbt model configuration. - -We can add the following `config()` block to the top of our model to specify how it should build incrementally: - -```sql -{{ - config( - materialized='incremental', - unique_key='ride_id', - incremental_strategy='merge' - ) -}} -``` - -The three configuration fields in this example are the most important ones. - -- Setting `materialized='incremental'` tells dbt to apply UPSERT logic to the target table. -- The `unique_key` should be a primary key of the target table. This is used to match records with the existing table. -- `incremental_strategy` here is set to MERGE any existing rows in the target table with a value for the `unique_key` which matches the incoming batch of data. There are [various incremental strategies](/docs/build/incremental-models#about-incremental_strategy) for different situations and warehouses. - -The bulk of the work in converting a model to an incremental materialization comes in determining how the logic should change for incremental loads versus full backfills or initial loads. dbt offers a special macro, `is_incremental()`, which evaluates false for initial loads or for backfills (called full refreshes in dbt parlance), but true for incremental loads. - -This macro can be used to augment the model code to adjust how data is loaded for subsequent loads. How that logic should be added will depend a little bit on how data is received. Some common ways might be: - -1. The source table is truncated ahead of incremental loads, and only contains the data to be loaded in that increment. -2. The source table contains all historical data, and there is a load timestamp column that identifies new data to be loaded. - -In the first case, the work is essentially done already. Since the source table always contains only the new data to be loaded, the query doesn’t have to change for incremental loads. The second case, however, requires the use of the `is_incremental()` macro to correctly handle the logic. - -Taking the converted `MERGE` statement that we’d put together previously, we’d augment it to add this additional logic: - -```sql -WITH - -using_clause AS ( - - SELECT - ride_id, - subtotal, - tip, - max(load_timestamp) as load_timestamp - - FROM {{ ref('rides_to_load') }} - - - {% if is_incremental() %} - - WHERE load_timestamp > (SELECT max(load_timestamp) FROM {{ this }}) - - {% endif %} - -), - -updates AS ( - - SELECT - ride_id, - subtotal, - tip, - load_timestamp - - FROM using_clause - - {% if is_incremental() %} - - WHERE ride_id IN (SELECT ride_id FROM {{ this }}) - - {% endif %} - -), - -inserts AS ( - - SELECT - ride_id, - subtotal, - NVL(tip, 0, tip), - load_timestamp - - FROM using_clause - - WHERE ride_id NOT IN (SELECT ride_id FROM updates) - -) - -SELECT * FROM updates UNION inserts -``` - -There are a couple important concepts to understand here: - -1. The code in the `is_incremental()` conditional block only executes for incremental executions of this model code. If the target table doesn’t exist, or if the `--full-refresh` option is used, that code will not execute. -2. `{{ this }}` is a special keyword in dbt that when used in a Jinja block, self-refers to the model for which the code is executing. 
So if you have a model in a file called `my_incremental_model.sql`, `{{ this }}` will refer to `my_incremental_model` (fully qualified with database and schema name if necessary). By using that keyword, we can leverage the current state of the target table to inform the source query. diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion.md deleted file mode 100644 index 6fddf15c163..00000000000 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -title: Putting it all together -id: 6-migrating-from-stored-procedures-conclusion ---- - -The techniques shared above are useful ways to get started converting the individual DML statements that are often found in stored procedures. Using these types of patterns, legacy procedural code can be rapidly transitioned to dbt models that are much more readable, maintainable, and benefit from software engineering best practices like DRY principles. Additionally, once transformations are rewritten as dbt models, it becomes much easier to test the transformations to ensure that the data being used downstream is high-quality and trustworthy. diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md deleted file mode 100644 index a377554c317..00000000000 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Airflow and dbt Cloud -id: 1-airflow-and-dbt-cloud ---- - -In some cases, [Airflow](https://airflow.apache.org/) may be the preferred orchestrator for your organization over working fully within dbt Cloud. There are a few reasons your team might be considering using Airflow to orchestrate your dbt jobs: - -- Your team is already using Airflow to orchestrate other processes -- Your team needs to ensure that a [dbt job](https://docs.getdbt.com/docs/dbt-cloud/cloud-overview#schedule-and-run-dbt-jobs-in-production) kicks off before or after another process outside of dbt Cloud -- Your team needs flexibility to manage more complex scheduling, such as kicking off one dbt job only after another has completed -- Your team wants to own their own orchestration solution -- You need code to work right now without starting from scratch - -## How are people using Airflow + dbt today? - -### Airflow + dbt Core - -There are so many great examples from Gitlab through their open source data engineering work. Example: [here](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py). This is especially appropriate if you are well-versed in Kubernetes, CI/CD, and docker task management when building your airflow pipelines. 
If this is you and your team, you’re in good hands reading through more details: [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/) - -### Airflow + dbt Cloud API w/Custom Scripts - -This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available: [here](https://registry.astronomer.io/providers/dbt-cloud?type=Sensors&utm_campaign=Monthly%20Product%20Updates&utm_medium=email&_hsmi=208603877&utm_content=208603877&utm_source=hs_email) - -There are many different permutations of this over time: - -- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an airflow DAG based on custom python API utilities [here](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py) -- [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want -- [Other ways to run dbt in airflow](/docs/deploy/deployments#airflow): Official dbt Docs on how teams are running dbt in airflow - -## This guide's process - -These solutions are great, but can be difficult to trust as your team grows and management for things like: testing, job definitions, secrets, and pipelines increase past your team’s capacity. Roles become blurry (or were never clearly defined at the start!). Both data and analytics engineers start digging through custom logging within each other’s workflows to make heads or tails of where and what the issue really is. Not to mention that when the issue is found, it can be even harder to decide on the best path forward for safely implementing fixes. This complex workflow and unclear delineation on process management results in a lot of misunderstandings and wasted time just trying to get the process to work smoothly! - -### A better way - -After today’s walkthrough, you’ll get hands-on experience: - -1. Creating a working local Airflow environment -2. Invoking a dbt Cloud job with Airflow (with proof!) -3. Reusing tested and trusted Airflow code for your specific use cases - -While you’re learning the ropes, you’ll also gain a better understanding of how this helps to: - -- Reduce the cognitive load when building and maintaining pipelines -- Avoid dependency hell (think: `pip install` conflicts) -- Implement better recoveries from failures -- Define clearer workflows so that data and analytics engineers work better, together ♥️ - -### Prerequisites - -- [dbt Cloud Teams or Enterprise account](https://www.getdbt.com/pricing/) (with [admin access](https://docs.getdbt.com/docs/cloud/manage-access/enterprise-permissions)) in order to create a service token. Permissions for service tokens can be found [here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens). -- A [free Docker account](https://hub.docker.com/signup) in order to sign in to Docker Desktop, which will be installed in the initial setup. -- A local digital scratchpad for temporarily copy-pasting API keys and URLs - -🙌 Let’s get started! 
🙌 diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md deleted file mode 100644 index 9c3b8eb7f1b..00000000000 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: Setting up Airflow and dbt Cloud -id: 2-setting-up-airflow-and-dbt-cloud --- - -## 1. Install the Astro CLI - -Astro is a managed software service that includes key features for teams working with Airflow. In order to use Astro, we’ll install the Astro CLI, which will give us access to useful commands for working with Airflow locally. You can read more about Astro [here](https://docs.astronomer.io/astro/). - -In this example, we’re using Homebrew to install the Astro CLI. Follow the instructions to install the Astro CLI for your own operating system [here](https://docs.astronomer.io/astro/install-cli). - -```bash -brew install astro -``` - - - -## 2. Install and start Docker Desktop - -Docker allows us to spin up an environment with all the apps and dependencies we need for the example. - -Follow the instructions [here](https://docs.docker.com/desktop/) to install Docker Desktop for your own operating system. Once Docker is installed, ensure you have it up and running for the next steps. - - - -## 3. Clone the airflow-dbt-cloud repository - -Open your terminal and clone the [airflow-dbt-cloud repository](https://github.com/sungchun12/airflow-dbt-cloud.git). This contains example Airflow DAGs that you’ll use to orchestrate your dbt Cloud job. Once cloned, navigate into the `airflow-dbt-cloud` project. - -```bash -git clone https://github.com/sungchun12/airflow-dbt-cloud.git -cd airflow-dbt-cloud -``` - - - -## 4. Start the Docker container - -You can initialize an Astronomer project in an empty local directory using a Docker container, and then run your project locally using the `start` command. - -1. Run the following commands to initialize your project and start your local Airflow deployment: - - ```bash - astro dev init - astro dev start - ``` - - When this finishes, you should see a message similar to the following: - - ```bash - Airflow is starting up! This might take a few minutes… - - Project is running! All components are now available. - - Airflow Webserver: http://localhost:8080 - Postgres Database: localhost:5432/postgres - The default Airflow UI credentials are: admin:admin - The default Postgres DB credentials are: postgres:postgres - ``` - -2. Open the Airflow interface. Launch your web browser and navigate to the address for the **Airflow Webserver** from your output in Step 1. - - This will take you to your local instance of Airflow. You’ll need to log in with the **default credentials**: - - - Username: admin - - Password: admin - - ![Airflow login screen](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-login.png) - - - -## 5. Create a dbt Cloud service token - -Create a service token from within dbt Cloud using the instructions [found here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Ensure that you save a copy of the token, as you won’t be able to access this later. In this example we use `Account Admin`, but you can also use `Job Admin` instead for token permissions. - - - -## 6. Create a dbt Cloud job - -In your dbt Cloud account, create a job, paying special attention to the information in the bullets below.
Additional information for creating a dbt Cloud job can be found [here](/quickstarts/bigquery). - -- Configure the job with the commands that you want to include when this job kicks off, as Airflow will be referring to the job’s configurations for this rather than being explicitly coded in the Airflow DAG. This job will run a set of commands rather than a single command. -- Ensure that the schedule is turned **off** since we’ll be using Airflow to kick things off. -- Once you hit `save` on the job, make sure you copy the URL and save it for referencing later. The url will look similar to this: - -```html -https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/ -``` - - diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud.md deleted file mode 100644 index d6fd32bdba9..00000000000 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: Running Airflow and dbt Cloud -id: 3-running-airflow-and-dbt-cloud ---- - - - -Now you have all the working pieces to get up and running with Airflow + dbt Cloud. Let’s dive into make this all work together. We will **set up a connection** and **run a DAG in Airflow** that kicks off a dbt Cloud job. - -## 1. Add your dbt Cloud API token as a secure connection - -1. Navigate to Admin and click on **Connections** - - ![Airflow connections menu](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-connections-menu.png) - -2. Click on the `+` sign to add a new connection, then click on the drop down to search for the dbt Cloud Connection Type - - ![Create connection](/img/guides/orchestration/airflow-and-dbt-cloud/create-connection.png) - - ![Connection type](/img/guides/orchestration/airflow-and-dbt-cloud/connection-type.png) - -3. Add in your connection details and your default dbt Cloud account id. This is found in your dbt Cloud URL after the accounts route section (`/accounts/{YOUR_ACCOUNT_ID}`), for example the account with id 16173 would see this in their URL: `https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/` - -![https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2](https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2) - -## 2. Add your `job_id` and `account_id` config details to the python file: [dbt_cloud_provider_eltml.py](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/dags/dbt_cloud_provider_eltml.py) - -1. You’ll find these details within the dbt Cloud job URL, see the comments in the code snippet below for an example. - - ```python - # dbt Cloud Job URL: https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/ - # account_id: 16173 - #job_id: 65767 - - # line 28 - default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173}, - - trigger_dbt_cloud_job_run = DbtCloudRunJobOperator( - task_id="trigger_dbt_cloud_job_run", - job_id=65767, # line 39 - check_interval=10, - timeout=300, - ) - ``` - -2. Turn on the DAG and verify the job succeeded after running. Note: screenshots taken from different job runs, but the user experience is consistent. 
- - ![https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy](https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy) - - ![Airflow DAG](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-dag.png) - - ![Task run instance](/img/guides/orchestration/airflow-and-dbt-cloud/task-run-instance.png) - - ![https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS](https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS) - -## How do I rerun the dbt Cloud job and downstream tasks in my pipeline? - -If you have worked with dbt Cloud before, you have likely encountered cases where a job fails. In those cases, you have likely logged into dbt Cloud, investigated the error, and then manually restarted the job. - -This section of the guide will show you how to restart the job directly from Airflow. This will specifically run *just* the `trigger_dbt_cloud_job_run` and downstream tasks of the Airflow DAG and not the entire DAG. If only the transformation step fails, you don’t need to re-run the extract and load processes. Let’s jump into how to do that in Airflow. - -1. Click on the task - - ![Task DAG view](/img/guides/orchestration/airflow-and-dbt-cloud/task-dag-view.png) - -2. Clear the task instance - - ![Clear task instance](/img/guides/orchestration/airflow-and-dbt-cloud/clear-task-instance.png) - - ![Approve clearing](/img/guides/orchestration/airflow-and-dbt-cloud/approve-clearing.png) - -3. Watch it rerun in real time - - ![Re-run](/img/guides/orchestration/airflow-and-dbt-cloud/re-run.png) - -## Cleaning up - -At the end of this guide, make sure you shut down your docker container. When you’re done using Airflow, use the following command to stop the container: - -```bash -$ astrocloud dev stop - -[+] Running 3/3 - ⠿ Container airflow-dbt-cloud_e3fe3c-webserver-1 Stopped 7.5s - ⠿ Container airflow-dbt-cloud_e3fe3c-scheduler-1 Stopped 3.3s - ⠿ Container airflow-dbt-cloud_e3fe3c-postgres-1 Stopped 0.3s -``` - -To verify that the deployment has stopped, use the following command: - -```bash -astrocloud dev ps -``` - -This should give you an output like this: - -```bash -Name State Ports -airflow-dbt-cloud_e3fe3c-webserver-1 exited -airflow-dbt-cloud_e3fe3c-scheduler-1 exited -airflow-dbt-cloud_e3fe3c-postgres-1 exited -``` - - diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md deleted file mode 100644 index 5766d8c0b79..00000000000 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: Airflow and dbt Cloud FAQs -id: 4-airflow-and-dbt-cloud-faqs ---- -## 1. How can we run specific subsections of the dbt DAG in Airflow? - -Because of the way we configured the dbt Cloud job to run in Airflow, you can leave this job to your analytics engineers to define in the job configurations from dbt Cloud. 
If, for example, we need to run hourly-tagged models every hour and daily-tagged models daily, we can create jobs like `Hourly Run` or `Daily Run` and utilize the commands `dbt run -s tag:hourly` and `dbt run -s tag:daily` within each, respectively. We only need to grab our dbt Cloud `account` and `job id`, configure it in an Airflow DAG with the code provided, and then we can be on our way. See more node selection options: [here](/reference/node-selection/syntax) - -## 2. How can I re-run models from the point of failure? - -You may want to parse the dbt DAG in Airflow to get the benefit of re-running from the point of failure. However, when you have hundreds of models in your DAG expanded out, it becomes useless for diagnosis and rerunning due to the overhead that comes along with creating an expansive Airflow DAG. - -You can’t re-run from failure natively in dbt Cloud today (feature coming!), but you can use a custom rerun parser. - -Using a simple Python script coupled with the dbt Cloud provider, you can: - -- Avoid managing artifacts in a separate storage bucket (dbt Cloud does this for you) -- Avoid building your own parsing logic -- Get clear logs on what models you're rerunning in dbt Cloud (without hard coding step override commands) - -Watch the video below to see how it works! - - - -## 3. Should Airflow run one big dbt job or many dbt jobs? - -Overall we recommend being as purposeful and minimalistic as you can. This is because dbt manages all of the dependencies between models and the orchestration of running those dependencies in order, which in turn has benefits in terms of warehouse processing efforts. - -## 4. We want to kick off our dbt jobs after our ingestion tool (such as Fivetran) / data pipelines are done loading data. Any best practices around that? - -Our friends at Astronomer answer this question with this example: [here](https://registry.astronomer.io/dags/fivetran-dbt-cloud-census) - -## 5. How do you set up a CI/CD workflow with Airflow? - -Check out these two resources for accomplishing your own CI/CD pipeline: - -- [Continuous Integration with dbt Cloud](/docs/deploy/continuous-integration) -- [Astronomer's CI/CD Example](https://docs.astronomer.io/software/ci-cd/#example-cicd-workflow) - -## 6. Can dbt dynamically create tasks in the DAG like Airflow can? - -We prefer to keep models bundled vs. unbundled. You can go this route, but if you have hundreds of dbt models, it’s more effective to let the dbt Cloud job handle the models and dependencies. Bundling provides the solution to clear observability when things go wrong - we've seen more success in having the ability to clearly see issues in a bundled dbt Cloud job than combing through the nodes of an expansive Airflow DAG. If you still have a use case for this level of control though, our friends at Astronomer answer this question [here](https://www.astronomer.io/blog/airflow-dbt-1/)! - -## 7. Can you trigger notifications if a dbt job fails with Airflow? Is there any way to access the status of the dbt Job to do that? - -Yes, either through [Airflow's email/Slack](https://www.astronomer.io/guides/error-notifications-in-airflow/) functionality by itself or combined with [dbt Cloud's notifications](/docs/deploy/job-notifications), which support email and Slack notifications. - -## 8. Are there decision criteria for how to best work with dbt Cloud and Airflow? - -Check out this deep dive into planning your dbt Cloud + Airflow implementation [here](https://www.youtube.com/watch?v=n7IIThR8hGk)!
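To tie these answers together, here is a minimal sketch of an Airflow DAG that triggers one bundled dbt Cloud job on a schedule. It reuses the `dbt_cloud` connection, account ID, and job ID from the example earlier in this guide; the DAG name, the daily schedule, and the assumption that the job is the tag-based `Daily Run` from question 1 are illustrative only.

```python
from datetime import datetime

from airflow import DAG
from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator

with DAG(
    dag_id="daily_dbt_cloud_run",  # illustrative name
    start_date=datetime(2023, 1, 1),
    schedule_interval="@daily",
    catchup=False,
    # Same connection and account ID used in the dbt_cloud_provider_eltml.py example
    default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173},
) as dag:
    trigger_daily_run = DbtCloudRunJobOperator(
        task_id="trigger_daily_run",
        job_id=65767,       # the dbt Cloud job configured with `dbt run -s tag:daily`
        check_interval=10,  # poll the run status every 10 seconds
        timeout=300,        # fail the task if the run is not finished in 5 minutes
    )
```

Because the job's commands live in dbt Cloud, analytics engineers can change what the job runs without touching this DAG.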
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md b/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md deleted file mode 100644 index 048fe637de0..00000000000 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: Customizing CI/CD -id: 1-cicd-background ---- - -# Creating Custom CI/CD Pipelines - -One of the core tenets of dbt is that analytic code should be version controlled. This provides a ton of benefit to your organization in terms of collaboration, code consistency, stability, and the ability to roll back to a prior version. There’s an additional benefit that is provided with your code hosting platform that is often overlooked or underutilized. Some of you may have experience using dbt Cloud’s [webhook functionality](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) to run a job when a PR is created. This is a fantastic capability, and meets most use cases for testing your code before merging to production. However, there are circumstances when an organization needs additional functionality, like running workflows on every commit (linting), or running workflows after a merge is complete. In this article, we will show you how to setup custom pipelines to lint your project and trigger a dbt Cloud job via the API. - -A note on parlance in this article since each code hosting platform uses different terms for similar concepts. The terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch. - - -## What are pipelines? - -Pipelines (which are known by many names, such as workflows, actions, or build steps) are a series of pre-defined jobs that are triggered by specific events in your repository (PR created, commit pushed, branch merged, etc). Those jobs can do pretty much anything your heart desires assuming you have the proper security access and coding chops. - -Jobs are executed on [runners](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#runners), which are virtual servers. The runners come pre-configured with Ubuntu Linux, macOS, or Windows. That means the commands you execute are determined by the operating system of your runner. You’ll see how this comes into play later in the setup, but for now just remember that your code is executed on virtual servers that are, typically, hosted by the code hosting platform. - -![Diagram of how pipelines work](/img/guides/orchestration/custom-cicd-pipelines/pipeline-diagram.png) - -Please note, runners hosted by your code hosting platform provide a certain amount of free time. After that, billing charges may apply depending on how your account is setup. You also have the ability to host your own runners. 
That is beyond the scope of this article, but checkout the links below for more information if you’re interested in setting that up: - -- Repo-hosted runner billing information: - - [GitHub](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions) - - [GitLab](https://docs.gitlab.com/ee/ci/pipelines/cicd_minutes.html) - - [Bitbucket](https://bitbucket.org/product/features/pipelines#) -- Self-hosted runner information: - - [GitHub](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners) - - [GitLab](https://docs.gitlab.com/runner/) - - [Bitbucket](https://support.atlassian.com/bitbucket-cloud/docs/runners/) - -Additionally, if you’re using the free tier of GitLab you can still follow this guide, but it may ask you to provide a credit card to verify your account. You’ll see something like this the first time you try to run a pipeline: - -![Warning from GitLab showing payment information is required](/img/guides/orchestration/custom-cicd-pipelines/gitlab-cicd-payment-warning.png) - - -## How to setup pipelines - -This guide provides details for multiple code hosting platforms. Where steps are unique, they are presented without a selection option. If code is specific to a platform (i.e. GitHub, GitLab, Bitbucket) you will see a selection option for each. - -Pipelines can be triggered by various events. The [dbt Cloud webhook](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) process already triggers a run if you want to run your jobs on a merge request, so this guide focuses on running pipelines for every push and when PRs are merged. Since pushes happen frequently in a project, we’ll keep this job super simple and fast by linting with SQLFluff. The pipeline that runs on merge requests will run less frequently, and can be used to call the dbt Cloud API to trigger a specific job. This can be helpful if you have specific requirements that need to happen when code is updated in production, like running a `--full-refresh` on all impacted incremental models. - -Here’s a quick look at what this pipeline will accomplish: - -![Diagram showing the pipelines to be created and the programs involved](/img/guides/orchestration/custom-cicd-pipelines/pipeline-programs-diagram.png) diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md b/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md deleted file mode 100644 index 465994e4442..00000000000 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md +++ /dev/null @@ -1,191 +0,0 @@ ---- -title: Lint code on push -id: 2-lint-on-push ---- - -This section shows a very basic example of linting a project every time a commit is pushed to the repo. While it is simple, it shows the power of CI and can be expanded on to meet the needs of your organization. - -The steps below use [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. In the example, it's set to use the `snowflake` dialect, and specifically runs the rules L019, L020, L021, and L022. This is purely for demonstration purposes. You should update this to reflect your code base's [dialect](https://docs.sqlfluff.com/en/stable/dialects.html) and the [rules](https://docs.sqlfluff.com/en/stable/rules.html) you've established for your repo. - -### 1. 
Create a YAML file to define your pipeline - -The YAML files defined below are what tell your code hosting platform the steps to run. In this setup, you’re telling the platform to run a SQLFluff lint job every time a commit is pushed. - - - - -In order for GitHub to know that you want to run an action, you need to have a few specific folders in your project. Add a new folder named `.github`, and within that folder add a new one named `workflows`. Your final folder structure will look like this: - -```sql -my_awesome_project -├── .github -│ ├── workflows -│ │ └── lint_on_push.yml -``` - -To define the job for our action, let’s add a new file named `lint_on_push.yml` under the `workflows` folder. This file is how we tell the GitHub runner what to execute when the job is triggered. - -Below I touch on the important pieces for running a dbt Cloud job, but if you want a full run-down of all the components of this YAML file checkout [this GitHub article](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on actions. - -**Key pieces:** - -- `on:` - this is used to filter when the pipeline is run. In this example we’re running it on every push except for pushes to branches named `main`. For more filters, checkout [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). -- `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job -- `uses:` - remember the virtual servers we covered in the background section? They’re just empty operating systems, so there are two pieces of setup that are needed in order to access the code in your repo, and setup Python correctly on the virtual server. These two actions are called from other repos in GitHub to provide those services. For more information on them, checkout their repos: [actions/checkout](https://github.com/actions/checkout#checkout-v3) and [actions/setup-python](https://github.com/actions/setup-python#setup-python-v3). -- `run:` - this is how we’re telling the GitHub runner to execute the Python script we defined above. - -```yaml -name: lint dbt project on push - -on: - push: - branches-ignore: - - 'main' - -jobs: -# this job runs SQLFluff with a specific set of rules - # note the dialect is set to Snowflake, so make that specific to your setup - # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html - lint_project: - name: Run SQLFluff linter - runs-on: ubuntu-latest - - steps: - - uses: "actions/checkout@v3" - - uses: "actions/setup-python@v4" - with: - python-version: "3.9" - - name: Install SQLFluff - run: "pip install sqlfluff==0.13.1" - - name: Lint project - run: "sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022" - -``` - - - - -Create a `.gitlab-ci.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file. - -```sql -my_awesome_project -├── dbt_project.yml -├── .gitlab-ci.yml -``` - -**Key pieces:** - -- `image: python:3.9` - this defines the virtual image we’re using to run the job -- `rules:` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event except when the branch is named `main`. Filters are very powerful to run commands on specific events, and you can find a full list in [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules). 
-- `script:` - this is how we’re telling the GitLab runner to execute the Python script we defined above. - -```yaml -image: python:3.9 - -stages: - - pre-build - -# this job runs SQLFluff with a specific set of rules -# note the dialect is set to Snowflake, so make that specific to your setup -# details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html -lint-project: - stage: pre-build - rules: - - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main' - script: - - pip install sqlfluff==0.13.1 - - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 -``` - - - - -Create a `bitbucket-pipelines.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file. - -```sql -my_awesome_project -├── bitbucket-pipelines.yml -├── dbt_project.yml -``` - -**Key pieces:** - -- `image: python:3.11.1` - this defines the virtual image we’re using to run the job -- `'**':` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `master`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/) -- `script:` - this is how we’re telling the Bitbucket runner to execute the Python script we defined above. - -```yaml -image: python:3.11.1 - - -pipelines: - branches: - '**': # this sets a wildcard to run on every branch - - step: - name: Lint dbt project - script: - - pip install sqlfluff==0.13.1 - - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 - - 'master': # override if your default branch doesn't run on a branch named "master" - - step: - script: - - python --version -``` - - - - -### 2. Commit and push your changes to make sure everything works - -After you finish creating the YAML files, commit and push your code. Doing this will trigger your pipeline for the first time! If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run. 
- - - - -In your repository, click the *Actions* tab - -![Image showing the GitHub action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-github.png) - -Sample output from SQLFluff in the `Run SQLFluff linter` job: - -![Image showing the logs in GitHub for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-github.png) - - - - -In the menu option go to *CI/CD > Pipelines* - -![Image showing the GitLab action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-gitlab.png) - -Sample output from SQLFluff in the `Run SQLFluff linter` job: - -![Image showing the logs in GitLab for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-gitlab.png) - - - - -In the left menu pane, click on *Pipelines* - -![Image showing the Bitbucket action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png) - -Sample output from SQLFluff in the `Run SQLFluff linter` job: - -![Image showing the logs in Bitbucket for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png) - - - diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md b/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md deleted file mode 100644 index 8a6f8965b87..00000000000 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md +++ /dev/null @@ -1,131 +0,0 @@ ---- -title: Run a dbt Cloud job on pull request -id: 4-dbt-cloud-job-on-pr ---- - -:::info Run on PR - -If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/slim-ci-jobs). -This section is only for those projects that connect to their git repository using an SSH key. - -::: - -If your git provider is not one with a native integration with dbt Cloud, but you still want to take advantage of Slim CI builds, you've come to the right spot! With just a bit of work it's possible to setup a job that will run a dbt Cloud job when a pull request (PR) is created. - -The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-3 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)** - -### 4. Create a pipeline job that runs when PRs are created - - - -For this job, we'll set it up using the `bitbucket-pipelines.yml` file as in the prior step. The YAML file will look pretty similar to our earlier job, but we’ll pass in the required variables to the Python script using `export` statements. Update this section to match your setup based on the comments in the file. - -**What is this pipeline going to do?** -The setup below will trigger a dbt Cloud job to run every time a PR is opened in this repository. It will also run a fresh version of the pipeline for every commit that is made on the PR until it is merged. -For example: If you open a PR, it will run the pipeline. If you then decide additional changes are needed, and commit/push to the PR branch, a new pipeline will run with the updated code. 
- -The following variables control this job: - - `DBT_JOB_BRANCH`: Tells the dbt Cloud job to run the code in the branch that created this PR - - `DBT_JOB_SCHEMA_OVERRIDE`: Tells the dbt Cloud job to run this into a custom target schema - - The format of this will look like: `DBT_CLOUD_PR_{REPO_KEY}_{PR_NUMBER}` - - -```yaml -image: python:3.11.1 - - -pipelines: - # This job will run when pull requests are created in the repository - pull-requests: - '**': - - step: - name: 'Run dbt Cloud PR Job' - script: - # Check to only build if PR destination is master (or other branch). - # Comment or remove line below if you want to run on all PRs regardless of destination branch. - - if [ "${BITBUCKET_PR_DESTINATION_BRANCH}" != "main" ]; then printf 'PR Destination is not master, exiting.'; exit; fi - - export DBT_URL="https://cloud.getdbt.com" - - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job" - - export DBT_JOB_BRANCH=$BITBUCKET_BRANCH - - export DBT_JOB_SCHEMA_OVERRIDE="DBT_CLOUD_PR_"$BITBUCKET_PROJECT_KEY"_"$BITBUCKET_PR_ID - - export DBT_ACCOUNT_ID=00000 # enter your account id here - - export DBT_PROJECT_ID=00000 # enter your project id here - - export DBT_PR_JOB_ID=00000 # enter your job id here - - python python/run_and_monitor_dbt_job.py -``` - - - - -### 5. Confirm the pipeline runs - -Now that you have a new pipeline, it's time to run it and make sure it works. Since this only triggers when a PR is created, you'll need to create a new PR on a branch that contains the code above. Once you do that, you should see a pipeline that looks like this: - - - - -Bitbucket pipeline: -![dbt run on PR job in Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png) - -dbt Cloud job: -![dbt Cloud job showing it was triggered by Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png) - - - - -### 6. Handle those extra schemas in your database - -As noted above, when the PR job runs it will create a new schema based on the PR. To avoid having your database overwhelmed with PR schemas, consider adding a "cleanup" job to your dbt Cloud account. This job can run on a scheduled basis to clean up any PR schemas that haven't been updated/used recently. - -Add this as a macro to your project.
It takes 2 arguments that let you control which schemas get dropped: - - `age_in_days`: The number of days since the schema was last altered before it should be dropped (default 10 days) - - `database_to_clean`: The name of the database to remove schemas from - -```sql -{# - This macro finds PR schemas older than a set date and drops them - The macro defaults to 10 days old, but can be configured with the input argument age_in_days - Sample usage with different date: - dbt run-operation pr_schema_cleanup --args "{'database_to_clean': 'analytics','age_in_days':'15'}" -#} -{% macro pr_schema_cleanup(database_to_clean, age_in_days=10) %} - - {% set find_old_schemas %} - select - 'drop schema {{ database_to_clean }}.'||schema_name||';' - from {{ database_to_clean }}.information_schema.schemata - where - catalog_name = '{{ database_to_clean | upper }}' - and schema_name ilike 'DBT_CLOUD_PR%' - and last_altered <= (current_date() - interval '{{ age_in_days }} days') - {% endset %} - - {% if execute %} - - {{ log('Schema drop statements:' ,True) }} - - {% set schema_drop_list = run_query(find_old_schemas).columns[0].values() %} - - {% for schema_to_drop in schema_drop_list %} - {% do run_query(schema_to_drop) %} - {{ log(schema_to_drop ,True) }} - {% endfor %} - - {% endif %} - -{% endmacro %} -``` - -This macro goes into a dbt Cloud job that is run on a schedule. The command will look like this (text below for copy/paste): -![dbt Cloud job showing the run operation command for the cleanup macro](/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png) -`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"` \ No newline at end of file diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md deleted file mode 100644 index 6b39c5ce405..00000000000 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Something to Consider -id: 5-something-to-consider --- - -Running dbt Cloud jobs through a CI/CD pipeline is a form of job orchestration. If you also run jobs using dbt Cloud’s built-in scheduler, you now have 2 orchestration tools running jobs. The risk with this is that you could run into conflicts - if you are triggering a pipeline on certain actions while also running scheduled jobs in dbt Cloud, you will probably run into job clashes. The more tools you have, the more you have to make sure everything talks to each other.
\ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md b/website/docs/guides/productionize-your-dbt-databricks-project.md similarity index 83% rename from website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md rename to website/docs/guides/productionize-your-dbt-databricks-project.md index 5da8cc6616b..b95d8ffd2dd 100644 --- a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md +++ b/website/docs/guides/productionize-your-dbt-databricks-project.md @@ -1,19 +1,27 @@ --- -title: Productionizing your dbt Databricks project -id: "productionizing-your-dbt-databricks-project" -sidebar_label: "Productionizing your dbt Databricks project" -description: "Learn how to deliver models to end users and use best practices to maintain production data" +title: Productionize your dbt Databricks project +id: productionize-your-dbt-databricks-project +description: "Learn how to deliver models to end users and use best practices to maintain production data." +displayText: Productionize your dbt Databricks project +hoverSnippet: Learn how to Productionize your dbt Databricks project. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'databricks' +hide_table_of_contents: true +tags: ['Databricks','dbt Core','dbt Cloud'] +level: 'Intermediate' +recently_updated: true --- +## Introduction Welcome to the third installment of our comprehensive series on optimizing and deploying your data pipelines using Databricks and dbt Cloud. In this guide, we'll dive into delivering these models to end users while incorporating best practices to ensure that your production data remains reliable and timely. -## Prerequisites +### Prerequisites -If you don't have any of the following requirements, refer to the instructions in the [setup guide](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project) to catch up: +If you don't have any of the following requirements, refer to the instructions in the [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project) for help meeting these requirements: -- You have [set up your Databricks and dbt Cloud environments](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project). -- You have [optimized your dbt models for peak performance](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks). +- You have [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project). +- You have [optimized your dbt models for peak performance](/guides/optimize-dbt-models-on-databricks). - You have created two catalogs in Databricks: *dev* and *prod*. - You have created Databricks Service Principal to run your production jobs. - You have at least one [deployment environment](/docs/deploy/deploy-environments) in dbt Cloud. @@ -35,16 +43,16 @@ Each dbt Cloud project can have multiple deployment environments, but only one d With your deployment environment set up, it's time to create a production job to run in your *prod* environment. -To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/dbt-cloud-job). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently. 
+To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/deploy-jobs). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently. Leveraging dbt Cloud's job scheduler allows data teams to own the entire transformation workflow. You don't need to learn and maintain additional tools for orchestration or rely on another team to schedule code written by your team. This end-to-end ownership simplifies the deployment process and accelerates the delivery of new data products. -Let’s [create a job](/docs/deploy/dbt-cloud-job#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog. +Let’s [create a job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog. 1. Create a new job by clicking **Deploy** in the header, click **Jobs** and then **Create job**. 2. **Name** the job “Daily refresh”. 3. Set the **Environment** to your *production* environment. - - This will allow the job to inherit the catalog, schema, credentials, and environment variables defined in the [setup guide](https://docs.getdbt.com/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project#defining-your-dbt-deployment-environment). + - This will allow the job to inherit the catalog, schema, credentials, and environment variables defined in [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project). 4. Under **Execution Settings** - Check the **Generate docs on run** checkbox to configure the job to automatically generate project docs each time this job runs. This will ensure your documentation stays evergreen as models are added and modified. - Select the **Run on source freshness** checkbox to configure dbt [source freshness](/docs/deploy/source-freshness) as the first step of this job. Your sources will need to be configured to [snapshot freshness information](/docs/build/sources#snapshotting-source-data-freshness) for this to drive meaningful insights. @@ -58,7 +66,7 @@ Let’s [create a job](/docs/deploy/dbt-cloud-job#create-and-schedule-jobs) in d - dbt build is more efficient than issuing separate commands for dbt run and dbt test separately because it will run then test each model before continuing. - We are excluding source data because we already tested it in step 2. - The fail-fast flag will make dbt exit immediately if a single resource fails to build. If other models are in-progress when the first model fails, then dbt will terminate the connections for these still-running models. -5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/job-triggers). You can enter specific days and timing or create a custom cron schedule. +5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/deploy-jobs#schedule-days). You can enter specific days and timing or create a custom cron schedule. - If you want your dbt Cloud job scheduled by another orchestrator, like Databricks Workflows, see the [Advanced Considerations](#advanced-considerations) section below. This is just one example of an all-or-nothing command list designed to minimize wasted computing. The [job command list](/docs/deploy/job-commands) and [selectors](/reference/node-selection/syntax) provide a lot of flexibility on how your DAG will execute. 
You may want to design yours to continue running certain models if others fail. You may want to set up multiple jobs to refresh models at different frequencies. See our [Job Creation Best Practices discourse](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) for more job design suggestions. @@ -67,7 +75,7 @@ After your job is set up and runs successfully, configure your **[project artifa This will be our main production job to refresh data that will be used by end users. Another job everyone should include in their dbt project is a continuous integration job. -### Add a CI job +## Add a CI job CI/CD, or Continuous Integration and Continuous Deployment/Delivery, has become a standard practice in software development for rapidly delivering new features and bug fixes while maintaining high quality and stability. dbt Cloud enables you to apply these practices to your data transformations. @@ -79,21 +87,21 @@ dbt allows you to write [tests](/docs/build/tests) for your data pipeline, which 2. **Development**: Running tests during development ensures that your code changes do not break existing assumptions, enabling developers to iterate faster by catching problems immediately after writing code. 3. **CI checks**: Automated CI jobs run and test your pipeline end-to end when a pull request is created, providing confidence to developers, code reviewers, and end users that the proposed changes are reliable and will not cause disruptions or data quality issues -Your CI job will ensure that the models build properly and pass any tests applied to them. We recommend creating a separate *test* environment and having a dedicated service principal. This will ensure the temporary schemas created during CI tests are in their own catalog and cannot unintentionally expose data to other users. Repeat the [steps](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project) used to create your *prod* environment to create a *test* environment. After setup, you should have: +Your CI job will ensure that the models build properly and pass any tests applied to them. We recommend creating a separate *test* environment and having a dedicated service principal. This will ensure the temporary schemas created during CI tests are in their own catalog and cannot unintentionally expose data to other users. Repeat the steps in [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project) to create your *prod* environment to create a *test* environment. After setup, you should have: - A catalog called *test* - A service principal called *dbt_test_sp* - A new dbt Cloud environment called *test* that defaults to the *test* catalog and uses the *dbt_test_sp* token in the deployment credentials -We recommend setting up a dbt Cloud Slim CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. To create a Slim CI job, refer to [Set up Slim CI jobs](/docs/deploy/slim-ci-jobs) for details. +We recommend setting up a dbt Cloud CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. To create a CI job, refer to [Set up CI jobs](/docs/deploy/ci-jobs) for details. With dbt tests and SlimCI, you can feel confident that your production data will be timely and accurate even while delivering at high velocity. 
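To make the testing piece concrete, here is a minimal sketch of the kind of generic tests a CI job exercises when it runs `dbt build` against modified models. The file path, model, and column names are hypothetical; adapt them to your project.

```yaml
# models/marts/core.yml (hypothetical path, model, and column names)
version: 2

models:
  - name: fct_orders
    columns:
      - name: order_id
        tests:
          - unique
          - not_null
      - name: customer_id
        tests:
          - not_null
          - relationships:
              to: ref('dim_customers')
              field: customer_id
```

If any of these tests fail during the CI run, the pull request is flagged before the change can reach the *prod* catalog.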
-### Monitor your jobs +## Monitor your jobs Keeping a close eye on your dbt Cloud jobs is crucial for maintaining a robust and efficient data pipeline. By monitoring job performance and quickly identifying potential issues, you can ensure that your data transformations run smoothly. dbt Cloud provides three entry points to monitor the health of your project: run history, deployment monitor, and status tiles. -The [run history](/docs/deploy/dbt-cloud-job) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu. +The [run history](/docs/deploy/run-visibility#run-history) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu. The deployment monitor in dbt Cloud offers a higher-level view of your run history, enabling you to gauge the health of your data pipeline over an extended period of time. This feature includes information on run durations and success rates, allowing you to identify trends in job performance, such as increasing run times or more frequent failures. The deployment monitor also highlights jobs in progress, queued, and recent failures. To access the deployment monitor click on the dbt logo in the top left corner of the dbt Cloud UI. @@ -101,7 +109,7 @@ The deployment monitor in dbt Cloud offers a higher-level view of your run histo By adding [status tiles](/docs/deploy/dashboard-status-tiles) to your BI dashboards, you can give stakeholders visibility into the health of your data pipeline without leaving their preferred interface. Status tiles instill confidence in your data and help prevent unnecessary inquiries or context switching. To implement dashboard status tiles, you'll need to have dbt docs with [exposures](/docs/build/exposures) defined. -### Notifications +## Set up notifications Setting up [notifications](/docs/deploy/job-notifications) in dbt Cloud allows you to receive alerts via email or a Slack channel whenever a run ends. This ensures that the appropriate teams are notified and can take action promptly when jobs fail or are canceled. To set up notifications: @@ -109,9 +117,9 @@ Setting up [notifications](/docs/deploy/job-notifications) in dbt Cloud allows y 2. Select the **Notifications** tab. 3. Choose the desired notification type (Email or Slack) and configure the relevant settings. -If you require notifications through other means than email or Slack, you can use dbt Cloud's outbound [webhooks](/docs/deploy/webhooks) feature to relay job events to other tools. Webhooks enable you to [integrate dbt Cloud with a wide range of SaaS applications](/guides/orchestration/webhooks), extending your pipeline’s automation into other systems. +If you require notifications through other means than email or Slack, you can use dbt Cloud's outbound [webhooks](/docs/deploy/webhooks) feature to relay job events to other tools. Webhooks enable you to integrate dbt Cloud with a wide range of SaaS applications, extending your pipeline’s automation into other systems. 
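As one illustration of relaying job events to another system, here is a minimal sketch of a webhook receiver. It assumes a small Flask app and follows the verification pattern from the dbt Cloud webhooks documentation as we understand it (the `Authorization` header carries the hex-encoded HMAC-SHA256 of the raw request body, keyed with the webhook secret); confirm the header and payload field names against the current webhooks docs before relying on them.

```python
import hashlib
import hmac
import os

from flask import Flask, abort, request

app = Flask(__name__)

# Secret shown once when the webhook subscription is created in dbt Cloud.
WEBHOOK_SECRET = os.environ["DBT_CLOUD_WEBHOOK_SECRET"].encode("utf-8")


@app.route("/dbt-cloud-events", methods=["POST"])
def handle_event():
    # Recompute the signature over the raw body and compare it to the header.
    expected = hmac.new(WEBHOOK_SECRET, request.get_data(), hashlib.sha256).hexdigest()
    if not hmac.compare_digest(expected, request.headers.get("Authorization", "")):
        abort(403)

    payload = request.get_json(force=True)
    # `data.runStatus` is the field name we expect in the payload; adjust if yours differs.
    if payload.get("data", {}).get("runStatus") == "Errored":
        pass  # open a ticket, page on-call, post to another tool, and so on
    return {"ok": True}
```

This keeps the notification logic outside dbt Cloud while still reacting to the same job events that drive email and Slack alerts.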
-### Troubleshooting +## Troubleshooting When a disruption occurs in your production pipeline, it's essential to know how to troubleshoot issues effectively to minimize downtime and maintain a high degree of trust with your stakeholders. @@ -121,15 +129,14 @@ The five key steps for troubleshooting dbt Cloud issues are: 2. Inspect the problematic file and look for an immediate fix. 3. Isolate the problem by running one model at a time in the IDE or undoing the code that caused the issue. 4. Check for problems in compiled files and logs. -5. Seek help from the [dbt Cloud support team](/docs/dbt-support) if needed. -Consult the [Debugging errors documentation](/guides/best-practices/debugging-errors) for a comprehensive list of error types and diagnostic methods. +Consult the [Debugging errors documentation](/guides/debug-errors) for a comprehensive list of error types and diagnostic methods. To troubleshoot issues with a dbt Cloud job, navigate to the "Deploy > Run History" tab in your dbt Cloud project and select the failed run. Then, expand the run steps to view [console and debug logs](/docs/deploy/run-visibility#access-logs) to review the detailed log messages. To obtain additional information, open the Artifacts tab and download the compiled files associated with the run. If your jobs are taking longer than expected, use the [model timing](/docs/deploy/run-visibility#model-timing) dashboard to identify bottlenecks in your pipeline. Analyzing the time taken for each model execution helps you pinpoint the slowest components and optimize them for better performance. The Databricks [Query History](https://docs.databricks.com/sql/admin/query-history.html) lets you inspect granular details such as time spent in each task, rows returned, I/O performance, and execution plan. -For more on performance tuning, see our guide on [How to Optimize and Troubleshoot dbt Models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks). +For more on performance tuning, see our guide on [How to Optimize and Troubleshoot dbt Models on Databricks](/guides/optimize-dbt-models-on-databricks). ## Advanced considerations @@ -149,11 +156,11 @@ Inserting dbt Cloud jobs into a Databricks Workflows allows you to chain togethe - Logs and Run History: Accessing logs and run history becomes more convenient when using dbt Cloud. - Monitoring and Notification Features: dbt Cloud comes equipped with monitoring and notification features like the ones described above that can help you stay informed about the status and performance of your jobs. -To trigger your dbt Cloud job from Databricks, follow the instructions in our [Databricks Workflows to run dbt Cloud jobs guide](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs). +To trigger your dbt Cloud job from Databricks, follow the instructions in our [Databricks Workflows to run dbt Cloud jobs guide](/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs). -### Data masking +## Data masking -Our [Best Practices for dbt and Unity Catalog](/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices) guide recommends using separate catalogs *dev* and *prod* for development and deployment environments, with Unity Catalog and dbt Cloud handling configurations and permissions for environment isolation. Ensuring security while maintaining efficiency in your development and deployment environments is crucial. 
Additional security measures may be necessary to protect sensitive data, such as personally identifiable information (PII). +Our [Best Practices for dbt and Unity Catalog](/best-practices/dbt-unity-catalog-best-practices) guide recommends using separate catalogs *dev* and *prod* for development and deployment environments, with Unity Catalog and dbt Cloud handling configurations and permissions for environment isolation. Ensuring security while maintaining efficiency in your development and deployment environments is crucial. Additional security measures may be necessary to protect sensitive data, such as personally identifiable information (PII). Databricks leverages [Dynamic Views](https://docs.databricks.com/data-governance/unity-catalog/create-views.html#create-a-dynamic-view) to enable data masking based on group membership. Because views in Unity Catalog use Spark SQL, you can implement advanced data masking by using more complex SQL expressions and regular expressions. You can now also apply fine grained access controls like row filters in preview and column masks in preview on tables in Databricks Unity Catalog, which will be the recommended approach to protect sensitive data once this goes GA. Additionally, in the near term, Databricks Unity Catalog will also enable Attribute Based Access Control natively, which will make protecting sensitive data at scale simpler. @@ -180,10 +187,10 @@ Unity Catalog is a unified governance solution for your lakehouse. It provides a To get the most out of both tools, you can use the [persist docs config](/reference/resource-configs/persist_docs) to push table and column descriptions written in dbt into Unity Catalog, making the information easily accessible to both tools' users. Keeping the descriptions in dbt ensures they are version controlled and can be reproduced after a table is dropped. -## Additional resources +### Related docs - [Advanced deployments course](https://courses.getdbt.com/courses/advanced-deployment) if you want a deeper dive into these topics - [Autoscaling CI: The intelligent Slim CI](https://docs.getdbt.com/blog/intelligent-slim-ci) - [Trigger a dbt Cloud Job in your automated workflow with Python](https://discourse.getdbt.com/t/triggering-a-dbt-cloud-job-in-your-automated-workflow-with-python/2573) -- [Databricks + dbt Cloud Quickstart Guide](/quickstarts/databricks) +- [Databricks + dbt Cloud Quickstart Guide](/guides/databricks) - Reach out to your Databricks account team to get access to preview features on Databricks. 
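As a concrete illustration of the Dynamic Views approach to masking described above, here is a minimal sketch of a dbt model that builds a masking view in Unity Catalog. `is_account_group_member` is the documented Databricks group-membership check; the group, model, and column names are hypothetical.

```sql
-- models/marts/rides_masked.sql (hypothetical model, column, and group names)
{{ config(materialized='view') }}

select
    ride_id,
    subtotal,
    tip,
    -- Only members of the `pii_readers` account group see the raw value
    case
        when is_account_group_member('pii_readers') then rider_email
        else '*****'
    end as rider_email
from {{ ref('stg_rides') }}
```

Because the masking logic lives in a version-controlled dbt model, it is reviewed and deployed through the same jobs described earlier in this guide.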
diff --git a/website/docs/quickstarts/redshift-qs.md b/website/docs/guides/redshift-qs.md similarity index 97% rename from website/docs/quickstarts/redshift-qs.md rename to website/docs/guides/redshift-qs.md index fc7e178f163..890be27e50a 100644 --- a/website/docs/quickstarts/redshift-qs.md +++ b/website/docs/guides/redshift-qs.md @@ -1,9 +1,10 @@ --- title: "Quickstart for dbt Cloud and Redshift" -id: "redshift" -platform: 'dbt-cloud' +id: redshift +level: 'Beginner' icon: 'redshift' hide_table_of_contents: true +tags: ['Redshift', 'dbt Cloud','Quickstart'] --- ## Introduction @@ -31,8 +32,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen ### Related content - Learn more with [dbt Courses](https://courses.getdbt.com/collections) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) @@ -56,7 +57,7 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen -7. You might be asked to Configure account. For the purpose of this sandbox environment, we recommend selecting “Configure account”. +7. You might be asked to Configure account. For this sandbox environment, we recommend selecting “Configure account”. 8. Select your cluster from the list. In the **Connect to** popup, fill out the credentials from the output of the stack: - **Authentication** — Use the default which is **Database user name and password** (NOTE: IAM authentication is not supported in dbt Cloud). @@ -81,8 +82,7 @@ Now we are going to load our sample data into the S3 bucket that our Cloudformat 2. Now we are going to use the S3 bucket that you created with CloudFormation and upload the files. Go to the search bar at the top and type in `S3` and click on S3. There will be sample data in the bucket already, feel free to ignore it or use it for other modeling exploration. The bucket will be prefixed with `dbt-data-lake`. - - + 3. Click on the `name of the bucket` S3 bucket. If you have multiple S3 buckets, this will be the bucket that was listed under “Workshopbucket” on the Outputs page. diff --git a/website/docs/guides/migration/tools/refactoring-legacy-sql.md b/website/docs/guides/refactoring-legacy-sql.md similarity index 92% rename from website/docs/guides/migration/tools/refactoring-legacy-sql.md rename to website/docs/guides/refactoring-legacy-sql.md index 9dd66abb495..a339e523020 100644 --- a/website/docs/guides/migration/tools/refactoring-legacy-sql.md +++ b/website/docs/guides/refactoring-legacy-sql.md @@ -2,15 +2,24 @@ title: Refactoring legacy SQL to dbt id: refactoring-legacy-sql description: This guide walks through refactoring a long SQL query (perhaps from a stored procedure) into modular dbt data models. +displayText: Creating new materializations +hoverSnippet: Learn how to refactoring a long SQL query into modular dbt data models. +# time_to_complete: '30 minutes' commenting out until we test +platform: 'dbt-cloud' +icon: 'guides' +hide_table_of_contents: true +tags: ['SQL'] +level: 'Advanced' +recently_updated: true --- -You may have already learned how to build dbt models from scratch. +## Introduction -But in reality, you probably already have some queries or stored procedures that power analyses and dashboards, and now you’re wondering how to port those into dbt. 
+You may have already learned how to build dbt models from scratch. But in reality, you probably already have some queries or stored procedures that power analyses and dashboards, and now you’re wondering how to port those into dbt. There are two parts to accomplish this: migration and refactoring. In this guide we’re going to learn a process to help us turn legacy SQL code into modular dbt models. -When migrating and refactoring code, it’s of course important to stay organized. We'll do this by following several steps (jump directly from the right sidebar): +When migrating and refactoring code, it’s of course important to stay organized. We'll do this by following several steps: 1. Migrate your code 1:1 into dbt 2. Implement dbt sources rather than referencing raw database tables @@ -21,9 +30,10 @@ When migrating and refactoring code, it’s of course important to stay organize Let's get into it! -:::info More resources. -This guide is excerpted from the new dbt Learn On-demand Course, "Refactoring SQL for Modularity" - if you're curious, pick up the [free refactoring course here](https://courses.getdbt.com/courses/refactoring-sql-for-modularity), which includes example and practice refactoring projects. Or for a more in-depth look at migrating DDL and DML from stored procedures check out [this guide](/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures). +:::info More resources +This guide is excerpted from the new dbt Learn On-demand Course, "Refactoring SQL for Modularity" - if you're curious, pick up the [free refactoring course here](https://courses.getdbt.com/courses/refactoring-sql-for-modularity), which includes example and practice refactoring projects. Or for a more in-depth look at migrating DDL and DML from stored procedures, refer to the[Migrate from stored procedures](/guides/migrate-from-stored-procedures) guide. ::: + ## Migrate your existing SQL code @@ -38,7 +48,7 @@ To get going, you'll copy your legacy SQL query into your dbt project, by saving Once you've copied it over, you'll want to `dbt run` to execute the query and populate the in your warehouse. -If this is your first time running dbt, you may want to start with the [Introduction to dbt](/docs/introduction) and the earlier sections of the [quickstart guide](/quickstarts) before diving into refactoring. +If this is your first time running dbt, you may want to start with the [Introduction to dbt](/docs/introduction) and the earlier sections of the [quickstart guide](/guides) before diving into refactoring. This step may sound simple, but if you're porting over an existing set of SQL transformations to a new SQL dialect, you will need to consider how your legacy SQL dialect differs from your new SQL flavor, and you may need to modify your legacy code to get it to run at all. @@ -59,7 +69,7 @@ This allows you to call the same table in multiple places with `{{ src('my_sourc We start here for several reasons: #### Source freshness reporting -Using sources unlocks the ability to run [source freshness reporting](docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale. +Using sources unlocks the ability to run [source freshness reporting](/docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale. #### Easy dependency tracing If you're migrating multiple stored procedures into dbt, with sources you can see which queries depend on the same raw tables. 
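To make the sources step above concrete, here is a minimal sketch using hypothetical `jaffle_shop` source and table names: declare the raw tables once in a YAML file, then reference them from models with the `source()` function.

```yaml
# models/staging/_sources.yml (hypothetical path and names)
version: 2

sources:
  - name: jaffle_shop      # how models will refer to this source
    database: raw          # database/catalog that holds the raw data
    schema: jaffle_shop    # schema that holds the raw tables
    tables:
      - name: orders
      - name: customers
```

A model then selects from the declared source rather than the hard-coded table:

```sql
-- models/staging/stg_orders.sql (hypothetical)
select * from {{ source('jaffle_shop', 'orders') }}
```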
@@ -206,7 +216,7 @@ This allows anyone after us to easily step through the CTEs when troubleshooting ## Port CTEs to individual data models Rather than keep our SQL code confined to one long SQL file, we'll now start splitting it into modular + reusable [dbt data models](https://docs.getdbt.com/docs/build/models). -Internally at dbt Labs, we follow roughly this [data modeling technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) and we [structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) accordingly. +Internally at dbt Labs, we follow roughly this [data modeling technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) and we [structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) accordingly. We'll follow those structures in this walkthrough, but your team's conventions may of course differ from ours. @@ -243,7 +253,7 @@ Under the hood, it generates comparison queries between our before and after sta Sure, we could write our own query manually to audit these models, but using the dbt `audit_helper` package gives us a head start and allows us to identify variances more quickly. -## Ready for refactoring practice? +### Ready for refactoring practice? Head to the free on-demand course, [Refactoring from Procedural SQL to dbt](https://courses.getdbt.com/courses/refactoring-sql-for-modularity) for a more in-depth refactoring example + a practice refactoring problem to test your skills. Questions on this guide or the course? Drop a note in #learn-on-demand in [dbt Community Slack](https://getdbt.com/community). diff --git a/website/docs/guides/orchestration/webhooks/serverless-datadog.md b/website/docs/guides/serverless-datadog.md similarity index 66% rename from website/docs/guides/orchestration/webhooks/serverless-datadog.md rename to website/docs/guides/serverless-datadog.md index cb03c72c6b5..931ba9832ab 100644 --- a/website/docs/guides/orchestration/webhooks/serverless-datadog.md +++ b/website/docs/guides/serverless-datadog.md @@ -1,62 +1,71 @@ --- title: "Create Datadog events from dbt Cloud results" -id: webhooks-guide-serverless-datadog -slug: serverless-datadog -description: Configure a serverless app to add Datadog logs +id: serverless-datadog +description: Configure a serverless app to add dbt Cloud events to Datadog logs. +hoverSnippet: Learn how to configure a serverless app to add dbt Cloud events to Datadog logs. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- -This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. To do this, when a dbt Cloud job completes it will create a log entry for each node that was run, containing all information about the node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-models). +## Introduction + +This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. To do this, when a dbt Cloud job completes it will create a log entry for each node that was run, containing all information about the node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-job-models). In this example, we will use [fly.io](https://fly.io) for hosting/running the service. 
fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run). -## Prerequisites +### Prerequisites + This guide assumes some familiarity with: - [dbt Cloud Webhooks](/docs/deploy/webhooks) - CLI apps - Deploying code to a serverless code runner like fly.io or AWS Lambda -## Integration steps - -### 1. Clone the `dbt-cloud-webhooks-datadog` repo +## Clone the `dbt-cloud-webhooks-datadog` repo [This repository](https://github.com/dpguthrie/dbt-cloud-webhooks-datadog) contains the sample code for validating a webhook and creating logs in Datadog. -### 2. Install `flyctl` and sign up for fly.io +## Install `flyctl` and sign up for fly.io -Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: +Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: Switch to the directory containing the repo you cloned in step 1: -```shell -#example: replace with your actual path -cd ~/Documents/GitHub/dbt-cloud-webhooks-datadog -``` + + ```shell + #example: replace with your actual path + cd ~/Documents/GitHub/dbt-cloud-webhooks-datadog + ``` Sign up for fly.io: -```shell -flyctl auth signup -``` + ```shell + flyctl auth signup + ``` Your console should show `successfully logged in as YOUR_EMAIL` when you're done, but if it doesn't then sign in to fly.io from your command line: -```shell -flyctl auth login -``` + ```shell + flyctl auth login + ``` + +## Launch your fly.io app -### 3. Launch your fly.io app Launching your app publishes it to the web and makes it ready to catch webhook events: -```shell -flyctl launch -``` + ```shell + flyctl launch + ``` -You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app. +1. You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app. -Choose an app name of your choosing, such as `YOUR_COMPANY-dbt-cloud-webhook-datadog`, or leave blank and one will be generated for you. Note that your name can only contain numbers, lowercase letters and dashes. +2. Choose an app name of your choosing, such as `YOUR_COMPANY-dbt-cloud-webhook-datadog`, or leave blank and one will be generated for you. Note that your name can only contain numbers, lowercase letters and dashes. -Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`). +3. Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`). -When asked if you would like to set up Postgresql or Redis databases, type `n` for each. +4. When asked if you would like to set up Postgresql or Redis databases, type `n` for each. -Type `y` when asked if you would like to deploy now. +5. Type `y` when asked if you would like to deploy now.
      Sample output from the setup wizard: @@ -86,16 +95,16 @@ Wrote config file fly.toml
      ### 4. Create a Datadog API Key [Create an API Key for your Datadog account](https://docs.datadoghq.com/account_management/api-app-keys/) and make note of it and your Datadog site (e.g. `datadoghq.com`) for later. -### 5. Configure a new webhook in dbt Cloud -See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**. - -Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`) +## Configure a new webhook in dbt Cloud -Make note of the Webhook Secret Key for later. +1. See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**. +2. Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`). +3. Make note of the Webhook Secret Key for later. *Do not test the endpoint*; it won't work until you have stored the auth keys (next step) -### 6. Store secrets +## Store secrets + The application requires four secrets to be set, using these names: - `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metdata Only` permission. - `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier. @@ -103,9 +112,10 @@ The application requires four secrets to be set, using these names: - `DD_SITE`: The Datadog site for your organisation, e.g. `datadoghq.com`. Set these secrets as follows, replacing `abc123` etc with actual values: -```shell -flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 DD_API_KEY=ghi789 DD_SITE=datadoghq.com -``` + ```shell + flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 DD_API_KEY=ghi789 DD_SITE=datadoghq.com + ``` + +## Deploy your app -### 7. Deploy your app -After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**. \ No newline at end of file +After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**. diff --git a/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md b/website/docs/guides/serverless-pagerduty.md similarity index 87% rename from website/docs/guides/orchestration/webhooks/serverless-pagerduty.md rename to website/docs/guides/serverless-pagerduty.md index 5455af60110..50cc1b2b36e 100644 --- a/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md +++ b/website/docs/guides/serverless-pagerduty.md @@ -1,10 +1,18 @@ --- -title: "Create PagerDuty alarms from failed dbt Cloud tasks" -id: webhooks-guide-serverless-pagerduty -slug: serverless-pagerduty -description: Configure a serverless app to create PagerDuty alarms +title: "Trigger PagerDuty alarms when dbt Cloud jobs fail" +id: serverless-pagerduty +description: Use webhooks to configure a serverless app to trigger PagerDuty alarms. +hoverSnippet: Learn how to configure a serverless app that uses webhooks to trigger PagerDuty alarms. 
+# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- +## Introduction + This guide will teach you how to build and host a basic Python app which will monitor dbt Cloud jobs and create PagerDuty alarms based on failure. To do this, when a dbt Cloud job completes it will: - Check for any failed nodes (e.g. non-passing tests or errored models), and - create a PagerDuty alarm based on those nodes by calling the PagerDuty Events API. Events are deduplicated per run ID. @@ -13,20 +21,20 @@ This guide will teach you how to build and host a basic Python app which will mo In this example, we will use fly.io for hosting/running the service. fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run). -## Prerequisites +### Prerequisites + This guide assumes some familiarity with: - [dbt Cloud Webhooks](/docs/deploy/webhooks) - CLI apps - Deploying code to a serverless code runner like fly.io or AWS Lambda -## Integration steps -### 1. Clone the `dbt-cloud-webhooks-pagerduty` repo +## Clone the `dbt-cloud-webhooks-pagerduty` repo [This repository](https://github.com/dpguthrie/dbt-cloud-webhooks-pagerduty) contains the sample code for validating a webhook and creating events in PagerDuty. -### 2. Install `flyctl` and sign up for fly.io +## Install `flyctl` and sign up for fly.io Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: @@ -46,7 +54,7 @@ Your console should show `successfully logged in as YOUR_EMAIL` when you're done flyctl auth login ``` -### 3. Launch your fly.io app +## Launch your fly.io app Launching your app publishes it to the web and makes it ready to catch webhook events: ```shell flyctl launch @@ -87,12 +95,12 @@ Wrote config file fly.toml
      -### 4. Create a PagerDuty integration application +## Create a PagerDuty integration application See [PagerDuty's guide](https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgw-events-api-v2-overview#getting-started) for full instructions. Make note of the integration key for later. -### 5. Configure a new webhook in dbt Cloud +## Configure a new webhook in dbt Cloud See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**. Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`) @@ -101,7 +109,7 @@ Make note of the Webhook Secret Key for later. *Do not test the endpoint*; it won't work until you have stored the auth keys (next step) -### 6. Store secrets +## Store secrets The application requires three secrets to be set, using these names: - `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metdata Only` permission. - `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier. @@ -112,5 +120,6 @@ Set these secrets as follows, replacing `abc123` etc with actual values: flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 PD_ROUTING_KEY=ghi789 ``` -### 7. Deploy your app -After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**. \ No newline at end of file +## Deploy your app + +After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**. diff --git a/website/docs/guides/set-up-ci.md b/website/docs/guides/set-up-ci.md new file mode 100644 index 00000000000..83362094ec6 --- /dev/null +++ b/website/docs/guides/set-up-ci.md @@ -0,0 +1,355 @@ +--- +title: "Get started with Continuous Integration tests" +description: Implement a CI environment for safe project validation. +hoverSnippet: Learn how to implement a CI environment for safe project validation. +id: set-up-ci +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['dbt Cloud', 'Orchestration', 'CI'] +level: 'Intermediate' +recently_updated: true +--- + +## Introduction + +By validating your code _before_ it goes into production, you don't need to spend your afternoon fielding messages from people whose reports are suddenly broken. + +A solid CI setup is critical to preventing avoidable downtime and broken trust. dbt Cloud uses **sensible defaults** to get you up and running in a performant and cost-effective way in minimal time. + +After that, there's time to get fancy, but let's walk before we run. + +In this guide, we're going to add a **CI environment**, where proposed changes can be validated in the context of the entire project without impacting production systems. We will use a single set of deployment credentials (like the Prod environment), but models are built in a separate location to avoid impacting others (like the Dev environment). + +Your git flow will look like this: + + +### Prerequisites + +As part of your initial dbt Cloud setup, you should already have Development and Production environments configured. Let's recap what each does: + +- Your **Development environment** powers the IDE. 
Each user has individual credentials, and builds into an individual dev schema. Nothing you do here impacts any of your colleagues. +- Your **Production environment** brings the canonical version of your project to life for downstream consumers. There is a single set of deployment credentials, and everything is built into your production schema(s). + +## Create a new CI environment + +See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **CI**. Just like your existing Production environment, it will be a Deployment-type environment. + +When setting a Schema in the **Deployment Credentials** area, remember that dbt Cloud will automatically generate a custom schema name for each PR to ensure that they don't interfere with your deployed models. This means you can safely set the same Schema name as your Production job. + +### 1. Double-check your Production environment is identified + +Go into your existing Production environment, and ensure that the **Set as Production environment** checkbox is set. It'll make things easier later. + +### 2. Create a new job in the CI environment + +Use the **Continuous Integration Job** template, and call the job **CI Check**. + +In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down: + +- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped. +- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs. + +To be able to find modified nodes, dbt needs to have something to compare against. dbt Cloud uses the last successful run of any job in your Production environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). As long as you identified your Production environment in Step 2, you won't need to touch this. If you didn't, pick the right environment from the dropdown. + +### 3. Test your process + +That's it! There are other steps you can take to be even more confident in your work, such as validating your structure follows best practices and linting your code. For more information, refer to [Get started with Continuous Integration tests](/guides/set-up-ci). + +To test your new flow, create a new branch in the dbt Cloud IDE then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft). Within a few seconds, you’ll see a new check appear in your git provider. + +### Things to keep in mind + +- If you make a new commit while a CI run based on older code is in progress, it will be automatically canceled and replaced with the fresh code. +- An unlimited number of CI jobs can run at once. If 10 developers all commit code to different PRs at the same time, each person will get their own schema containing their changes. Once each PR is merged, dbt Cloud will drop that schema. +- CI jobs will never block a production run. 
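For reference, the Slim CI behavior configured in the CI job above can be approximated with dbt Core's state and defer flags. This is only a sketch: it assumes you have downloaded the artifacts from a recent production run into a local `prod-run-artifacts/` directory, whereas dbt Cloud manages this comparison state for you automatically.

```bash
# Build only new or modified resources and everything downstream of them,
# deferring references to unchanged upstream models to production artifacts.
dbt build --select state:modified+ --defer --state prod-run-artifacts/
```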
+ +## Enforce best practices with dbt project evaluator + +dbt Project Evaluator is a package designed to identify deviations from best practices common to many dbt projects, including modeling, testing, documentation, structure and performance problems. For an introduction to the package, read its [launch blog post](/blog/align-with-dbt-project-evaluator). + +### 1. Install the package + +As with all packages, add a reference to `dbt-labs/dbt_project_evaluator` to your `packages.yml` file. See the [dbt Package Hub](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest/) for full installation instructions. + +### 2. Define test severity with an environment variable + +As noted in the [documentation](https://dbt-labs.github.io/dbt-project-evaluator/latest/ci-check/), tests in the package are set to `warn` severity by default. + +To have these tests fail in CI, create a new environment called `DBT_PROJECT_EVALUATOR_SEVERITY`. Set the project-wide default to `warn`, and set it to `error` in the CI environment. + +In your `dbt_project.yml` file, override the severity configuration: + +```yaml +tests: +dbt_project_evaluator: + +severity: "{{ env_var('DBT_PROJECT_EVALUATOR_SEVERITY', 'warn') }}" +``` + +### 3. Update your CI commands + +Because these tests should only run after the rest of your project has been built, your existing CI command will need to be updated to exclude the dbt_project_evaluator package. You will then add a second step which builds _only_ the package's models and tests. + +Update your steps to: + +```bash +dbt build --select state:modified+ --exclude package:dbt_project_evaluator +dbt build --select package:dbt_project_evaluator +``` + +### 4. Apply any customizations + +Depending on the state of your project when you roll out the evaluator, you may need to skip some tests or allow exceptions for some areas. To do this, refer to the documentation on: + +- [disabling tests](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/customization/) +- [excluding groups of models from a specific test](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/exceptions/) +- [excluding packages or sources/models based on path](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/excluding-packages-and-paths/) + +If you create a seed to exclude groups of models from a specific test, remember to disable the default seed and include `dbt_project_evaluator_exceptions` in your second `dbt build` command above. + +## Run linting checks with SQLFluff + +By [linting](/docs/cloud/dbt-cloud-ide/lint-format#lint) your project during CI, you can ensure that code styling standards are consistently enforced, without spending human time nitpicking comma placement. + +The steps below create an action/pipeline which uses [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. If you don't already have SQLFluff rules defined, check out [our recommended config file](/best-practices/how-we-style/2-how-we-style-our-sql). + +### 1. Create a YAML file to define your pipeline + +The YAML files defined below are what tell your code hosting platform the steps to run. In this setup, you’re telling the platform to run a SQLFluff lint job every time a commit is pushed. + + + + +GitHub Actions are defined in the `.github/workflows` directory. To define the job for your action, add a new file named `lint_on_push.yml` under the `workflows` folder. 
Your final folder structure will look like this: + +```sql +my_awesome_project +├── .github +│ ├── workflows +│ │ └── lint_on_push.yml +``` + +**Key pieces:** + +- `on:` defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other trigger options, check out [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). +- `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job +- `uses:` - When the Ubuntu server is created, it is completely empty. [`checkout`](https://github.com/actions/checkout#checkout-v3) and [`setup-python`](https://github.com/actions/setup-python#setup-python-v3) are public GitHub Actions which enable the server to access the code in your repo, and set up Python correctly. +- `run:` - these steps are run at the command line, as though you typed them at a prompt yourself. This will install sqlfluff and lint the project. Be sure to set the correct `--dialect` for your project. + +For a full breakdown of the properties in a workflow file, see [Understanding the workflow file](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on GitHub's website. + +```yaml +name: lint dbt project on push + +on: + push: + branches-ignore: + - 'main' + +jobs: + # this job runs SQLFluff with a specific set of rules + # note the dialect is set to Snowflake, so make that specific to your setup + # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html + lint_project: + name: Run SQLFluff linter + runs-on: ubuntu-latest + + steps: + - uses: "actions/checkout@v3" + - uses: "actions/setup-python@v4" + with: + python-version: "3.9" + - name: Install SQLFluff + run: "pip install sqlfluff" + - name: Lint project + run: "sqlfluff lint models --dialect snowflake" + +``` + + + + +Create a `.gitlab-ci.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file. + +```sql +my_awesome_project +├── dbt_project.yml +├── .gitlab-ci.yml +``` + +**Key pieces:** + +- `image: python:3.9` - this defines the virtual image we’re using to run the job +- `rules:` - defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other rules, refer to [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules). +- `script:` - this is how we’re telling the GitLab runner to execute the Python script we defined above. + +```yaml +image: python:3.9 + +stages: + - pre-build + +# this job runs SQLFluff with a specific set of rules +# note the dialect is set to Snowflake, so make that specific to your setup +# details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html +lint-project: + stage: pre-build + rules: + - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main' + script: + - pip install sqlfluff + - sqlfluff lint models --dialect snowflake +``` + + + + +Create a `bitbucket-pipelines.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file. + +```sql +my_awesome_project +├── bitbucket-pipelines.yml +├── dbt_project.yml +``` + +**Key pieces:** + +- `image: python:3.11.1` - this defines the virtual image we’re using to run the job +- `'**':` - this is used to filter when the pipeline runs. 
In this case we’re telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `main`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/) +- `script:` - this is how we’re telling the Bitbucket runner to execute the Python script we defined above. + +```yaml +image: python:3.11.1 + + +pipelines: + branches: + '**': # this sets a wildcard to run on every branch + - step: + name: Lint dbt project + script: + - pip install sqlfluff==0.13.1 + - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 + + 'main': # override if your default branch doesn't run on a branch named "main" + - step: + script: + - python --version +``` + + + + +### 2. Commit and push your changes to make sure everything works + +After you finish creating the YAML files, commit and push your code to trigger your pipeline for the first time. If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run. + + + + +In your repository, click the _Actions_ tab + +![Image showing the GitHub action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-github.png) + +Sample output from SQLFluff in the `Run SQLFluff linter` job: + +![Image showing the logs in GitHub for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-github.png) + + + + +In the menu option go to *CI/CD > Pipelines* + +![Image showing the GitLab action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-gitlab.png) + +Sample output from SQLFluff in the `Run SQLFluff linter` job: + +![Image showing the logs in GitLab for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-gitlab.png) + + + + +In the left menu pane, click on *Pipelines* + +![Image showing the Bitbucket action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png) + +Sample output from SQLFluff in the `Run SQLFluff linter` job: + +![Image showing the logs in Bitbucket for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png) + + + + +## Advanced: Create a release train with additional environments + +Large and complex enterprises sometimes require additional layers of validation before deployment. Learn how to add these checks with dbt Cloud. + +:::caution Are you sure you need this? +This approach can increase release safety, but creates additional manual steps in the deployment process as well as a greater maintenance burden. + +As such, it may slow down the time it takes to get new features into production. + +The team at Sunrun maintained a SOX-compliant deployment in dbt while reducing the number of environments. Check out [their Coalesce presentation](https://www.youtube.com/watch?v=vmBAO2XN-fM) to learn more. +::: + +In this section, we will add a new **QA** environment. New features will branch off from and be merged back into the associated `qa` branch, and a member of your team (the "Release Manager") will create a PR against `main` to be validated in the CI environment before going live. 
+ +The git flow will look like this: + + +### Advanced prerequisites + +- You have the **Development**, **CI**, and **Production** environments, as described in [the Baseline setup](/guides/set-up-ci). + +### 1. Create a `release` branch in your git repo + +As noted above, this branch will outlive any individual feature, and will be the base of all feature development for a period of time. Your team might choose to create a new branch for each sprint (`qa/sprint-01`, `qa/sprint-02`, etc), tie it to a version of your data product (`qa/1.0`, `qa/1.1`), or just have a single `qa` branch which remains active indefinitely. + +### 2. Update your Development environment to use the `qa` branch + +See [Custom branch behavior](/docs/dbt-cloud-environments#custom-branch-behavior). Setting `qa` as your custom branch ensures that the IDE creates new branches and PRs with the correct target, instead of using `main`. + + + +### 3. Create a new QA environment + +See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **QA**. Just like your existing Production and CI environments, it will be a Deployment-type environment. + +Set its branch to `qa` as well. + +### 4. Create a new job + +Use the **Continuous Integration Job** template, and call the job **QA Check**. + +In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down: + +- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped. +- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs. + +To be able to find modified nodes, dbt needs to have something to compare against. Normally, we use the Production environment as the source of truth, but in this case there will be new code merged into `qa` long before it hits the `main` branch and Production environment. Because of this, we'll want to defer the Release environment to itself. + +### Optional: also add a compile-only job + +dbt Cloud uses the last successful run of any job in that environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). If you have a lot of PRs in flight, the comparison state could switch around regularly. + +Adding a regularly-scheduled job inside of the QA environment whose only command is `dbt compile` can regenerate a more stable manifest for comparison purposes. + +### 5. Test your process + +When the Release Manager is ready to cut a new release, they will manually open a PR from `qa` into `main` from their git provider (e.g. GitHub, GitLab, Azure DevOps). dbt Cloud will detect the new PR, at which point the existing check in the CI environment will trigger and run. When using the [baseline configuration](/guides/set-up-ci), it's possible to kick off the PR creation from inside of the dbt Cloud IDE. Under this paradigm, that button will create PRs targeting your QA branch instead. + +To test your new flow, create a new branch in the dbt Cloud IDE then add a new file or modify an existing one. 
Commit it, then create a new Pull Request (not a draft) against your `qa` branch. You'll see the integration tests begin to run. Once they complete, manually create a PR against `main`, and within a few seconds you’ll see the tests run again but this time incorporating all changes from all code that hasn't been merged to main yet. diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md b/website/docs/guides/set-up-your-databricks-dbt-project.md similarity index 81% rename from website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md rename to website/docs/guides/set-up-your-databricks-dbt-project.md index b0be39a4273..c17c6a1f99e 100644 --- a/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md +++ b/website/docs/guides/set-up-your-databricks-dbt-project.md @@ -1,5 +1,18 @@ -# How to set up your Databricks and dbt project - +--- +title: Set up your dbt project with Databricks +id: set-up-your-databricks-dbt-project +description: "Learn more about setting up your dbt project with Databricks." +displayText: Setting up your dbt project with Databricks +hoverSnippet: Learn how to set up your dbt project with Databricks. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'databricks' +hide_table_of_contents: true +tags: ['Databricks', 'dbt Core','dbt Cloud'] +level: 'Intermediate' +recently_updated: true +--- + +## Introduction Databricks and dbt Labs are partnering to help data teams think like software engineering teams and ship trusted data, faster. The dbt-databricks adapter enables dbt users to leverage the latest Databricks features in their dbt project. Hundreds of customers are now using dbt and Databricks to build expressive and reliable data pipelines on the Lakehouse, generating data assets that enable analytics, ML, and AI use cases throughout the business. @@ -7,7 +20,7 @@ In this guide, we discuss how to set up your dbt project on the Databricks Lakeh ## Configuring the Databricks Environments -To get started, we will use Databricks’s Unity Catalog. Without it, we would not be able to design separate [environments](https://docs.getdbt.com/docs/collaborate/environments) for development and production per our [best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). It also allows us to ensure the proper access controls have been applied using SQL. You will need to be using the dbt-databricks adapter to use it (as opposed to the dbt-spark adapter). +To get started, we will use Databricks’s Unity Catalog. Without it, we would not be able to design separate [environments](https://docs.getdbt.com/docs/collaborate/environments) for development and production per our [best practices](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview). It also allows us to ensure the proper access controls have been applied using SQL. You will need to be using the dbt-databricks adapter to use it (as opposed to the dbt-spark adapter). We will set up two different *catalogs* in Unity Catalog: **dev** and **prod**. A catalog is a top-level container for *schemas* (previously known as databases in Databricks), which in turn contain tables and views. 
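If you are creating these catalogs yourself, the Unity Catalog SQL might look like the sketch below; the catalog, schema, and group names are illustrative, and the exact grants you need will depend on your own governance model.

```sql
-- Separate catalogs for development and production work
CREATE CATALOG IF NOT EXISTS dev;
CREATE CATALOG IF NOT EXISTS prod;

-- A schema inside prod for deployment jobs to build into
CREATE SCHEMA IF NOT EXISTS prod.core;

-- Example read-only access for an analysts group (illustrative)
GRANT USE CATALOG ON CATALOG prod TO `analysts`;
GRANT USE SCHEMA ON SCHEMA prod.core TO `analysts`;
GRANT SELECT ON SCHEMA prod.core TO `analysts`;
```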
@@ -33,7 +46,7 @@ Service principals are used to remove humans from deploying to production for co [Let’s create a service principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account) in Databricks: 1. Have your Databricks Account admin [add a service principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account) to your account. The service principal’s name should differentiate itself from a user ID and make its purpose clear (eg dbt_prod_sp). -2. Add the service principal added to any groups it needs to be a member of at this time. There are more details on permissions in our ["Unity Catalog best practices" guide](dbt-unity-catalog-best-practices). +2. Add the service principal added to any groups it needs to be a member of at this time. There are more details on permissions in our ["Unity Catalog best practices" guide](/best-practices/dbt-unity-catalog-best-practices). 3. [Add the service principal to your workspace](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace) and apply any [necessary entitlements](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace-using-the-admin-console), such as Databricks SQL access and Workspace access. ## Setting up Databricks Compute @@ -55,13 +68,13 @@ We are not covering python in this post but if you want to learn more, check out Now that the Databricks components are in place, we can configure our dbt project. This involves connecting dbt to our Databricks SQL warehouse to run SQL queries and using a version control system like GitHub to store our transformation code. -If you are migrating an existing dbt project from the dbt-spark adapter to dbt-databricks, follow this [migration guide](https://docs.getdbt.com/guides/migration/tools/migrating-from-spark-to-databricks#migration) to switch adapters without needing to update developer credentials and other existing configs. +If you are migrating an existing dbt project from the dbt-spark adapter to dbt-databricks, follow this [migration guide](/guides/migrate-from-spark-to-databricks) to switch adapters without needing to update developer credentials and other existing configs. -If you’re starting a new dbt project, follow the steps below. For a more detailed setup flow, check out our [quickstart guide.](/quickstarts/databricks) +If you’re starting a new dbt project, follow the steps below. For a more detailed setup flow, check out our [quickstart guide.](/guides/databricks) ### Connect dbt to Databricks -First, you’ll need to connect your dbt project to Databricks so it can send transformation instructions and build objects in Unity Catalog. Follow the instructions for [dbt Cloud](/quickstarts/databricks?step=4) or [Core](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to configure your project’s connection credentials. +First, you’ll need to connect your dbt project to Databricks so it can send transformation instructions and build objects in Unity Catalog. Follow the instructions for [dbt Cloud](/guides/databricks?step=4) or [Core](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to configure your project’s connection credentials. Each developer must generate their Databricks PAT and use the token in their development credentials. 
They will also specify a unique developer schema that will store the tables and views generated by dbt runs executed from their IDE. This provides isolated developer environments and ensures data access is fit for purpose. @@ -80,11 +93,11 @@ For your development credentials/profiles.yml: During your first invocation of `dbt run`, dbt will create the developer schema if it doesn't already exist in the dev catalog. -### Defining your dbt deployment environment +## Defining your dbt deployment environment -Last, we need to give dbt a way to deploy code outside of development environments. To do so, we’ll use dbt [environments](https://docs.getdbt.com/docs/collaborate/environments) to define the production targets that end users will interact with. +We need to give dbt a way to deploy code outside of development environments. To do so, we’ll use dbt [environments](https://docs.getdbt.com/docs/collaborate/environments) to define the production targets that end users will interact with. -Core projects can use [targets in profiles](https://docs.getdbt.com/docs/core/connection-profiles#understanding-targets-in-profiles) to separate environments. [dbt Cloud environments](https://docs.getdbt.com/docs/cloud/develop-in-the-cloud#set-up-and-access-the-cloud-ide) allow you to define environments via the UI and [schedule jobs](/quickstarts/databricks#create-and-run-a-job) for specific environments. +Core projects can use [targets in profiles](https://docs.getdbt.com/docs/core/connection-profiles#understanding-targets-in-profiles) to separate environments. [dbt Cloud environments](https://docs.getdbt.com/docs/cloud/develop-in-the-cloud#set-up-and-access-the-cloud-ide) allow you to define environments via the UI and [schedule jobs](/guides/databricks#create-and-run-a-job) for specific environments. Let’s set up our deployment environment: @@ -94,10 +107,10 @@ Let’s set up our deployment environment: 4. Set the schema to the default for your prod environment. This can be overridden by [custom schemas](https://docs.getdbt.com/docs/build/custom-schemas#what-is-a-custom-schema) if you need to use more than one. 5. Provide your Service Principal token. -### Connect dbt to your git repository +## Connect dbt to your git repository -Next, you’ll need somewhere to store and version control your code that allows you to collaborate with teammates. Connect your dbt project to a git repository with [dbt Cloud](/quickstarts/databricks#set-up-a-dbt-cloud-managed-repository). [Core](/quickstarts/manual-install#create-a-repository) projects will use the git CLI. +Next, you’ll need somewhere to store and version control your code that allows you to collaborate with teammates. Connect your dbt project to a git repository with [dbt Cloud](/guides/databricks#set-up-a-dbt-cloud-managed-repository). [Core](/guides/manual-install#create-a-repository) projects will use the git CLI. -## Next steps +### Next steps -Now that your project is configured, you can start transforming your Databricks data with dbt. To help you scale efficiently, we recommend you follow our best practices, starting with the ["Unity Catalog best practices" guide](dbt-unity-catalog-best-practices). +Now that your project is configured, you can start transforming your Databricks data with dbt. To help you scale efficiently, we recommend you follow our best practices, starting with the [Unity Catalog best practices](/best-practices/dbt-unity-catalog-best-practices), then you can [Optimize dbt models on Databricks](/guides/optimize-dbt-models-on-databricks). 
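For dbt Core users, the development credentials described above typically live in `profiles.yml`. The following is a sketch only: the project name, host, HTTP path, and schema are placeholders, and it assumes the personal access token is supplied through an environment variable.

```yaml
# ~/.dbt/profiles.yml (illustrative values throughout)
my_databricks_project:
  target: dev
  outputs:
    dev:
      type: databricks
      catalog: dev                                  # dev catalog from earlier
      schema: dbt_your_name                         # personal developer schema
      host: adb-1234567890123456.7.azuredatabricks.net
      http_path: /sql/1.0/warehouses/abc123def456
      token: "{{ env_var('DATABRICKS_TOKEN') }}"    # developer PAT
      threads: 4
```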
diff --git a/website/docs/guides/sl-migration.md b/website/docs/guides/sl-migration.md new file mode 100644 index 00000000000..0cfde742af2 --- /dev/null +++ b/website/docs/guides/sl-migration.md @@ -0,0 +1,135 @@ +--- +title: "Legacy dbt Semantic Layer migration guide" +id: "sl-migration" +description: "Learn how to migrate from the legacy dbt Semantic Layer to the latest one." +hoverSnippet: Migrate from the legacy dbt Semantic Layer to the latest one. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Semantic Layer','Migration'] +level: 'Intermediate' +recently_updated: true +--- + +## Introduction + +The legacy Semantic Layer will be deprecated in H2 2023. Additionally, the `dbt_metrics` package will not be supported in dbt v1.6 and later. If you are using `dbt_metrics`, you'll need to upgrade your configurations before upgrading to v1.6. This guide is for people who have the legacy dbt Semantic Layer setup and would like to migrate to the new dbt Semantic Layer. The estimated migration time is two weeks. + + +## Migrate metric configs to the new spec + +The metrics specification in dbt Core is changed in v1.6 to support the integration of MetricFlow. It's strongly recommended that you refer to [Build your metrics](/docs/build/build-metrics-intro) and before getting started so you understand the core concepts of the Semantic Layer. + +dbt Labs recommends completing these steps in a local dev environment (such as the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation)) instead of the dbt Cloud IDE: + +1. Create new Semantic Model configs as YAML files in your dbt project.* +1. Upgrade the metrics configs in your project to the new spec.* +1. Delete your old metrics file or remove the `.yml` file extension so they're ignored at parse time. Remove the `dbt-metrics` package from your project. Remove any macros that reference `dbt-metrics`, like `metrics.calculate()`. Make sure that any packages you’re using don't have references to the old metrics spec. +1. Install the CLI with `pip install "dbt-metricflow[your_adapter_name]"`. For example: + + ```bash + pip install "dbt-metricflow[snowflake]" + ``` + **Note** - The MetricFlow CLI is not available in the IDE at this time. Support is coming soon. + +1. Run `dbt parse`. This parses your project and creates a `semantic_manifest.json` file in your target directory. MetricFlow needs this file to query metrics. If you make changes to your configs, you will need to parse your project again. +1. Run `mf list metrics` to view the metrics in your project. +1. Test querying a metric by running `mf query --metrics --group-by `. For example: + ```bash + mf query --metrics revenue --group-by metric_time + ``` +1. Run `mf validate-configs` to run semantic and warehouse validations. This ensures your configs are valid and the underlying objects exist in your warehouse. +1. Push these changes to a new branch in your repo. + +**To make this process easier, dbt Labs provides a [custom migration tool](https://github.com/dbt-labs/dbt-converter) that automates these steps for you. You can find installation instructions in the [README](https://github.com/dbt-labs/dbt-converter/blob/master/README.md). Derived metrics aren’t supported in the migration tool, and will have to be migrated manually.* + +## Audit metric values after the migration + +You might need to audit metric values during the migration to ensure that the historical values of key business metrics are the same. + +1. 
In the CLI, query the metric(s) and dimensions you want to test and include the `--explain` option. For example:
+    ```bash
+    mf query --metrics orders,revenue --group-by metric_time__month,customer_type --explain
+    ```
+1. Use the SQL that MetricFlow generates to create a temporary model in your project, like `tmp_orders_revenue_audit.sql`. You will use this temporary model to compare against your legacy metrics.
+1. If you haven’t already done so, create a model using `metrics.calculate()` for the metrics you want to compare against. For example:
+
+    ```sql
+    select *
+    from {{ metrics.calculate(
+        [metric('orders'),
+        metric('revenue')],
+        grain='month',
+        dimensions=['metric_time', 'customer_type'],
+    ) }}
+    ```
+
+1. Run the [audit_helper](https://github.com/dbt-labs/dbt-audit-helper) package on both models to compare the metric values.
+
+## Set up the Semantic Layer in a new environment
+
+This step is only relevant to users who want the legacy and new Semantic Layer to run in parallel for a short time. This will let you recreate content in downstream tools like Hex and Mode with minimal downtime. If you do not need to recreate assets in these tools, skip to step 5.
+
+1. Create a new deployment environment in dbt Cloud and set the dbt version to 1.6 or higher.
+
+2. Select **Only run on a custom branch** and point to the branch that has the updated metric definitions.
+
+3. Set the deployment schema to a temporary migration schema, such as `tmp_sl_migration`. Optionally, you can create a new database for the migration.
+
+4. Create a job to parse your project, such as `dbt parse`, and run it. Make sure this job succeeds. There needs to be a successful job in your environment in order to set up the Semantic Layer.
+
+5. Select **Account Settings** -> **Projects** -> **Project details** and choose **Configure the Semantic Layer**.
+
+6. Under **Environment**, select the deployment environment you created in the previous step. Save your configuration.
+
+7. In the **Project details** page, click **Generate service token** and grant it **Semantic Layer Only** and **Metadata Only** permissions. Save this token securely. You will need it to connect to the Semantic Layer.
+
+At this point, both the new Semantic Layer and the old Semantic Layer will be running. The new Semantic Layer will be pointing at your migration branch with the updated metrics definitions.
+
+## Update connections in downstream integrations
+
+Now that your Semantic Layer is set up, you will need to update any downstream integrations that used the legacy Semantic Layer.
+
+### Migration guide for Hex
+
+To learn more about integrating with Hex, check out their [documentation](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration#dbt-semantic-layer-integration). Additionally, refer to [dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex.
+
+1. Set up a new connection for the Semantic Layer for your account. Note that your old connection will still work. The following Loom video guides you in setting up your Semantic Layer with Hex:
+
+2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer.
+
+3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API.
+ + * **Note** — You will need to update your connection to your production environment once you merge your changes to main. Currently, this connection will be pointing at the semantic layer migration environment. + +### Migration guide for Mode + +1. Set up a new connection for the semantic layer for your account. Follow [Mode's docs to set up your connection](https://mode.com/help/articles/supported-databases/#dbt-semantic-layer). + +2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer. + +3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. + +## Merge your metrics migration branch to main and upgrade your production environment to 1.6 + +1. Upgrade your production environment to 1.6 or higher. + * **Note** — The old metrics definitions are no longer valid, so your dbt jobs will not pass. + +2. Merge your updated metrics definitions to main. **At this point, the legacy semantic layer will no longer work.** + +If you created a new environment in [Set up the Semantic Layer in a new environment](#set-up-the-semantic-layer-in-a-new-environment): + +3. Update your Environment in **Account Settings** -> **Project Details** -> **Edit Semantic Layer Configuration** to point to your production environment. + +4. Delete your migration environment. Be sure to update your connection details in any downstream tools to account for the environment change. + +### Related docs + +- [MetricFlow quickstart guide](/docs/build/sl-getting-started) +- [Example dbt project](https://github.com/dbt-labs/jaffle-sl-template) +- [dbt metrics converter](https://github.com/dbt-labs/dbt-converter) +- [Why we're deprecating the dbt_metrics package](/blog/deprecating-dbt-metrics) blog post +- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) diff --git a/website/docs/guides/sl-partner-integration-guide.md b/website/docs/guides/sl-partner-integration-guide.md new file mode 100644 index 00000000000..04f58f525bd --- /dev/null +++ b/website/docs/guides/sl-partner-integration-guide.md @@ -0,0 +1,165 @@ +--- +title: "Integrate with dbt Semantic Layer using best practices" +id: "sl-partner-integration-guide" +description: Learn about partner integration guidelines, roadmap, and connectivity. +hoverSnippet: Learn how to integrate with the Semantic Layer using best practices +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Semantic Layer','Best practices'] +level: 'Advanced' +recently_updated: true +--- + +## Introduction + +To fit your tool within the world of the Semantic Layer, dbt Labs offers some best practice recommendations for how to expose metrics and allow users to interact with them seamlessly. + +:::note +This is an evolving guide that is meant to provide recommendations based on our experience. If you have any feedback, we'd love to hear it! +::: + + +### Prerequisites + +To build a dbt Semantic Layer integration: + +- We offer a [JDBC](/docs/dbt-cloud-apis/sl-jdbc) API and [GraphQL API](/docs/dbt-cloud-apis/sl-graphql). Refer to the dedicated [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) docs for more technical integration details. + +- Familiarize yourself with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and [MetricFlow](/docs/build/about-metricflow)'s key concepts.
There are two main objects: + + - [Semantic models](/docs/build/semantic-models) — Nodes in your semantic graph, connected via entities as edges. MetricFlow takes semantic models defined in YAML configuration files as inputs and creates a semantic graph that you can use to query metrics. + - [Metrics](/docs/build/metrics-overview) — Can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo). + +### Connection parameters + +The dbt Semantic Layer APIs authenticate with `environmentId`, `SERVICE_TOKEN`, and `host`. + +We recommend you provide users with separate input fields with these components for authentication (dbt Cloud will surface these parameters for the user). + +### Exposing metadata to dbt Labs + +When building an integration, we recommend you expose certain metadata in the request for analytics purposes. Among other items, it is helpful to have the following: + +- Your application's name (such as 'Tableau') +- The email of the person querying your application +- The version of dbt they are on. + + +## Use best practices when exposing metrics + +Best practices for exposing metrics are summarized into five themes: + +- [Governance](#governance-and-traceability) — Recommendations on how to establish guardrails for governed data work. +- [Discoverability](#discoverability) — Recommendations on how to make user-friendly data interactions. +- [Organization](#organization) — Organize metrics and dimensions for all audiences. +- [Query flexibility](#query-flexibility) — Allow users to query either one metric alone without dimensions or multiple metrics with dimensions. +- [Context and interpretation](#context-and-interpretation) — Contextualize metrics for better analysis; expose definitions, metadata, lineage, and freshness. + +### Governance and traceability + +When working with more governed data, it's essential to establish clear guardrails. Here are some recommendations: + +- **Aggregations control** — Users shouldn't generally be allowed to modify aggregations unless they perform post-processing calculations on Semantic Layer data (such as year-over-year analysis). + +- **Time series alignment and using metric_time** — Make sure users view metrics across the correct time series. When displaying metric graphs, using a non-default time aggregation dimension might lead to misleading interpretations. While users can still group by other time dimensions, they should be careful not to create trend lines with incorrect time axes.

      When looking at one or multiple metrics, users should use `metric_time` as the main time dimension to guarantee they are looking at the right time series for the metric(s).

      As such, when building an application, we recommend exposing `metric_time` as a separate, "special" time dimension on its own. This dimension always aligns with all metrics and is common across them. Other time dimensions can still be looked at and grouped by, but keeping a clear delineation between the `metric_time` dimension and the other time dimensions helps people avoid confusion about how metrics should be plotted.

      Also, when a user requests a time granularity change for the main time series, the query that your application runs should use `metric_time` as this will always give you the correct slice. Related to this, we also strongly recommend that you have a way to expose what dimension `metric_time` actually maps to for users who may not be familiar. Our APIs allow you to fetch the actual underlying time dimensions that makeup metric_time (such as `transaction_date`) so you can expose them to your users. + +- **Units consistency** — If units are supported, it's vital to avoid plotting data incorrectly with different units. Ensuring consistency in unit representation will prevent confusion and misinterpretation of the data. + +- **Traceability of metric and dimension changes** — When users change names of metrics and dimensions for reports, it's crucial to have a traceability mechanism in place to link back to the original source metric name. + + +### Discoverability + +- Consider treating [metrics](/docs/build/metrics-overview) as first-class objects rather than measures. Metrics offer a higher-level and more contextual way to interact with data, reducing the burden on end-users to manually aggregate data. + +- Easy metric interactions: Provide users with an intuitive approach to: + * Search for Metrics — Users should be able to easily search and find relevant metrics. Metrics can serve as the starting point to lead users into exploring dimensions. + * Search for Dimensions — Users should be able to query metrics with associated dimensions, allowing them to gain deeper insights into the data. + * Filter by Dimension Values — Expose and enable users to filter metrics based on dimension values, encouraging data analysis and exploration. + * Filter additional metadata — Allow users to filter metrics based on other available metadata, such as metric type and default time granularity. + +- Suggested Metrics: Ideally, the system should intelligently suggest relevant metrics to users based on their team's activities. This approach encourages user exposure, facilitates learning, and supports collaboration among team members. + +By implementing these recommendations, the data interaction process becomes more user-friendly, empowering users to gain valuable insights without the need for extensive data manipulation. + +### Organization + +We recommend organizing metrics and dimensions in ways that a non-technical user can understand the data model, without needing much context: + +- **Organizing Dimensions** — To help non-technical users understand the data model better, we recommend organizing dimensions based on the entity they originated from. For example, consider dimensions like `user__country` and `product__category`.

      You can create groups by extracting `user` and `product` and then nest the respective dimensions under each group. This way, dimensions align with the entity or semantic model they belong to and make them more user-friendly and accessible. + +- **Organizing Metrics** — The goal is to organize metrics into a hierarchy in our configurations, instead of presenting them in a long list.

      This hierarchy helps you organize metrics based on specific criteria, such as business unit or team. By providing this structured organization, users can find and navigate metrics more efficiently, enhancing their overall data analysis experience. + +### Query flexibility + +Allow users to query either one metric alone without dimensions or multiple metrics with dimensions. + +- Allow toggling between metrics/dimensions seamlessly. + +- Be clear on exposing what dimensions are queryable with what metrics and hide things that don’t apply. (Our APIs provide calls for you to get relevant dimensions for metrics, and vice versa). + +- Only expose time granularities (monthly, daily, yearly) that match the available metrics. + * For example, if a dbt model and its resulting semantic model have a monthly granularity, make sure querying data with a 'daily' granularity isn't available to the user. Our APIs have functionality that will help you surface the correct granularities + +- We recommend that time granularity is treated as a general time dimension-specific concept and that it can be applied to more than just the primary aggregation (or `metric_time`). Consider a situation where a user wants to look at `sales` over time by `customer signup month`; in this situation, having the ability to apply granularities to both time dimensions is crucial. Our APIs include information to fetch the granularities for the primary (metric_time) dimensions, as well as all time dimensions. You can treat each time dimension and granularity selection independently in your application. Note: Initially, as a starting point, it makes sense to only support `metric_time` or the primary time dimension, but we recommend expanding that as your solution evolves. + +- You should allow users to filter on date ranges and expose a calendar and nice presets for filtering these. + * For example, last 30 days, last week, and so on. + +### Context and interpretation + +For better analysis, it's best to have the context of the metrics close to where the analysis is happening. We recommend the following: + +- Expose business definitions of the metrics as well as logical definitions. + +- Expose additional metadata from the Semantic layer (measures, type parameters). + +- Use the [Discovery API](/docs/dbt-cloud-apis/discovery-api) to enhance the metric and build confidence in its accuracy: + * Check if the metric is fresh and when it was last updated. + * Include lineage information to understand the metric's origin. + +- Allow for creating other metadata that’s useful for the metric. We can provide some of this information in our configuration (Display name, Default Granularity for View, Default Time range), but there may be other metadata that your tool wants to provide to make the metric richer. + +### Transparency and using compile + +For transparency and additional context, we recommend you have an easy way for the user to obtain the SQL that MetricFlow generates. Depending on what API you are using, you can do this by using our `compile` parameter. This is incredibly powerful and emphasizes transparency and openness, particularly for technically inclined users. + + +### Where filters and optimization + +In the cases where our APIs support either a string or a filter list for the `where` clause, we always recommend that your application utilizes the filter list in order to gain maximum pushdown benefits. 
The `where` string may be more intuitive for users writing queries during testing, but it will not have the performance benefits of the filter list in a production environment. + +## Understand stages of an integration + +These are recommendations on how to evolve a Semantic Layer integration and not a strict runbook. + +**Stage 1 - The basic** +* Supporting and using [JDBC](/docs/dbt-cloud-apis/sl-jdbc) or [GraphQL](/docs/dbt-cloud-apis/sl-graphql) is the first step. Refer to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) for more technical details. + +**Stage 2 - More discoverability and basic querying** +* Support listing metrics defined in the project +* Listing available dimensions based on one or many metrics +* Querying defined metric values on their own or grouping by available dimensions +* Display metadata from [Discovery API](/docs/dbt-cloud-apis/discovery-api) and other context + +**Stage 3 - More querying flexibility and better user experience (UX)** +* More advanced filtering + * Time filters with good presets/calendar UX + * Filtering metrics on a pre-populated set of dimension values +* Make dimension values more user-friendly by organizing them effectively +* Intelligent filtering of metrics based on available dimensions and vice versa + +**Stage 4 - More custom user interface (UI) / Collaboration** +* A place where users can see all the relevant information about a given metric +* Organize metrics by hierarchy and more advanced search features (such as filter on the type of metric or other metadata) +* Use and expose more metadata +* Querying dimensions without metrics and other more advanced querying functionality +* Suggest metrics to users based on teams/identity, and so on. + + +### Related docs + +- [Use the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) to learn about the product. +- [Build your metrics](/docs/build/build-metrics-intro) for more info about MetricFlow and its components. +- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations. diff --git a/website/docs/quickstarts/snowflake-qs.md b/website/docs/guides/snowflake-qs.md similarity index 98% rename from website/docs/quickstarts/snowflake-qs.md rename to website/docs/guides/snowflake-qs.md index 6d03586e611..abb18276b97 100644 --- a/website/docs/quickstarts/snowflake-qs.md +++ b/website/docs/guides/snowflake-qs.md @@ -1,8 +1,9 @@ --- title: "Quickstart for dbt Cloud and Snowflake" id: "snowflake" -platform: 'dbt-cloud' +level: 'Beginner' icon: 'snowflake' +tags: ['dbt Cloud','Quickstart','Snowflake'] hide_table_of_contents: true --- ## Introduction @@ -35,8 +36,8 @@ You can also watch the [YouTube video on dbt and Snowflake](https://www.youtube. - Learn more with [dbt Courses](https://courses.getdbt.com/collections) - [How we configure Snowflake](https://blog.getdbt.com/how-we-configure-snowflake/) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) @@ -138,7 +139,7 @@ There are two ways to connect dbt Cloud to Snowflake. 
The first option is Partne -Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials. +Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials. 1. In the Snowflake UI, click on the home icon in the upper left corner. In the left sidebar, select **Admin**. Then, select **Partner Connect**. Find the dbt tile by scrolling or by searching for dbt in the search bar. Click the tile to connect to dbt. diff --git a/website/docs/quickstarts/starburst-galaxy-qs.md b/website/docs/guides/starburst-galaxy-qs.md similarity index 99% rename from website/docs/quickstarts/starburst-galaxy-qs.md rename to website/docs/guides/starburst-galaxy-qs.md index 33228710509..1822c83fa90 100644 --- a/website/docs/quickstarts/starburst-galaxy-qs.md +++ b/website/docs/guides/starburst-galaxy-qs.md @@ -1,9 +1,10 @@ --- title: "Quickstart for dbt Cloud and Starburst Galaxy" id: "starburst-galaxy" -platform: 'dbt-cloud' +level: 'Beginner' icon: 'starburst' hide_table_of_contents: true +tags: ['dbt Cloud','Quickstart'] --- ## Introduction diff --git a/website/docs/guides/advanced/using-jinja.md b/website/docs/guides/using-jinja.md similarity index 95% rename from website/docs/guides/advanced/using-jinja.md rename to website/docs/guides/using-jinja.md index 40cfd2af298..9f098bb637f 100644 --- a/website/docs/guides/advanced/using-jinja.md +++ b/website/docs/guides/using-jinja.md @@ -1,15 +1,25 @@ --- -title: "Using Jinja" +title: "Use Jinja to improve your SQL code" id: "using-jinja" +description: "Learn how to improve your SQL code using Jinja." +hoverSnippet: "Improve your SQL code with Jinja" +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Jinja', 'dbt Core'] +level: 'Advanced' +recently_updated: true --- +## Introduction + In this guide, we're going to take a common pattern used in SQL, and then use Jinja to improve our code. If you'd like to work through this query, add [this CSV](https://github.com/dbt-labs/jaffle_shop/blob/core-v1.0.0/seeds/raw_payments.csv) to the `seeds/` folder of your dbt project, and then execute `dbt seed`. While working through the steps of this model, we recommend that you have your compiled SQL open as well, to check what your Jinja compiles to. To do this: * **Using dbt Cloud:** Click the compile button to see the compiled SQL in the right hand pane -* **Using the dbt CLI:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once. +* **Using dbt Core:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once. ## Write the SQL without Jinja Consider a data model in which an `order` can have many `payments`. 
Each `payment` may have a `payment_method` of `bank_transfer`, `credit_card` or `gift_card`, and therefore each `order` can have multiple `payment_methods` diff --git a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md b/website/docs/guides/zapier-ms-teams.md similarity index 90% rename from website/docs/guides/orchestration/webhooks/zapier-ms-teams.md rename to website/docs/guides/zapier-ms-teams.md index aa95b999d4c..66596d590e0 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md +++ b/website/docs/guides/zapier-ms-teams.md @@ -1,11 +1,18 @@ --- title: "Post to Microsoft Teams when a job finishes" -id: webhooks-guide-zapier-ms-teams -slug: zapier-ms-teams -description: Use Zapier and the dbt Cloud API to post to Microsoft Teams +id: zapier-ms-teams +description: Use Zapier and dbt Cloud webhooks to post to Microsoft Teams when a job finishes running. +hoverSnippet: Learn how to use Zapier with dbt Cloud webhooks to post in Microsoft Teams when a job finishes running. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- +## Introduction -This guide will show you how to set up an integration between dbt Cloud jobs and Microsoft Teams using [dbt Cloud Webhooks](/docs/deploy/webhooks) and Zapier, similar to the [native Slack integration](/faqs/accounts/slack). +This guide will show you how to set up an integration between dbt Cloud jobs and Microsoft Teams using [dbt Cloud Webhooks](/docs/deploy/webhooks) and Zapier, similar to the [native Slack integration](/docs/deploy/job-notifications#slack-notifications). When a dbt Cloud job finishes running, the integration will: @@ -14,26 +21,28 @@ When a dbt Cloud job finishes running, the integration will: - Post a summary to a Microsoft Teams channel. ![Screenshot of a message in MS Teams showing a summary of a dbt Cloud run which failed](/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-ui.png) -## Prerequisites + +### Prerequisites In order to set up the integration, you should have familiarity with: - [dbt Cloud Webhooks](/docs/deploy/webhooks) - Zapier -## Integration steps -### 1. Set up the connection between Zapier and Microsoft Teams + +## Set up the connection between Zapier and Microsoft Teams * Install the [Zapier app in Microsoft Teams](https://appsource.microsoft.com/en-us/product/office/WA200002044) and [grant Zapier access to your account](https://zapier.com/blog/how-to-automate-microsoft-teams/). **Note**: To receive the message, add the Zapier app to the team's channel during installation. -### 2. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +## Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. ![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) ### 3. 
Configure a new webhook in dbt Cloud + See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Choose either **Run completed** or **Run errored**, but not both, or you'll get double messages when a run fails. Make note of the Webhook Secret Key for later. @@ -42,14 +51,15 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. -### 4. Store secrets +## Store secrets + In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). -### 5. Add a code action +## Add a code action Select **Code by Zapier** as the App, and **Run Python** as the Event. In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. @@ -141,19 +151,21 @@ for step in run_data_results['run_steps']: output = {'outcome_message': outcome_message} ``` -### 6. Add the Microsoft Teams action +## Add the Microsoft Teams action + Select **Microsoft Teams** as the App, and **Send Channel Message** as the Action. In the **Set up action** area, choose the team and channel. Set the **Message Text Format** to **markdown**, then put **2. Outcome Message** from the Run Python in Code by Zapier output into the **Message Text** field. ![Screenshot of the Zapier UI, showing the mappings of prior steps to an MS Teams message](/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-zap-config.png) -### 7. Test and deploy +## Test and deploy + As you have gone through each step, you should have tested the outputs, so you can now try posting a message into your Teams channel. When you're happy with it, remember to ensure that your `run_id` and `account_id` are no longer hardcoded, then publish your Zap. -## Other notes +### Other notes - If you post to a chat instead of a team channel, you don't need to add the Zapier app to Microsoft Teams. - If you post to a chat instead of a team channel, note that markdown is not supported and you will need to remove the markdown formatting. - If you chose the **Catch Hook** trigger instead of **Catch Raw Hook**, you will need to pass each required property from the webhook as an input instead of running `json.loads()` against the raw body. You will also need to remove the validation code. 
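The code actions in these Zapier guides verify the webhook before calling the dbt Cloud API by recomputing an HMAC-SHA256 digest of the raw request body with your Webhook Secret Key and comparing it to the `Authorization` header that dbt Cloud sends (see [Validate a webhook](/docs/deploy/webhooks#validate-a-webhook)). The following is a minimal sketch of that check for a **Run Python** step; the helper name `validate_signature` and the `DBT_WEBHOOK_KEY` secret name are illustrative assumptions rather than fixed names.

```python
import hashlib
import hmac


def validate_signature(raw_body: str, auth_header: str, secret: str) -> bool:
    # Recompute the HMAC-SHA256 digest of the raw payload using the Webhook
    # Secret Key and compare it to the signature dbt Cloud sent in the header.
    digest = hmac.new(secret.encode("utf-8"), raw_body.encode("utf-8"), hashlib.sha256).hexdigest()
    return hmac.compare_digest(digest, auth_header)


# Inside the Zapier code step, the mapped fields arrive in `input_data` and
# secrets come back from the StoreClient utility, for example:
#   store = StoreClient('your-uuid-secret')
#   secret = store.get('DBT_WEBHOOK_KEY')
#   if not validate_signature(input_data['raw_body'], input_data['auth_header'], secret):
#       raise Exception('Webhook signature mismatch')
```

Raising an exception on a mismatch stops the Zap before it posts anything, which is the behavior you want for unverified payloads.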
diff --git a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md b/website/docs/guides/zapier-new-cloud-job.md similarity index 87% rename from website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md rename to website/docs/guides/zapier-new-cloud-job.md index 49b01d0db7e..b16fa94bc21 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md +++ b/website/docs/guides/zapier-new-cloud-job.md @@ -1,28 +1,34 @@ --- title: "Trigger a dbt Cloud job after a run finishes" -id: webhooks-guide-zapier-new-cloud-job -slug: zapier-new-cloud-job -description: Use Zapier to interact with the dbt Cloud API +id: zapier-new-cloud-job +description: Use Zapier to trigger a dbt Cloud job once a run completes. +hoverSnippet: Learn how to use Zapier to trigger a dbt Cloud job once a run completes. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- +## Introduction + This guide will show you how to trigger a dbt Cloud job based on the successful completion of a different job. This can be useful when you need to trigger a job in a different project. Remember that dbt works best when it understands the whole context of the DAG it has been asked to run, so use this ability judiciously. -## Prerequisites +### Prerequisites In order to set up the integration, you should have familiarity with: - [dbt Cloud Webhooks](/docs/deploy/webhooks) - Zapier -## Integration steps - -### 1. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +## Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. ![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) -### 2. Configure a new webhook in dbt Cloud +## Configure a new webhook in dbt Cloud See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**, and you need to change the **Jobs** list to only contain the job you want to trigger the next run. Make note of the Webhook Secret Key for later. @@ -31,14 +37,14 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. -### 3. Store secrets +## Store secrets In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code.
You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). -### 4. Add a code action +## Add a code action Select **Code by Zapier** as the App, and **Run Python** as the Event. In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. @@ -87,5 +93,6 @@ if hook_data['runStatus'] == "Success": return ``` -### 5. Test and deploy +## Test and deploy + When you're happy with it, remember to ensure that your `account_id` is no longer hardcoded, then publish your Zap. diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md b/website/docs/guides/zapier-refresh-mode-report.md similarity index 89% rename from website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md rename to website/docs/guides/zapier-refresh-mode-report.md index 99680c432b3..5bab165b11d 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md +++ b/website/docs/guides/zapier-refresh-mode-report.md @@ -1,10 +1,18 @@ --- title: "Refresh a Mode dashboard when a job completes" -id: webhooks-guide-zapier-refresh-mode-report -slug: zapier-refresh-mode-report -description: Use Zapier to trigger a Mode dashboard refresh +id: zapier-refresh-mode-report +description: Use Zapier to trigger a Mode dashboard refresh when a dbt Cloud job completes. +hoverSnippet: Learn how to use Zapier to trigger a Mode dashboard refresh when a dbt Cloud job completes. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- +## Introduction + This guide will teach you how to refresh a Mode dashboard when a dbt Cloud job has completed successfully and there is fresh data available. The integration will: - Receive a webhook notification in Zapier @@ -12,23 +20,21 @@ This guide will teach you how to refresh a Mode dashboard when a dbt Cloud job h Although we are using the Mode API for a concrete example, the principles are readily transferrable to your [tool](https://learn.hex.tech/docs/develop-logic/hex-api/api-reference#operation/RunProject) [of](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/refresh-dataset) [choice](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref.htm#update_workbook_now). -## Prerequisites +### Prerequisites In order to set up the integration, you should have familiarity with: - [dbt Cloud Webhooks](/docs/deploy/webhooks) - Zapier - The [Mode API](https://mode.com/developer/api-reference/introduction/) -## Integration steps - -### 1. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +## Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. 
![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) -### 2. Configure a new webhook in dbt Cloud +## Configure a new webhook in dbt Cloud See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**, and you need to change the **Jobs** list to only contain any jobs whose completion should trigger a report refresh. Make note of the Webhook Secret Key for later. @@ -37,20 +43,19 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. -### 3. Store secrets +## Store secrets In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens), as well as a [Mode API token and secret](https://mode.com/developer/api-reference/authentication/). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). - This guide assumes the names for the secret keys are: `DBT_WEBHOOK_KEY`, `MODE_API_TOKEN`, and `MODE_API_SECRET`. If you are using different names, make sure you update all references to them in the sample code. This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3). -#### a. Create a Storage by Zapier connection +### a. Create a Storage by Zapier connection If you haven't already got one, go to https://zapier.com/app/connections/storage and create a new connection. Remember the UUID secret you generate for later. -#### b. Add a temporary code step +### b. Add a temporary code step Choose **Run Python** as the Event. Run the following code: ```python store = StoreClient('abc123') #replace with your UUID secret @@ -60,7 +65,7 @@ store.set('MODE_API_SECRET', 'abc123') #replace with your Mode API Secret ``` Test the step. You can delete this Action when the test succeeds. The key will remain stored as long as it is accessed at least once every three months. -### 4. Add a code action +## Add a code action Select **Code by Zapier** as the App, and **Run Python** as the Event. In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. @@ -124,5 +129,5 @@ if hook_data['runStatus'] == "Success": return ``` -### 5. Test and deploy -You can iterate on the Code step by modifying the code and then running the test again. When you're happy with it, you can publish your Zap. \ No newline at end of file +## Test and deploy +You can iterate on the Code step by modifying the code and then running the test again. When you're happy with it, you can publish your Zap.
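As a companion to the storage step above, here is a minimal sketch of the retrieval side as it might appear at the top of the **Run Python** code action: read the Mode credentials back out of Storage by Zapier and post a report run request. The workspace name, report token, UUID secret, and the report-run route itself are placeholder assumptions for illustration; confirm the exact endpoint in Mode's API reference before relying on it.

```python
import requests

# Read the secrets saved in the temporary code step back out of storage.
store = StoreClient('your-uuid-secret')  # same UUID secret used when storing them
api_token = store.get('MODE_API_TOKEN')
api_secret = store.get('MODE_API_SECRET')

# Placeholder workspace and report token, taken from your Mode report's URL.
workspace = 'your-workspace'
report_token = 'your-report-token'

# Assumed report-run route; check Mode's API reference for the exact path.
run_url = f'https://app.mode.com/api/{workspace}/reports/{report_token}/runs'

# Mode authenticates API requests with HTTP Basic auth using the token and secret.
response = requests.post(run_url, auth=(api_token, api_secret))
response.raise_for_status()

output = {'mode_run_status': response.status_code}
```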
diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md b/website/docs/guides/zapier-refresh-tableau-workbook.md similarity index 90% rename from website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md rename to website/docs/guides/zapier-refresh-tableau-workbook.md index 8751528565c..f614b64eaa2 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md +++ b/website/docs/guides/zapier-refresh-tableau-workbook.md @@ -1,16 +1,24 @@ --- title: "Refresh Tableau workbook with extracts after a job finishes" -id: webhooks-guide-zapier-refresh-tableau-workbook -slug: zapier-refresh-tableau-workbook -description: Use Zapier to trigger a Tableau workbook refresh +id: zapier-refresh-tableau-workbook +description: Use Zapier to trigger a Tableau workbook refresh once a dbt Cloud job completes successfully. +hoverSnippet: Learn how to use Zapier to trigger a Tableau workbook refresh once a dbt Cloud job completes successfully. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- +## Introduction + This guide will teach you how to refresh a Tableau workbook that leverages [extracts](https://help.tableau.com/current/pro/desktop/en-us/extracting_data.htm) when a dbt Cloud job has completed successfully and there is fresh data available. The integration will: - Receive a webhook notification in Zapier - Trigger a refresh of a Tableau workbook -## Prerequisites +### Prerequisites To set up the integration, you need to be familiar with: @@ -19,19 +27,18 @@ To set up the integration, you need to be familiar with: - The [Tableau API](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api.htm) - The [version](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_concepts_versions.htm#rest_api_versioning) of Tableau's REST API that is compatible with your server -## Integration steps -### 1. Obtain authentication credentials from Tableau +## Obtain authentication credentials from Tableau To authenticate with the Tableau API, obtain a [Personal Access Token](https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm) from your Tableau Server/Cloud instance. In addition, make sure your Tableau workbook uses data sources that allow refresh access, which is usually set when publishing. -### 2. Create a new Zap in Zapier -To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead. +## Create a new Zap in Zapier +To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. ![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) -### 3. 
Configure a new webhook in dbt Cloud +## Configure a new webhook in dbt Cloud To set up a webhook subscription for dbt Cloud, follow the instructions in [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription). For the event, choose **Run completed** and modify the **Jobs** list to include only the jobs that should trigger a report refresh. Remember to save the Webhook Secret Key for later. Paste in the webhook URL obtained from Zapier in step 2 into the **Endpoint** field and test the endpoint. @@ -40,7 +47,7 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. -### 4. Store secrets +## Store secrets In the next step, you will need the Webhook Secret Key from the prior step, and your Tableau authentication credentials and details. Specifically, you'll need your Tableau server/site URL, server/site name, PAT name, and PAT secret. Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). @@ -49,11 +56,11 @@ This guide assumes the names for the secret keys are: `DBT_WEBHOOK_KEY`, `TABLEA This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3). -#### a. Create a Storage by Zapier connection +### a. Create a Storage by Zapier connection Create a new connection at https://zapier.com/app/connections/storage if you don't already have one and remember the UUID secret you generate for later. -#### b. Add a temporary code step +### b. Add a temporary code step Choose **Run Python** as the Event and input the following code: @@ -68,7 +75,7 @@ store.set('TABLEAU_API_TOKEN_SECRET', 'abc123') #replace with your Tableau API S Test the step to run the code. You can delete this action when the test succeeds. The keys will remain stored as long as it is accessed at least once every three months. -### 5. Add a code action +## Add a code action Select **Code by Zapier** as the App, and **Run Python** as the Event. In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. @@ -161,5 +168,5 @@ refresh_trigger = requests.post(refresh_url, data=json.dumps(refresh_data), head return {"message": "Workbook refresh has been queued"} ``` -### 6. Test and deploy +## Test and deploy To make changes to your code, you can modify it and test it again. When you're happy with it, you can publish your Zap. 
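To make the Tableau portion of the code action more concrete, here is a compressed, illustrative sketch of the two REST calls it revolves around: signing in with the Personal Access Token and then queueing a refresh for one workbook. The server URL, API version, workbook ID, UUID secret, and secret key names are assumptions for illustration, and the payload shapes should be checked against the REST API reference for your server's API version.

```python
import requests

TABLEAU_SERVER = 'https://my-tableau-server.example.com'  # placeholder server URL
API_VERSION = '3.19'                                      # match your server's REST API version
WORKBOOK_ID = 'your-workbook-luid'                        # placeholder workbook identifier

store = StoreClient('your-uuid-secret')  # same UUID secret used in the storage step

# Sign in with the Personal Access Token to obtain a session token and site ID.
signin_body = {
    'credentials': {
        'personalAccessTokenName': store.get('TABLEAU_API_TOKEN_NAME'),
        'personalAccessTokenSecret': store.get('TABLEAU_API_TOKEN_SECRET'),
        'site': {'contentUrl': store.get('TABLEAU_SITE_NAME')},
    }
}
signin = requests.post(
    f'{TABLEAU_SERVER}/api/{API_VERSION}/auth/signin',
    json=signin_body,
    headers={'Accept': 'application/json'},
)
signin.raise_for_status()
credentials = signin.json()['credentials']

# Queue an extract refresh for the workbook ("Update Workbook Now").
refresh = requests.post(
    f"{TABLEAU_SERVER}/api/{API_VERSION}/sites/{credentials['site']['id']}/workbooks/{WORKBOOK_ID}/refreshes",
    json={},
    headers={'X-Tableau-Auth': credentials['token'], 'Accept': 'application/json'},
)
refresh.raise_for_status()

output = {'message': 'Workbook refresh has been queued'}
```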
diff --git a/website/docs/guides/orchestration/webhooks/zapier-slack.md b/website/docs/guides/zapier-slack.md similarity index 93% rename from website/docs/guides/orchestration/webhooks/zapier-slack.md rename to website/docs/guides/zapier-slack.md index d3b0473502b..61b96658f95 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-slack.md +++ b/website/docs/guides/zapier-slack.md @@ -1,11 +1,19 @@ --- title: "Post to Slack with error context when a job fails" -id: webhooks-guide-zapier-slack -slug: zapier-slack -description: Use Zapier and the dbt Cloud API to post error context to Slack +id: zapier-slack +description: Use a webhook or Slack message to trigger Zapier and post error context in Slack when a job fails. +hoverSnippet: Learn how to use a webhook or Slack message to trigger Zapier to post error context in Slack when a job fails. +# time_to_complete: '30 minutes' commenting out until we test +icon: 'guides' +hide_table_of_contents: true +tags: ['Webhooks'] +level: 'Advanced' +recently_updated: true --- -This guide will show you how to set up an integration between dbt Cloud jobs and Slack using [dbt Cloud webhooks](/docs/deploy/webhooks) and Zapier. It builds on the native [native Slack integration](/faqs/accounts/slack) by attaching error message details of models and tests in a thread. +## Introduction + +This guide will show you how to set up an integration between dbt Cloud jobs and Slack using [dbt Cloud webhooks](/docs/deploy/webhooks) and Zapier. It builds on the native [native Slack integration](/docs/deploy/job-notifications#slack-notifications) by attaching error message details of models and tests in a thread. Note: Because there is not a webhook for Run Cancelled, you may want to keep the standard Slack integration installed to receive those notifications. You could also use the [alternative integration](#alternate-approach) that augments the native integration without replacing it. @@ -17,21 +25,20 @@ When a dbt Cloud job finishes running, the integration will: - Create a threaded message attached to that post which contains any reasons that the job failed ![Screenshot of a message in Slack showing a summary of a dbt Cloud run which failed](/img/guides/orchestration/webhooks/zapier-slack/slack-thread-example.png) -## Prerequisites + +### Prerequisites In order to set up the integration, you should have familiarity with: - [dbt Cloud webhooks](/docs/deploy/webhooks) - Zapier -## Integration steps - -### 1. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. -Click **Continue**, then copy the webhook URL. +## Create a new Zap in Zapier +1. Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +2. Click **Continue**, then copy the webhook URL. ![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) -### 2. Configure a new webhook in dbt Cloud +## Configure a new webhook in dbt Cloud See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Choose **Run completed** as the Event. 
You can alternatively choose **Run errored**, but you will need to account for the fact that the necessary metadata [might not be available immediately](/docs/deploy/webhooks#completed-errored-event-difference). Remember the Webhook Secret Key for later. @@ -40,7 +47,7 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test The sample body's values are hardcoded and not reflective of your project, but they give Zapier a correctly-shaped object during development. -### 3. Store secrets +## Store secrets In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps). This prevents your keys from being displayed as plaintext in the Zap code. You can access them with the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). @@ -48,7 +55,7 @@ Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8 -### 4. Add a code action +## Add a code action Select **Code by Zapier** as the App, and **Run Python** as the Event. In the **Set up action** section, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the previous **Catch Raw Hook** step. @@ -153,7 +160,7 @@ send_error_thread = len(threaded_errors_post) > 0 output = {'step_summary_post': step_summary_post, 'send_error_thread': send_error_thread, 'threaded_errors_post': threaded_errors_post} ``` -### 5. Add Slack actions in Zapier +## Add Slack actions in Zapier Select **Slack** as the App, and **Send Channel Message** as the Action. In the **Action** section, choose which **Channel** to post to. Set the **Message Text** field to **2. Step Summary Post** from the Run Python in Code by Zapier output. @@ -170,11 +177,11 @@ Add another **Send Channel Message in Slack** action. In the **Action** section, ![Screenshot of the Zapier UI, showing the mappings of prior steps to a Slack message](/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config.png) -### 7. Test and deploy +## Test and deploy When you're done testing your Zap, make sure that your `run_id` and `account_id` are no longer hardcoded in the Code step, then publish your Zap. -## Alternate approach +## Alternately, use a dbt Cloud app Slack message to trigger Zapier Instead of using a webhook as your trigger, you can keep the existing dbt Cloud app installed in your Slack workspace and use its messages being posted to your channel as the trigger. In this case, you can skip validating the webhook and only need to load the context from the thread. diff --git a/website/docs/quickstarts/manual-install-qs.md b/website/docs/quickstarts/manual-install-qs.md deleted file mode 100644 index ea3c6c7ec84..00000000000 --- a/website/docs/quickstarts/manual-install-qs.md +++ /dev/null @@ -1,460 +0,0 @@ ---- -title: "Quickstart for dbt Core from a manual install" -id: manual-install -description: "Connecting your warehouse to dbt Core using the CLI." 
-sidebar_label: "Manual install quickstart" -platform: 'dbt-core' -icon: 'fa-light fa-square-terminal' -hide_table_of_contents: true ---- -## Introduction - -When you use dbt Core to work with dbt, you will be editing files locally using a code editor, and running projects using the dbt command line interface (dbt CLI). If you'd rather edit files and run projects using the web-based Integrated Development Environment (IDE), you should refer to the [dbt Cloud quickstarts](/quickstarts). - -### Prerequisites - -* To use the dbt CLI, it's important that you know some basics of the Terminal. In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily. -* Install dbt Core using the [installation instructions](/docs/core/installation) for your operating system. -* Complete [Setting up (in BigQuery)](/quickstarts/bigquery?step=2) and [Loading data (BigQuery)](/quickstarts/bigquery?step=3). -* [Create a GitHub account](https://github.com/join) if you don't already have one. - -## Create a starter project - -After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models. - -### Create a repository - -The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join). - -1. [Create a new GitHub repository](https://github.com/new) named `dbt-tutorial`. -2. Select **Public** so the repository can be shared with others. You can always make it private later. -3. Leave the default values for all other settings. -4. Click **Create repository**. -5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes). - -### Create a project - -Learn how to use a series of commands using the command line of the Terminal to create your project. dbt Core includes an `init` command that helps scaffold a dbt project. - -To create your dbt project: - -1. Make sure you have dbt Core installed and check the version using the `dbt --version` command: - - ```terminal - dbt --version - ``` - -2. Initiate the `jaffle_shop` project using the `init` command: - - ```terminal - dbt init jaffle_shop - ``` - -3. Navigate into your project's directory: - - ```terminal - cd jaffle_shop - ``` - -4. Use `pwd` to confirm that you are in the right spot: - - ```terminal - $ pwd - > Users/BBaggins/dbt-tutorial/jaffle_shop - ``` - -5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command. - -
      - -
      - -6. Update the following values in the `dbt_project.yml` file: - - - - ```yaml - name: jaffle_shop # Change from the default, `my_new_project` - - ... - - profile: jaffle_shop # Change from the default profile name, `default` - - ... - - models: - jaffle_shop: # Change from `my_new_project` to match the previous value for `name:` - ... - ``` - - - -### Connect to BigQuery - -When developing locally, dbt connects to your using a [profile](/docs/core/connect-data-platform/connection-profiles), which is a YAML file with all the connection details to your warehouse. - -1. Create a file in the `~/.dbt/` directory named `profiles.yml`. -2. Move your BigQuery keyfile into this directory. -3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted. - - - - ```yaml - jaffle_shop: # this needs to match the profile in your dbt_project.yml file - target: dev - outputs: - dev: - type: bigquery - method: service-account - keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile - project: grand-highway-265418 # Replace this with your project id - dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo - threads: 1 - timeout_seconds: 300 - location: US - priority: interactive - ``` - - - -4. Run the `debug` command from your project to confirm that you can successfully connect: - - ```terminal - $ dbt debug - > Connection test: OK connection ok - ``` - -
      - -
      - -#### FAQs - - - - - - - -### Perform your first dbt run - -Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order. - -1. Enter the `run` command to build example models: - - ```terminal - dbt run - ``` - -You should have an output that looks like this: -
      - -
      - -### Commit your changes - -Commit your changes so that the repository contains the latest code. - -1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository). - - ```terminal - git init - git branch -M main - git add . - git commit -m "Create a dbt project" - git remote add origin https://github.com/USERNAME/dbt-tutorial.git - git push -u origin main - ``` - -2. Return to your GitHub repository to verify your new files have been added. - -## Build your first models - -Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! You will take a sample query and turn it into a model in your dbt project. - -### Checkout a new git branch - -Check out a new git branch to work on new code: - -1. Create a new branch by using the `checkout` command and passing the `-b` flag: - - ```terminal - $ git checkout -b add-customers-model - > Switched to a new branch `add-customer-model` - ``` - -### Build your first model - -1. Open your project in your favorite code editor. -2. Create a new SQL file in the `models` directory, named `models/customers.sql`. -3. Paste the following query into the `models/customers.sql` file. - - - -4. From the command line, enter `dbt run`. -
      - -
      - -When you return to the BigQuery console, you can `select` from this model. - -#### FAQs - - - - - - - -### Change the way your model is materialized - - - - - -### Delete the example models - - - -### Build models on top of other models - - - -1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. -2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. - - - -
      - - - - ```sql - select - id as customer_id, - first_name, - last_name - - from `dbt-tutorial`.jaffle_shop.customers - ``` - - - - - - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status - - from `dbt-tutorial`.jaffle_shop.orders - ``` - - - -
      - -
      - - - - ```sql - select - id as customer_id, - first_name, - last_name - - from jaffle_shop_customers - ``` - - - - - - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status - - from jaffle_shop_orders - ``` - - - -
      - -
      - - - - ```sql - select - id as customer_id, - first_name, - last_name - - from jaffle_shop.customers - ``` - - - - - - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status - - from jaffle_shop.orders - ``` - - - -
      - -
      - - - - ```sql - select - id as customer_id, - first_name, - last_name - - from raw.jaffle_shop.customers - ``` - - - - - - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status - - from raw.jaffle_shop.orders - ``` - - - -
      - -
      - -3. Edit the SQL in your `models/customers.sql` file as follows: - - - - ```sql - with customers as ( - - select * from {{ ref('stg_customers') }} - - ), - - orders as ( - - select * from {{ ref('stg_orders') }} - - ), - - customer_orders as ( - - select - customer_id, - - min(order_date) as first_order_date, - max(order_date) as most_recent_order_date, - count(order_id) as number_of_orders - - from orders - - group by 1 - - ), - - final as ( - - select - customers.customer_id, - customers.first_name, - customers.last_name, - customer_orders.first_order_date, - customer_orders.most_recent_order_date, - coalesce(customer_orders.number_of_orders, 0) as number_of_orders - - from customers - - left join customer_orders using (customer_id) - - ) - - select * from final - - ``` - - - -4. Execute `dbt run`. - - This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. - -#### FAQs {#faq-2} - - - - - -### Next steps - - - -You can also explore: - -* The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL. -* The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the select statements that are running and the python logging happening when dbt runs. - -## Test and document your project - -### Add tests to your models - - - -### Document your models - - - -3. Run `dbt docs serve` command to launch the documentation in a local website. - -#### FAQs - - - - - -#### Next steps - - - -### Commit updated changes - -You need to commit the changes you made to the project so that the repository has your latest code. - -1. Add all your changes to git: `git add -A` -2. Commit your changes: `git commit -m "Add customers model, tests, docs"` -3. Push your changes to your repository: `git push` -4. Navigate to your repository, and open a pull request to merge the code into your master branch. - -## Schedule a job - -We recommend using dbt Cloud to schedule a job. For more information about using dbt Core to schedule a job, see [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post or [deployments](/docs/deploy/deployments). diff --git a/website/docs/reference/analysis-properties.md b/website/docs/reference/analysis-properties.md index 008da70f9db..880aeddbb0d 100644 --- a/website/docs/reference/analysis-properties.md +++ b/website/docs/reference/analysis-properties.md @@ -2,7 +2,9 @@ title: Analysis properties --- -We recommend you define analysis properties in your `analyses/` directory, which is illustrated in the [`analysis-paths`](/reference/project-configs/analysis-paths) configuration. +import PropsCallout from '/snippets/_config-prop-callout.md'; + +We recommend you define analysis properties in your `analyses/` directory, which is illustrated in the [`analysis-paths`](/reference/project-configs/analysis-paths) configuration.
You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `analyses/` or `models/` directory. @@ -28,10 +30,3 @@ analyses: ``` - - - - -* `v0.16.0`: The ability to declare analysis properties was introduced. - - diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md index b20c1548d99..859fde7c908 100644 --- a/website/docs/reference/artifacts/dbt-artifacts.md +++ b/website/docs/reference/artifacts/dbt-artifacts.md @@ -3,12 +3,15 @@ title: "About dbt artifacts" sidebar_label: "About dbt artifacts" --- -With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power: +With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`semantic_manifest.json`, `manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power: + - [documentation](/docs/collaborate/documentation) - [state](/reference/node-selection/syntax#about-node-selection) - [visualizing source freshness](/docs/build/sources#snapshotting-source-data-freshness) They could also be used to: + +- gain insights into your [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) - calculate project-level test coverage - perform longitudinal analysis of run timing - identify historical changes in structure @@ -19,6 +22,7 @@ dbt has produced artifacts since the release of dbt-docs in v0.11.0. Starting in ## When are artifacts produced? Most dbt commands (and corresponding RPC methods) produce artifacts: +- [semantic manifest](/docs/dbt-cloud-apis/sl-manifest): Lives in the `/target` directory of your dbt project and contains information about your Semantic Layer definitions, such as semantic models and metrics. - [manifest](/reference/artifacts/manifest-json): produced by commands that read and understand your project - [run results](/reference/artifacts/run-results-json): produced by commands that run, compile, or catalog nodes in your DAG - [catalog](catalog-json): produced by `docs generate` @@ -26,8 +30,6 @@ Most dbt commands (and corresponding RPC methods) produce artifacts: ## Common metadata -New in v0.19.0 - All artifacts produced by dbt include a `metadata` dictionary with these properties: - `dbt_version`: Version of dbt that produced this artifact.
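As a quick illustration of the common `metadata` block described above, the following sketch assumes `jq` is installed and that the project has already been built, so the artifact files exist under `target/`:

```shell
# Print the shared metadata block that every artifact carries
jq '.metadata' target/manifest.json

# Compare the dbt version recorded in two different artifacts
jq '.metadata.dbt_version' target/manifest.json target/run_results.json
```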
diff --git a/website/docs/reference/artifacts/manifest-json.md b/website/docs/reference/artifacts/manifest-json.md index 3a916ed6d4c..47a9849eda5 100644 --- a/website/docs/reference/artifacts/manifest-json.md +++ b/website/docs/reference/artifacts/manifest-json.md @@ -3,15 +3,9 @@ title: "Manifest JSON file" sidebar_label: "Manifest" --- -| dbt Core version | Manifest version | -|------------------|---------------------------------------------------------------| -| v1.6 | [v10](https://schemas.getdbt.com/dbt/manifest/v10/index.html) | -| v1.5 | [v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html) | -| v1.4 | [v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html) | -| v1.3 | [v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html) | -| v1.2 | [v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html) | -| v1.1 | [v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html) | -| v1.0 | [v4](https://schemas.getdbt.com/dbt/manifest/v4/index.html) | +import ManifestVersions from '/snippets/_manifest-versions.md'; + + **Produced by:** Any command that parses your project. This includes all commands **except** [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), [`init`](/reference/commands/init) @@ -53,12 +47,4 @@ You can refer to [dbt JSON Schema](https://schemas.getdbt.com/) for info on desc **Note**: The `manifest.json` version number is related to (but not _equal_ to) your dbt version, so you _must_ use the correct `manifest.json` version for your dbt version. To find the correct `manifest.json` version, select the dbt version on the top navigation (such as `v1.5`). -Use the following table to understand how the versioning pattern works and match the Manifest version with the dbt version: - -| dbt version | Manifest version | -| ----------- | ---------------- | -| `v1.5` | [Manifest v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html) -| `v1.4` | [Manifest v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html) -| `v1.3` | [Manifest v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html) -| `v1.2` | [Manifest v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html) -| `v1.1` | [Manifest v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html) +Refer to the table at the beginning of [this page](/reference/artifacts/manifest-json) to understand how the Manifest version matches the dbt version. diff --git a/website/docs/reference/artifacts/other-artifacts.md b/website/docs/reference/artifacts/other-artifacts.md index d776bc8a099..205bdfc1a14 100644 --- a/website/docs/reference/artifacts/other-artifacts.md +++ b/website/docs/reference/artifacts/other-artifacts.md @@ -39,4 +39,8 @@ This file is useful for investigating performance issues in dbt Core's graph alg It is more anonymized and compact than [`manifest.json`](/reference/artifacts/manifest-json) and [`graph.gpickle`](#graph.gpickle). -It contains only the `name` and `type` of each node along with IDs of its child nodes (`succ`). It includes that information at two separate points in time: immediately after the graph is linked together (`linked`), and after test edges have been added (`with_test_edges`). +It includes that information at two separate points in time: +1. `linked` — immediately after the graph is linked together, and +2. `with_test_edges` — after test edges have been added. + +Each of those points in time contains the `name` and `type` of each node and `succ` contains the keys of its child nodes. 
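For a quick look at that structure, the following sketch assumes `jq` is installed and that `target/graph_summary.json` exists with the top-level `linked` and `with_test_edges` keys described above:

```shell
# List the top-level keys (expected to include "linked" and "with_test_edges")
jq 'keys' target/graph_summary.json

# Show one node entry from the "linked" snapshot: its name, type, and succ keys
jq '.linked | to_entries | .[0]' target/graph_summary.json
```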
diff --git a/website/docs/reference/artifacts/run-results-json.md b/website/docs/reference/artifacts/run-results-json.md index dd92a9c4e53..5b3549db55b 100644 --- a/website/docs/reference/artifacts/run-results-json.md +++ b/website/docs/reference/artifacts/run-results-json.md @@ -3,7 +3,7 @@ title: "Run results JSON file" sidebar_label: "Run results" --- -**Current schema**: [`v4`](https://schemas.getdbt.com/dbt/run-results/v4/index.html) +**Current schema**: [`v5`](https://schemas.getdbt.com/dbt/run-results/v5/index.html) **Produced by:** [`build`](/reference/commands/build) diff --git a/website/docs/reference/commands/clean.md b/website/docs/reference/commands/clean.md index 0185b701740..23a3f6080ce 100644 --- a/website/docs/reference/commands/clean.md +++ b/website/docs/reference/commands/clean.md @@ -4,12 +4,6 @@ sidebar_label: "clean" id: "clean" --- - - -- **v1.0.0:** `dbt_modules` has been replaced by `dbt_packages` by default for the [clean-target](/reference/project-configs/clean-targets) for packages. - - - `dbt clean` is a utility function that deletes all folders specified in the [`clean-targets`](/reference/project-configs/clean-targets) list specified in `dbt_project.yml`. You can use this to delete the `dbt_packages` and `target` directories. To avoid complex permissions issues and potentially deleting crucial aspects of the remote file system without access to fix them, this command does not work when interfacing with the RPC server that powers the dbt Cloud IDE. Instead, when working in dbt Cloud, the `dbt deps` command cleans before it installs packages automatically. The `target` folder can be manually deleted from the sidebar file tree if needed. diff --git a/website/docs/reference/commands/clone.md b/website/docs/reference/commands/clone.md index 32c8a89be04..6bdc2c02e07 100644 --- a/website/docs/reference/commands/clone.md +++ b/website/docs/reference/commands/clone.md @@ -13,15 +13,16 @@ The `dbt clone` command clones selected nodes from the [specified state](/refere The `clone` command is useful for: - blue/green continuous deployment (on data warehouses that support zero-copy cloning tables) - cloning current production state into development schema(s) -- handling incremental models in Slim CI dbt Cloud jobs (on data warehouses that support zero-copy cloning tables) +- handling incremental models in dbt Cloud CI jobs (on data warehouses that support zero-copy cloning tables) - testing code changes on downstream dependencies in your BI tool + ```bash # clone all of my models from specified state to my target schema(s) dbt clone --state path/to/artifacts # clone one_specific_model of my models from specified state to my target schema(s) -dbt clone --select one_specific_model --state path/to/artifacts +dbt clone --select "one_specific_model" --state path/to/artifacts # clone all of my models from specified state to my target schema(s) and recreate all pre-existing relations in the current target dbt clone --state path/to/artifacts --full-refresh @@ -37,3 +38,19 @@ Unlike deferral, `dbt clone` requires some compute and creation of additional ob For example, by creating actual data warehouse objects, `dbt clone` allows you to test out your code changes on downstream dependencies _outside of dbt_ (such as a BI tool). As another example, you could `clone` your modified incremental models as the first step of your dbt Cloud CI job to prevent costly `full-refresh` builds for warehouses that support zero-copy cloning. 
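To make that CI example concrete, a hypothetical job step might look like the sketch below; the `path/to/prod-artifacts` directory and the selectors are illustrative and assume production artifacts have already been downloaded:

```shell
# Clone only the modified incremental models, using production state as the baseline
dbt clone --select "state:modified+,config.materialized:incremental" --state path/to/prod-artifacts

# Then build the modified models and their children, deferring unchanged parents to production
dbt build --select "state:modified+" --defer --state path/to/prod-artifacts
```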
+ +## Cloning in dbt Cloud + +You can clone nodes between states in dbt Cloud using the `dbt clone` command. This is available in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) and the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) and relies on the [`--defer`](/reference/node-selection/defer) feature. For more details on defer in dbt Cloud, read [Using defer in dbt Cloud](/docs/cloud/about-cloud-develop-defer). + +- **Using dbt Cloud CLI** — The `dbt clone` command in the dbt Cloud CLI automatically includes the `--defer` flag. This means you can use the `dbt clone` command without any additional setup. + +- **Using dbt Cloud IDE** — To use the `dbt clone` command in the dbt Cloud IDE, follow these steps before running the `dbt clone` command: + + - Set up your **Production environment** and have a successful job run. + - Enable **Defer to production** by toggling the switch in the lower-right corner of the command bar. + + - Run the `dbt clone` command from the command bar. + + +Check out [this Developer blog post](https://docs.getdbt.com/blog/to-defer-or-to-clone) for more details on best practices when to use `dbt clone` vs. deferral. diff --git a/website/docs/reference/commands/cmd-docs.md b/website/docs/reference/commands/cmd-docs.md index 754c5e93baf..bc4840464b8 100644 --- a/website/docs/reference/commands/cmd-docs.md +++ b/website/docs/reference/commands/cmd-docs.md @@ -19,6 +19,18 @@ The command is responsible for generating your project's documentation website b dbt docs generate ``` + + +Use the `--select` argument to limit the nodes included within `catalog.json`. When this flag is provided, step (3) will be restricted to the selected nodes. All other nodes will be excluded. Step (2) is unaffected. + +**Example**: +```shell +dbt docs generate --select +orders +``` + + + + Use the `--no-compile` argument to skip re-compilation. When this flag is provided, `dbt docs generate` will skip step (2) described above. **Example**: diff --git a/website/docs/reference/commands/compile.md b/website/docs/reference/commands/compile.md index a9821c0ff12..cde65b7c6b6 100644 --- a/website/docs/reference/commands/compile.md +++ b/website/docs/reference/commands/compile.md @@ -29,7 +29,7 @@ This will log the compiled SQL to the terminal, in addition to writing to the `t For example: ```bash -dbt compile --select stg_payments +dbt compile --select "stg_payments" dbt compile --inline "select * from {{ ref('raw_orders') }}" ``` @@ -37,7 +37,7 @@ returns the following: ```bash -dbt compile --select stg_orders +dbt compile --select "stg_orders" 21:17:09 Running with dbt=1.5.0-b5 21:17:09 Found 5 models, 20 tests, 0 snapshots, 0 analyses, 425 macros, 0 operations, 3 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups 21:17:09 @@ -67,8 +67,8 @@ select * from renamed The command accesses the data platform to cache-related metadata, and to run introspective queries. Use the flags: -- `--no-populate-cache` to disable the initial cache population. If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query. -- `--no-introspect` to disable [introspective queries](/faqs/warehouse/db-connection-dbt-compile#introspective-queries). dbt will raise an error if a model's definition requires running one. +- `--no-populate-cache` to disable the initial cache population. If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query. This is a `dbt` flag, which means you need to add `dbt` as a prefix. 
For example: `dbt --no-populate-cache`. +- `--no-introspect` to disable [introspective queries](/faqs/warehouse/db-connection-dbt-compile#introspective-queries). dbt will raise an error if a model's definition requires running one. This is a `dbt compile` flag, which means you need to add `dbt compile` as a prefix. For example: `dbt compile --no-introspect`. ### FAQs diff --git a/website/docs/reference/commands/deps.md b/website/docs/reference/commands/deps.md index 4c7a36606e2..60ccd091ad7 100644 --- a/website/docs/reference/commands/deps.md +++ b/website/docs/reference/commands/deps.md @@ -57,3 +57,31 @@ Installing calogica/dbt_date@0.4.0 Updates available for packages: ['tailsdotcom/dbt_artifacts', 'dbt-labs/snowplow'] Update your versions in packages.yml, then run dbt deps ``` + + + +The first time you run `dbt deps`, dbt generates a `package-lock.yml` file in the _project_root_ where `packages.yml` is recorded; it contains all of the resolved packages. Each subsequent run records the packages installed in this file. If subsequent `dbt deps` runs contain no updated packages in `dependencies.yml` or `packages.yml`, dbt-core installs from `package-lock.yml`. + +When you update the package spec and run `dbt deps` again, the package-lock and package files update accordingly. You can run `dbt deps --lock` to update the `package-lock.yml` with the most recent dependencies from `packages`. + +The `--add-package` flag allows you to add a package to the `packages.yml` with configurable `--version` and `--source` information. The `--dry-run` flag, when set to `False` (the default), recompiles the `package-lock.yml` file after a new package is added to the `packages.yml` file. Set the flag to `True` if you don't want the changes to persist. + +Examples of the `--add-package` flag: +```shell +# add package from hub (--source arg defaults to "hub") +dbt deps --add-package dbt-labs/dbt_utils@1.0.0 + +# add package from hub with semantic version range +dbt deps --add-package dbt-labs/snowplow@">=0.7.0,<0.8.0" + +# add package from git +dbt deps --add-package https://github.com/fivetran/dbt_amplitude@v0.3.0 --source git + +# add package from local +dbt deps --add-package /opt/dbt/redshift --source local + +# add package to packages.yml and package-lock.yml WITHOUT actually installing dependencies +dbt deps --add-package dbt-labs/dbt_utils@1.0.0 --dry-run + +``` + diff --git a/website/docs/reference/commands/init.md b/website/docs/reference/commands/init.md index 468bee5ff60..e9cc2ccba4e 100644 --- a/website/docs/reference/commands/init.md +++ b/website/docs/reference/commands/init.md @@ -17,46 +17,28 @@ Then, it will: - Create a new folder with your project name and sample files, enough to get you started with dbt - Create a connection profile on your local machine. The default location is `~/.dbt/profiles.yml`. Read more in [configuring your profile](/docs/core/connect-data-platform/connection-profiles). -## Existing project + -If you've just cloned or downloaded an existing dbt project, `dbt init` can still help you set up your connection profile so that you can start working quickly. It will prompt you for connection information, as above, and add a profile (using the `profile` name from the project) to your local `profiles.yml`, or create the file if it doesn't already exist. +When using `dbt init` to initialize your project, include the `--profile` flag to specify an existing profile from `profiles.yml` to use as the project's `profile:` key instead of creating a new one. For example, `dbt init --profile profile_name`.
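As an illustrative sketch of the `--profile` flag (the project and profile names below are hypothetical), initializing a new project against a profile that already exists in `~/.dbt/profiles.yml` might look like:

```shell
# Reuse the existing "jaffle_shop" entry from ~/.dbt/profiles.yml instead of
# being prompted for connection details; the profile must already exist
dbt init my_new_project --profile jaffle_shop
```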
-## profile_template.yml -`dbt init` knows how to prompt for connection information by looking for a file named `profile_template.yml`. It will look for this file in two places: -- **Adapter plugin:** What's the bare minumum Postgres profile? What's the type of each field, what are its defaults? This information is stored in a file called [`dbt/include/postgres/profile_template.yml`](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml). If you're the maintainer of an adapter plugin, we highly recommend that you add a `profile_template.yml` to your plugin, too. See more details in [building-a-new-adapter](/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter). +If the profile does not exist in `profiles.yml` or the command is run inside an existing project, the command raises an error. -- **Existing project:** If you're the maintainer of an existing project, and you want to help new users get connected to your database quickly and easily, you can include your own custom `profile_template.yml` in the root of your project, alongside `dbt_project.yml`. For common connection attributes, set the values in `fixed`; leave user-specific attributes in `prompts`, but with custom hints and defaults as you'd like. + - +## Existing project - +If you've just cloned or downloaded an existing dbt project, `dbt init` can still help you set up your connection profile so that you can start working quickly. It will prompt you for connection information, as above, and add a profile (using the `profile` name from the project) to your local `profiles.yml`, or create the file if it doesn't already exist. -```yml -fixed: - account: abc123 - authenticator: externalbrowser - database: analytics - role: transformer - type: snowflake - warehouse: transforming -prompts: - user: - type: string - hint: yourname@jaffleshop.com - schema: - type: string - hint: usually dbt_ - threads: - hint: "your favorite number, 1-10" - type: int - default: 8 -``` - +## profile_template.yml - +`dbt init` knows how to prompt for connection information by looking for a file named `profile_template.yml`. It will look for this file in two places: + +- **Adapter plugin:** What's the bare minumum Postgres profile? What's the type of each field, what are its defaults? This information is stored in a file called [`dbt/include/postgres/profile_template.yml`](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml). If you're the maintainer of an adapter plugin, we highly recommend that you add a `profile_template.yml` to your plugin, too. Refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide for more information. + +- **Existing project:** If you're the maintainer of an existing project, and you want to help new users get connected to your database quickly and easily, you can include your own custom `profile_template.yml` in the root of your project, alongside `dbt_project.yml`. For common connection attributes, set the values in `fixed`; leave user-specific attributes in `prompts`, but with custom hints and defaults as you'd like. diff --git a/website/docs/reference/commands/list.md b/website/docs/reference/commands/list.md index 6084b3dec70..5caabdc2b2e 100644 --- a/website/docs/reference/commands/list.md +++ b/website/docs/reference/commands/list.md @@ -8,9 +8,10 @@ id: "list" The `dbt ls` command lists resources in your dbt project. 
It accepts selector arguments that are similar to those provided in [dbt run](/reference/commands/run). `dbt list` is an alias for `dbt ls`. While `dbt ls` will read your [connection profile](/docs/core/connect-data-platform/connection-profiles) to resolve [`target`](/reference/dbt-jinja-functions/target)-specific logic, this command will not connect to your database or run any queries. ### Usage + ``` dbt ls - [--resource-type {model,source,seed,snapshot,metric,test,exposure,analysis,default,all}] + [--resource-type {model,semantic_model,source,seed,snapshot,metric,test,exposure,analysis,default,all}] [--select SELECTION_ARG [SELECTION_ARG ...]] [--models SELECTOR [SELECTOR ...]] [--exclude SELECTOR [SELECTOR ...]] @@ -85,7 +86,7 @@ $ dbt ls --select snowplow.* --output json --output-keys "name resource_type des ``` -$ dbt ls --select snowplow.* --output json --output-keys name resource_type description +$ dbt ls --select snowplow.* --output json --output-keys "name resource_type description" {"name": "snowplow_events", "description": "This is a pretty cool model", ...} {"name": "snowplow_page_views", "description": "This model is even cooler", ...} ... @@ -93,6 +94,16 @@ $ dbt ls --select snowplow.* --output json --output-keys name resource_type desc + + +**Listing Semantic models** + +List all resources upstream of your orders semantic model: +``` +dbt ls -s +semantic_model:orders +``` + + **Listing file paths** ``` diff --git a/website/docs/reference/commands/retry.md b/website/docs/reference/commands/retry.md index 0c010ede2c1..8da5d5a77a6 100644 --- a/website/docs/reference/commands/retry.md +++ b/website/docs/reference/commands/retry.md @@ -20,3 +20,80 @@ Retry works with the following commands: `dbt retry` reuses the [selectors](/reference/node-selection/yaml-selectors) from the previously executed command. + +Example results of executing `dbt retry` after a successful `dbt run`: + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Nothing to do. Try checking your model configs and model specification args +``` + +Example of when `dbt run` encounters a syntax error in a model: + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 5 START sql view model main.stg_customers ................................. [RUN] +2 of 5 START sql view model main.stg_orders .................................... [RUN] +3 of 5 START sql view model main.stg_payments .................................. [RUN] +1 of 5 OK created sql view model main.stg_customers ............................ [OK in 0.06s] +2 of 5 OK created sql view model main.stg_orders ............................... [OK in 0.06s] +3 of 5 OK created sql view model main.stg_payments ............................. [OK in 0.07s] +4 of 5 START sql table model main.customers .................................... [RUN] +5 of 5 START sql table model main.orders ....................................... [RUN] +4 of 5 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s] +5 of 5 OK created sql table model main.orders .................................. [OK in 0.04s] + +Finished running 3 view models, 2 table models in 0 hours 0 minutes and 0.15 seconds (0.15s). 
+ +Completed with 1 error and 0 warnings: + +Runtime Error in model customers (models/customers.sql) + Parser Error: syntax error at or near "selct" + +Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5 +``` + + +Example of a subsequent failed `dbt retry` run without fixing the error(s): + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 1 START sql table model main.customers .................................... [RUN] +1 of 1 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s] + +Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5 +``` + +Example of a successful `dbt retry` run after fixing error(s): + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 1 START sql table model main.customers .................................... [RUN] +1 of 1 OK created sql table model main.customers ............................... [OK in 0.05s] + +Finished running 1 table model in 0 hours 0 minutes and 0.09 seconds (0.09s). + +Completed successfully + +Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 +``` + +In each scenario `dbt retry` picks up from the error rather than running all of the upstream dependencies again. diff --git a/website/docs/reference/commands/rpc.md b/website/docs/reference/commands/rpc.md index a98799356ee..809eadee639 100644 --- a/website/docs/reference/commands/rpc.md +++ b/website/docs/reference/commands/rpc.md @@ -5,22 +5,18 @@ id: "rpc" description: "Remote Procedure Call (rpc) dbt server compiles and runs queries, and provides methods that enable you to list and terminate running processes. " --- - +:::caution The dbt-rpc plugin is deprecated - - **v0.14**: The `dbt rpc` command was introduced to dbt Core - - **v1.0**: We now distribute and package the Remote Procedure Call (rpc) server functionality separately from `dbt-core`. You can find the code in a dedicated [`dbt-rpc` repository](https://github.com/dbt-labs/dbt-rpc). - +dbt Labs actively maintained `dbt-rpc` for compatibility with dbt-core versions up to v1.5. Starting with dbt-core v1.6 (released in July 2023), `dbt-rpc` is no longer supported for ongoing compatibility. -### Overview +In the meantime, dbt Labs will be performing critical maintenance only for `dbt-rpc`, until the last compatible version of dbt-core has reached the [end of official support](/docs/dbt-versions/core#latest-releases). At that point, dbt Labs will archive this repository to be read-only. -You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context. +::: -:::caution Deprecation -**The dbt-rpc plugin will be fully deprecated by the second half of 2023.** +### Overview -dbt Labs is actively maintaining `dbt-rpc` up to dbt v1.4. Starting in v1.5, we intend to break `dbt-rpc` compatibility in favor of [the new dbt Server](https://github.com/dbt-labs/dbt-server). 
dbt Labs will perform critical maintenance only on `dbt-rpc`, until the last compatible version of dbt has reached the end of official support (thus 12 months after release of v1.4; [see Core version policies](/docs/dbt-versions/core)). -::: +You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context. :::caution Running on Windows We do not recommend running the rpc server on Windows because of reliability issues. A Docker container may provide a useful workaround, if required. diff --git a/website/docs/reference/commands/run.md b/website/docs/reference/commands/run.md index fbc1a513cb1..557d0d71338 100644 --- a/website/docs/reference/commands/run.md +++ b/website/docs/reference/commands/run.md @@ -71,32 +71,12 @@ For more information on running parents or children of specific models, see the ## Treat warnings as errors - - -- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0 - - - -See [global configs](/reference/global-configs/failing-fast) +See [global configs](/reference/global-configs/warnings) ## Failing fast - - -- The `--fail-fast` flag is new in dbt v0.17.0 -- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0 - - - See [global configs](/reference/global-configs/failing-fast) ## Enable or Disable Colorized Logs - - -- The `--use-colors` and `--no-use-colors` flags are new in dbt v0.18.0 -- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0 - - - See [global configs](/reference/global-configs/print-output#print-color) diff --git a/website/docs/reference/commands/seed.md b/website/docs/reference/commands/seed.md index 272a2a7f2a9..d0cd199ea12 100644 --- a/website/docs/reference/commands/seed.md +++ b/website/docs/reference/commands/seed.md @@ -4,24 +4,15 @@ sidebar_label: "seed" id: "seed" --- - - -- **v1.0.0:** The default config for this command will now be `seed-paths` instead of `data-paths`. - - - - The `dbt seed` command will load `csv` files located in the `seed-paths` directory of your dbt project into your . ### Selecting seeds to run - Added in v0.16.0 - Specific seeds can be run using the `--select` flag to `dbt seed`. 
Example: ``` -$ dbt seed --select country_codes +$ dbt seed --select "country_codes" Found 2 models, 3 tests, 0 archives, 0 analyses, 53 macros, 0 operations, 2 seed files 14:46:15 | Concurrency: 1 threads (target='dev') diff --git a/website/docs/reference/commands/show.md b/website/docs/reference/commands/show.md index 5bdcfacc1e8..a0e5d68c83f 100644 --- a/website/docs/reference/commands/show.md +++ b/website/docs/reference/commands/show.md @@ -16,7 +16,7 @@ The results of the preview query are not materialized in the data warehouse, or Example: ``` -dbt show --select model_name.sql +dbt show --select "model_name.sql" ``` or ``` @@ -26,7 +26,7 @@ dbt show --inline "select * from {{ ref('model_name') }}" The following is an example of `dbt show` output for a model named `stg_orders`: ```bash -dbt show --select stg_orders +dbt show --select "stg_orders" 21:17:38 Running with dbt=1.5.0-b5 21:17:38 Found 5 models, 20 tests, 0 snapshots, 0 analyses, 425 macros, 0 operations, 3 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups 21:17:38 @@ -46,7 +46,7 @@ dbt show --select stg_orders For example, if you've just built a model that has a failing test, you can quickly preview the test failures right in the terminal, to find values of `id` that are duplicated: ```bash -$ dbt build -s my_model_with_duplicates +$ dbt build -s "my_model_with_duplicates" 13:22:47 Running with dbt=1.5.0 ... 13:22:48 Completed with 1 error and 0 warnings: @@ -58,7 +58,7 @@ $ dbt build -s my_model_with_duplicates 13:22:48 13:22:48 Done. PASS=1 WARN=0 ERROR=1 SKIP=0 TOTAL=2 -$ dbt show -s unique_my_model_with_duplicates_id +$ dbt show -s "unique_my_model_with_duplicates_id" 13:22:53 Running with dbt=1.5.0 13:22:53 Found 4 models, 2 tests, 0 snapshots, 0 analyses, 309 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups 13:22:53 diff --git a/website/docs/reference/commands/source.md b/website/docs/reference/commands/source.md index b29bf7dadc6..697ae2b5fcc 100644 --- a/website/docs/reference/commands/source.md +++ b/website/docs/reference/commands/source.md @@ -20,10 +20,10 @@ By default, `dbt source freshness` will calculate freshness information for all ```bash # Snapshot freshness for all Snowplow tables: -$ dbt source freshness --select source:snowplow +$ dbt source freshness --select "source:snowplow" # Snapshot freshness for a particular source table: -$ dbt source freshness --select source:snowplow.event +$ dbt source freshness --select "source:snowplow.event" ``` ### Configuring source freshness output diff --git a/website/docs/reference/commands/test.md b/website/docs/reference/commands/test.md index a1a63729568..c050d82a0ab 100644 --- a/website/docs/reference/commands/test.md +++ b/website/docs/reference/commands/test.md @@ -10,22 +10,22 @@ The tests to run can be selected using the `--select` flag discussed [here](/ref ```bash # run tests for one_specific_model -dbt test --select one_specific_model +dbt test --select "one_specific_model" # run tests for all models in package -dbt test --select some_package.* +dbt test --select "some_package.*" # run only tests defined singularly -dbt test --select test_type:singular +dbt test --select "test_type:singular" # run only tests defined generically -dbt test --select test_type:generic +dbt test --select "test_type:generic" # run singular tests limited to one_specific_model -dbt test --select one_specific_model,test_type:singular +dbt test --select "one_specific_model,test_type:singular" # run generic tests limited to one_specific_model 
-dbt test --select one_specific_model,test_type:generic +dbt test --select "one_specific_model,test_type:generic" ``` For more information on writing tests, see the [Testing Documentation](/docs/build/tests). diff --git a/website/docs/reference/configs-and-properties.md b/website/docs/reference/configs-and-properties.md index c2ad5b77629..c6458babeaa 100644 --- a/website/docs/reference/configs-and-properties.md +++ b/website/docs/reference/configs-and-properties.md @@ -11,7 +11,7 @@ A rule of thumb: properties declare things _about_ your project resources; confi For example, you can use resource **properties** to: * Describe models, snapshots, seed files, and their columns -- Assert "truths" about a model, in the form of [tests](/docs/build/tests), e.g. "this `id` column is unique" +* Assert "truths" about a model, in the form of [tests](/docs/build/tests), e.g. "this `id` column is unique" * Define pointers to existing tables that contain raw data, in the form of [sources](/docs/build/sources), and assert the expected "freshness" of this raw data * Define official downstream uses of your data models, in the form of [exposures](/docs/build/exposures) @@ -35,11 +35,11 @@ dbt prioritizes configurations in order of specificity, from most specificity to Note - Generic tests work a little differently when it comes to specificity. See [test configs](/reference/test-configs). -Within the project file, configurations are also applied hierarchically. The most-specific config always "wins": In the project file, configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project. To apply a configuration to a model, or directory of models, define the resource path as nested dictionary keys. +Within the project file, configurations are also applied hierarchically. The most specific config always "wins": In the project file, configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project. To apply a configuration to a model, or directory of models, define the resource path as nested dictionary keys. ### Combining configs -Most configurations are "clobbered" when applied hierarchically. Whenever a more-specific value is available, it will completely replace the less-specific value. Note that a few configs have different merge behavior: +Most configurations are "clobbered" when applied hierarchically. Whenever a more specific value is available, it will completely replace the less specific value. Note that a few configs have different merge behavior: - [`tags`](tags) are additive. If a model has some tags configured in `dbt_project.yml`, and more tags applied in its `.sql` file, the final set of tags will include all of them. - [`meta`](/reference/resource-configs/meta) dictionaries are merged (a more specific key-value pair replaces a less specific value with the same key) - [`pre-hook` and `post-hook`](/reference/resource-configs/pre-hook-post-hook) are also additive. @@ -67,12 +67,14 @@ Previous versions of the docs referred to these as `schema.yml` files — we've dbt has the ability to define node configs in `.yml` files, in addition to `config()` blocks and `dbt_project.yml`. But the reverse isn't always true: there are some things in `.yml` files that can _only_ be defined there. 
Certain properties are special, because: + - They have a unique Jinja rendering context - They create new project resources - They don't make sense as hierarchical configuration - They're older properties that haven't yet been redefined as configs These properties are: + - [`description`](/reference/resource-properties/description) - [`tests`](/reference/resource-properties/tests) - [`docs`](/reference/resource-configs/docs) @@ -155,9 +157,9 @@ You can find an exhaustive list of each supported property and config, broken do * Model [properties](/reference/model-properties) and [configs](/reference/model-configs) * Source [properties](/reference/source-properties) and [configs](source-configs) * Seed [properties](/reference/seed-properties) and [configs](/reference/seed-configs) -* [Snapshot Properties](snapshot-properties) +* Snapshot [properties](snapshot-properties) * Analysis [properties](analysis-properties) -* [Macro Properties](/reference/macro-properties) +* Macro [properties](/reference/macro-properties) * Exposure [properties](/reference/exposure-properties) ## FAQs @@ -202,3 +204,4 @@ Runtime Error ``` This error occurred because a semicolon (`;`) was accidentally used instead of a colon (`:`) after the `description` field. To resolve issues like this, find the `.yml` file referenced in the error message and fix any syntax errors present in the file. There are online YAML validators that can be helpful here, but please be mindful of submitting sensitive information to third-party applications! + diff --git a/website/docs/reference/database-permissions/about-database-permissions.md b/website/docs/reference/database-permissions/about-database-permissions.md new file mode 100644 index 00000000000..76fff517646 --- /dev/null +++ b/website/docs/reference/database-permissions/about-database-permissions.md @@ -0,0 +1,36 @@ +--- +title: "Database permissions" +id: about-database-permissions +description: "Database permissions are access rights and privileges granted to users or roles within a database management system." +sidebar_label: "About database permissions" +pagination_next: "reference/database-permissions/databricks-permissions" +pagination_prev: null +--- + +Database permissions are access rights and privileges granted to users or roles within a database or data platform. They help you specify what actions users or roles can perform on various database objects, like tables, views, schemas, or even the entire database. + + +### Why are they useful + +- Database permissions are essential for security and data access control. +- They ensure that only authorized users can perform specific actions. +- They help maintain data integrity, prevent unauthorized changes, and limit exposure to sensitive data. +- Permissions also support compliance with data privacy regulations and auditing. + +### How to use them + +- Users and administrators can grant and manage permissions at various levels (such as table, schema, and so on) using SQL statements or through the database system's interface. +- Assign permissions to individual users or roles (groups of users) based on their responsibilities. + - Typical permissions include "SELECT" (read), "INSERT" (add data), "UPDATE" (modify data), "DELETE" (remove data), and administrative rights like "CREATE" and "DROP." +- Users should be assigned permissions that ensure they have the necessary access to perform their tasks without overextending privileges. 
+ +Something to note is that each data platform provider might have different approaches and names for privileges. Refer to their documentation for more details. + +### Examples + +Refer to the following database permission pages for more info on examples and how to set up database permissions: + +- [Databricks](/reference/database-permissions/databricks-permissions) +- [Postgres](/reference/database-permissions/postgres-permissions) +- [Redshift](/reference/database-permissions/redshift-permissions) +- [Snowflake](/reference/database-permissions/snowflake-permissions) diff --git a/website/docs/reference/database-permissions/databricks-permissions.md b/website/docs/reference/database-permissions/databricks-permissions.md new file mode 100644 index 00000000000..12e24652ae3 --- /dev/null +++ b/website/docs/reference/database-permissions/databricks-permissions.md @@ -0,0 +1,20 @@ +--- +title: "Databricks permissions" +--- + +In Databricks, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Databricks database. + +## Example Databricks permissions + +The following example provides you with the SQL statements you can use to manage permissions. + +**Note** that you can grant permissions on `securable_objects` to `principals` (This can be user, service principal, or group). For example, `grant privilege_type` on `securable_object` to `principal`. + +``` + +grant all privileges on schema schema_name to principal; +grant create table on schema schema_name to principal; +grant create view on schema schema_name to principal; +``` + +Check out the [official documentation](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/privileges.html#privilege-types-by-securable-object-in-unity-catalog) for more information. diff --git a/website/docs/reference/database-permissions/postgres-permissions.md b/website/docs/reference/database-permissions/postgres-permissions.md new file mode 100644 index 00000000000..da56e9b45f2 --- /dev/null +++ b/website/docs/reference/database-permissions/postgres-permissions.md @@ -0,0 +1,25 @@ +--- +title: "Postgres Permissions" +--- + + +In Postgres, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Postgres database. + +## Example Postgres permissions + +The following example provides you with the SQL statements you can use to manage permissions. These examples allow you to run dbt smoothly without encountering permission issues, such as creating schemas, reading existing data, and accessing the information schema. + +**Note** that `database_name`, `database.schema_name`, and `user_name` are placeholders and you can replace them as needed for your organization's naming convention. + +``` +grant usage on database database_name to user_name; +grant create schema on database database_name to user_name; +grant usage on schema database.schema_name to user_name; +grant create table on schema database.schema_name to user_name; +grant create view on schema database.schema_name to user_name; +grant usage on all schemas in database database_name to user_name; +grant select on all tables in database database_name to user_name; +grant select on all views in database database_name to user_name; +``` + +Check out the [official documentation](https://www.postgresql.org/docs/current/sql-grant.html) for more information. 
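As a rough sketch of how grants like these might be applied in one shot — the `$DATABASE_URL` connection string, the `analytics` schema, and the `dbt_user` role below are hypothetical, and the statements use standard Postgres `GRANT` syntax:

```shell
# Apply a minimal set of grants for a dbt service user via psql
psql "$DATABASE_URL" <<'SQL'
grant usage on schema analytics to dbt_user;
grant create on schema analytics to dbt_user;
grant select on all tables in schema analytics to dbt_user;
SQL
```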
diff --git a/website/docs/reference/database-permissions/redshift-permissions.md b/website/docs/reference/database-permissions/redshift-permissions.md new file mode 100644 index 00000000000..5f0949a3528 --- /dev/null +++ b/website/docs/reference/database-permissions/redshift-permissions.md @@ -0,0 +1,25 @@ +--- +title: "Redshift permissions" +--- + +In Redshift, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Redshift database. + +## Example Redshift permissions + +The following example provides you with the SQL statements you can use to manage permissions. + +**Note** that `database_name`, `database.schema_name`, and `user_name` are placeholders and you can replace them as needed for your organization's naming convention. + + +``` +grant usage on database database_name to user_name; +grant create schema on database database_name to user_name; +grant usage on schema database.schema_name to user_name; +grant create table on schema database.schema_name to user_name; +grant create view on schema database.schema_name to user_name; +grant usage on all schemas in database database_name to user_name; +grant select on all tables in database database_name to user_name; +grant select on all views in database database_name to user_name; +``` + +Check out the [official documentation](https://docs.aws.amazon.com/redshift/latest/dg/r_GRANT.html) for more information. diff --git a/website/docs/reference/database-permissions/snowflake-permissions.md b/website/docs/reference/database-permissions/snowflake-permissions.md new file mode 100644 index 00000000000..3f474242834 --- /dev/null +++ b/website/docs/reference/database-permissions/snowflake-permissions.md @@ -0,0 +1,154 @@ +--- +title: "Snowflake permissions" +--- + +In Snowflake, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Snowflake database. + +## Set up Snowflake account + +This section explains how to set up permissions and roles within Snowflake. In Snowflake, you would perform these actions using SQL commands and set up your data warehouse and access control within Snowflake's ecosystem. + +1. Set up databases +``` +use role sysadmin; +create database raw; +create database analytics; +``` +2. Set up warehouses +``` +create warehouse loading + warehouse_size = xsmall + auto_suspend = 3600 + auto_resume = false + initially_suspended = true; + +create warehouse transforming + warehouse_size = xsmall + auto_suspend = 60 + auto_resume = true + initially_suspended = true; + +create warehouse reporting + warehouse_size = xsmall + auto_suspend = 60 + auto_resume = true + initially_suspended = true; +``` + +3. Set up roles and warehouse permissions +``` +use role securityadmin; + +create role loader; +grant all on warehouse loading to role loader; + +create role transformer; +grant all on warehouse transforming to role transformer; + +create role reporter; +grant all on warehouse reporting to role reporter; +``` + +4. Create users, assigning them to their roles + +Every person and application gets a separate user and is assigned to the correct role. + +``` +create user stitch_user -- or fivetran_user + password = '_generate_this_' + default_warehouse = loading + default_role = loader; + +create user claire -- or amy, jeremy, etc. 
+ password = '_generate_this_' + default_warehouse = transforming + default_role = transformer + must_change_password = true; + +create user dbt_cloud_user + password = '_generate_this_' + default_warehouse = transforming + default_role = transformer; + +create user looker_user -- or mode_user etc. + password = '_generate_this_' + default_warehouse = reporting + default_role = reporter; + +-- then grant these roles to each user +grant role loader to user stitch_user; -- or fivetran_user +grant role transformer to user dbt_cloud_user; +grant role transformer to user claire; -- or amy, jeremy +grant role reporter to user looker_user; -- or mode_user, periscope_user +``` + +5. Let loader load data +Give the role unilateral permission to operate on the raw database +``` +use role sysadmin; +grant all on database raw to role loader; +``` + +6. Let transformer transform data +The transformer role needs to be able to read raw data. + +If you do this before you have any data loaded, you can run: +``` +grant usage on database raw to role transformer; +grant usage on future schemas in database raw to role transformer; +grant select on future tables in database raw to role transformer; +grant select on future views in database raw to role transformer; +``` +If you already have data loaded in the raw database, make sure you also run the following to update the permissions: +``` +grant usage on all schemas in database raw to role transformer; +grant select on all tables in database raw to role transformer; +grant select on all views in database raw to role transformer; +``` +transformer also needs to be able to create in the analytics database: +``` +grant all on database analytics to role transformer; +``` +7. Let reporter read the transformed data +A previous version of this article recommended this be implemented through hooks in dbt, but this way lets you get away with a one-off statement. +``` +grant usage on database analytics to role reporter; +grant usage on future schemas in database analytics to role reporter; +grant select on future tables in database analytics to role reporter; +grant select on future views in database analytics to role reporter; +``` +Again, if you already have data in your analytics database, make sure you run: +``` +grant usage on all schemas in database analytics to role reporter; +grant select on all tables in database analytics to role reporter; +grant select on all views in database analytics to role reporter; +``` +8. Maintain +When new users are added, make sure you add them to the right role! Everything else should be inherited automatically thanks to those `future` grants. + +For more discussion and legacy information, refer to [this Discourse article](https://discourse.getdbt.com/t/setting-up-snowflake-the-exact-grant-statements-we-run/439). + +## Example Snowflake permissions + +The following example provides you with the SQL statements you can use to manage permissions. + +**Note** that `warehouse_name`, `database_name`, and `role_name` are placeholders and you can replace them as needed for your organization's naming convention.
+ +``` + +grant all on warehouse warehouse_name to role role_name; +grant usage on database database_name to role role_name; +grant create schema on database database_name to role role_name; +grant usage on schema database.an_existing_schema to role role_name; +grant create table on schema database.an_existing_schema to role role_name; +grant create view on schema database.an_existing_schema to role role_name; +grant usage on future schemas in database database_name to role role_name; +grant monitor on future schemas in database database_name to role role_name; +grant select on future tables in database database_name to role role_name; +grant select on future views in database database_name to role role_name; +grant usage on all schemas in database database_name to role role_name; +grant monitor on all schemas in database database_name to role role_name; +grant select on all tables in database database_name to role role_name; +grant select on all views in database database_name to role role_name; +``` + diff --git a/website/docs/reference/dbt-classes.md b/website/docs/reference/dbt-classes.md index 18569fce3b0..13f9263e545 100644 --- a/website/docs/reference/dbt-classes.md +++ b/website/docs/reference/dbt-classes.md @@ -10,6 +10,7 @@ These classes are often useful when building advanced dbt models and macros. The `Relation` object is used to interpolate schema and names into SQL code with appropriate quoting. This object should _always_ be used instead of interpolating values with `{{ schema }}.{{ table }}` directly. Quoting of the Relation object can be configured using the [`quoting` config](/reference/project-configs/quoting). + ### Creating relations A `Relation` can be created by calling the `create` class method on the `Relation` class. @@ -32,6 +33,7 @@ class Relation: ### Using relations +In addition to `api.Relation.create`, dbt returns a Relation when you use [`ref`](/reference/dbt-jinja-functions/ref), [`source`](/reference/dbt-jinja-functions/source) or [`this`](/reference/dbt-jinja-functions/this). ```jinja2 @@ -84,6 +86,7 @@ col = Column('name', 'varchar', 255) col.is_string() # True col.is_numeric() # False col.is_number() # False +col.is_integer() # False col.is_float() # False col.string_type() # character varying(255) col.numeric_type('numeric', 12, 4) # numeric(12,4) @@ -101,15 +104,10 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4) ### Instance methods - - - The `is_number` and `is_float` instance methods were added dbt v0.16.0 - - - - **is_string()**: Returns True if the column is a String type (eg. text, varchar), else False - **is_numeric()**: Returns True if the column is a fixed-precision Numeric type (eg. `numeric`), else False - **is_number()**: Returns True if the column is a number-y type (eg. `numeric`, `int`, `float`, or similar), else False +- **is_integer()**: Returns True if the column is an integer (eg. `int`, `bigint`, `serial` or similar), else False - **is_float()**: Returns True if the column is a float type (eg. 
`float`, `float64`, or similar), else False - **string_size()**: Returns the width of the column if it is a string type, else, an exception is raised @@ -134,6 +132,9 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4) -- Return true if the column is a number {{ string_column.is_number() }} +-- Return true if the column is an integer +{{ string_column.is_integer() }} + -- Return true if the column is a float {{ string_column.is_float() }} @@ -149,6 +150,9 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4) -- Return true if the column is a number {{ numeric_column.is_number() }} +-- Return true if the column is an integer +{{ numeric_column.is_integer() }} + -- Return true if the column is a float {{ numeric_column.is_float() }} @@ -184,12 +188,6 @@ will be expanded to: ## Result objects - - -* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification. - - - The execution of a resource in dbt generates a `Result` object. This object contains information about the executed node, timing, status, and metadata returned by the adapter. At the end of an invocation, dbt records these objects in [`run_results.json`](/reference/artifacts/run-results-json). - `node`: Full object representation of the dbt resource (model, seed, snapshot, test) executed, including its `unique_id` @@ -197,7 +195,6 @@ The execution of a resource in dbt generates a `Result` object. This object cont - `thread_id`: Which thread executed this node? E.g. `Thread-1` - `execution_time`: Total time spent executing this node, measured in seconds. - `timing`: Array that breaks down execution time into steps (often `compile` + `execute`) -- `adapter_response`: Dictionary of metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc. - `message`: How dbt will report this result on the CLI, based on information returned from the database import RowsAffected from '/snippets/_run-result.md'; diff --git a/website/docs/reference/dbt-commands.md b/website/docs/reference/dbt-commands.md index 5b37f13a3fb..d5f0bfcd2ad 100644 --- a/website/docs/reference/dbt-commands.md +++ b/website/docs/reference/dbt-commands.md @@ -2,29 +2,63 @@ title: "dbt Command reference" --- -dbt is typically run one of two ways: -* In [dbt Cloud](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) -* On the [command line interface](/docs/core/about-the-cli) (CLI) +You can run dbt using the following tools: -The following sections outline the commands supported by dbt and their relevant flags. Note that some commands are only supported when using the CLI. +- In your browser with the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) +- On the command line interface using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) or open-source [dbt Core](/docs/core/about-dbt-core), both of which enable you to execute dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). -For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax). +The following sections outline the commands supported by dbt and their relevant flags. For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax). 
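+
+For example (a sketch using a hypothetical model name), the same selection flags described there work with the commands listed below, whichever tool you use:
+
+```shell
+dbt run --select "my_model+"
+```
+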
### Available commands -Select the tabs that are relevant to the your development workflow. For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**. + + +All commands in the table are compatible with either the dbt Cloud IDE, dbt Cloud CLI, or dbt Core. + +You can run dbt commands in your specific tool by prefixing them with `dbt`. For example, to run the `test` command, type `dbt test`. + +| Command | Description | Compatible tools | Version | +| ------- | ----------- | ---------------- | ------- | +| [build](/reference/commands/build) | Build and test all selected resources (models, seeds, snapshots, tests) | All | All [supported versions](/docs/dbt-versions/core) | +| cancel | Cancels the most recent invocation.| dbt Cloud CLI | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [clean](/reference/commands/clean) | Deletes artifacts present in the dbt project | All | All [supported versions](/docs/dbt-versions/core) | +| [clone](/reference/commands/clone) | Clone selected models from the specified state | All | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [compile](/reference/commands/compile) | Compiles (but does not run) the models in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [debug](/reference/commands/debug) | Debugs dbt connections and projects | dbt Cloud IDE
      dbt Core | All [supported versions](/docs/dbt-versions/core) | +| [deps](/reference/commands/deps) | Downloads dependencies for a project | All | All [supported versions](/docs/dbt-versions/core) | +| [docs](/reference/commands/cmd-docs) | Generates documentation for a project | All | All [supported versions](/docs/dbt-versions/core) | +| help | Displays help information for any command | dbt Core
      dbt Cloud CLI | All [supported versions](/docs/dbt-versions/core) | +| [init](/reference/commands/init) | Initializes a new dbt project | dbt Core | All [supported versions](/docs/dbt-versions/core) | +| [list](/reference/commands/list) | Lists resources defined in a dbt project | All | All [supported versions](/docs/dbt-versions/core) | +| [parse](/reference/commands/parse) | Parses a project and writes detailed timing info | All | All [supported versions](/docs/dbt-versions/core) | +| reattach | Reattaches to the most recent invocation to retrieve logs and artifacts. | dbt Cloud CLI | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [retry](/reference/commands/retry) | Retry the last run `dbt` command from the point of failure | All | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [run](/reference/commands/run) | Runs the models in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [run-operation](/reference/commands/run-operation) | Invoke a macro, including running arbitrary maintenance SQL against the database | All | All [supported versions](/docs/dbt-versions/core) | +| [seed](/reference/commands/seed) | Loads CSV files into the database | All | All [supported versions](/docs/dbt-versions/core) | +| [show](/reference/commands/show) | Preview table rows post-transformation | All | All [supported versions](/docs/dbt-versions/core) | +| [snapshot](/reference/commands/snapshot) | Executes "snapshot" jobs defined in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [source](/reference/commands/source) | Provides tools for working with source data (including validating that sources are "fresh") | All | All [supported versions](/docs/dbt-versions/core) | +| [test](/reference/commands/test) | Executes tests defined in a project | All | All [supported versions](/docs/dbt-versions/core) | + + +
      + + + +Select the tabs that are relevant to your development workflow. For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**. - + Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`. - [build](/reference/commands/build): build and test all selected resources (models, seeds, snapshots, tests) -- [clone](/reference/commands/clone): clone selected nodes from specified state (requires dbt 1.6 or higher) +- [clone](/reference/commands/clone): clone selected nodes from the specified state (requires dbt 1.6 or higher) - [compile](/reference/commands/compile): compiles (but does not run) the models in a project - [deps](/reference/commands/deps): downloads dependencies for a project - [docs](/reference/commands/cmd-docs) : generates documentation for a project -- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or higher) +- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or later) - [run](/reference/commands/run): runs the models in a project - [run-operation](/reference/commands/run-operation): invoke a macro, including running arbitrary maintenance SQL against the database - [seed](/reference/commands/seed): loads CSV files into the database @@ -35,13 +69,13 @@ Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/ - + -Use the following dbt commands in the [CLI](/docs/core/about-the-cli) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`. +Use the following dbt commands in [dbt Core](/docs/core/about-dbt-core) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`. - [build](/reference/commands/build): build and test all selected resources (models, seeds, snapshots, tests) - [clean](/reference/commands/clean): deletes artifacts present in the dbt project -- [clone](/reference/commands/clone): clone selected models from specified state (requires dbt 1.6 or higher) +- [clone](/reference/commands/clone): clone selected models from the specified state (requires dbt 1.6 or higher) - [compile](/reference/commands/compile): compiles (but does not run) the models in a project - [debug](/reference/commands/debug): debugs dbt connections and projects - [deps](/reference/commands/deps): downloads dependencies for a project @@ -62,27 +96,4 @@ Use the following dbt commands in the [CLI](/docs/core/about-the-cli) and use th - - - + diff --git a/website/docs/reference/dbt-jinja-functions/as_bool.md b/website/docs/reference/dbt-jinja-functions/as_bool.md index e0700032212..d4c2bbf1743 100644 --- a/website/docs/reference/dbt-jinja-functions/as_bool.md +++ b/website/docs/reference/dbt-jinja-functions/as_bool.md @@ -24,10 +24,3 @@ models: ```
      - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_bool` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_native.md b/website/docs/reference/dbt-jinja-functions/as_native.md index fca25249dca..1de9ad45bf9 100644 --- a/website/docs/reference/dbt-jinja-functions/as_native.md +++ b/website/docs/reference/dbt-jinja-functions/as_native.md @@ -16,10 +16,3 @@ and [`as_number`](/reference/dbt-jinja-functions/as_number) instead. Unlike `as_bool` and `as_number`, `as_native` will return a rendered value regardless of the input type. Ensure that your inputs match expectations. ::: - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_native` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_number.md b/website/docs/reference/dbt-jinja-functions/as_number.md index 057d7ec8d20..29b35094880 100644 --- a/website/docs/reference/dbt-jinja-functions/as_number.md +++ b/website/docs/reference/dbt-jinja-functions/as_number.md @@ -25,10 +25,3 @@ my_profile: ``` - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_number` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_text.md b/website/docs/reference/dbt-jinja-functions/as_text.md index 5e19e5bc9bc..6b26cfa327d 100644 --- a/website/docs/reference/dbt-jinja-functions/as_text.md +++ b/website/docs/reference/dbt-jinja-functions/as_text.md @@ -56,12 +56,3 @@ models: ``` - - - -* `v0.17.0`: Native rendering is enabled by default. The `as_text` filter was -introduced. -* `v0.17.1`: Native rendering is disabled by default. The `as_text` filter works -as before, with no functional effect. - - diff --git a/website/docs/reference/dbt-jinja-functions/builtins.md b/website/docs/reference/dbt-jinja-functions/builtins.md index 40848705dc4..edc5f34ffda 100644 --- a/website/docs/reference/dbt-jinja-functions/builtins.md +++ b/website/docs/reference/dbt-jinja-functions/builtins.md @@ -1,10 +1,11 @@ --- -title: "About builtins Jinja function" +title: "About builtins Jinja variable" sidebar_label: "builtins" id: "builtins" -description: "Read this guide to understand the builtins Jinja function in dbt." +description: "Read this guide to understand the builtins Jinja variable in dbt." --- + The `builtins` variable exists to provide references to builtin dbt context methods. This allows macros to be created with names that _mask_ dbt builtin context methods, while still making those methods accessible in the dbt compilation context. The `builtins` variable is a dictionary containing the following keys: @@ -15,9 +16,51 @@ The `builtins` variable is a dictionary containing the following keys: ## Usage -The following macro overrides the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overriden to `dev`. +:::important + +Using the `builtins` variable in this way is an advanced development workflow. Users should be ready to maintain and update these overrides when upgrading in the future. +::: + + + +From dbt v1.5 and higher, use the following macro to extract user-provided arguments, including version, and call the builtins.ref() function with either a single modelname argument or both packagename and modelname arguments, based on the number of positional arguments in varargs: + +

+```
+{% macro ref() %}
+-- extract user-provided positional and keyword arguments
+  {% set version = kwargs.get('version') %}
+  {% set packagename = none %}
+  {%- if (varargs | length) == 1 -%}
+    {% set modelname = varargs[0] %}
+  {%- else -%}
+    {% set packagename = varargs[0] %}
+    {% set modelname = varargs[1] %}
+  {% endif %}
+
+-- call builtins.ref based on provided positional arguments
+{% set rel = None %}
+{% if packagename is not none %}
+  {% set rel = builtins.ref(packagename, modelname, version=version) %}
+{% else %}
+  {% set rel = builtins.ref(modelname, version=version) %}
+{% endif %}
+
+-- finally, override the database name with "dev"
+{% set newrel = rel.replace_path(database="dev") %}
+{% do return(newrel) %}
+
+{% endmacro %}
+```
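+
+For example (a sketch that assumes your project has a model named `my_model`), any model that calls `ref` as usual will now compile against the `dev` database:
+
+```sql
+-- models/my_downstream_model.sql
+-- with the ref() override above in scope, this resolves to the dev database
+select * from {{ ref('my_model') }}
+```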
+
+
+
+From dbt v1.4 and lower, use the following macro to override the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overridden to `dev`:
+
+```
+
{% macro ref(model_name) %}
{% set rel = builtins.ref(model_name) %}
@@ -26,6 +69,7 @@ The following macro overrides the `ref` method available in the model compilatio
{% endmacro %}
```
+
The ref macro can also be used to control which elements of the model path are rendered when run. For example, the following macro overrides the `ref` method to render only the schema and object identifier, but not the database reference, i.e. `my_schema.my_model` rather than `my_database.my_schema.my_model`. This is especially useful when using Snowflake as a warehouse, if you intend to change the name of the database post-build and wish the references to remain accurate.
diff --git a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md
index e0701e5d091..0d377d29cef 100644
--- a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md
+++ b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md
@@ -1,22 +1,23 @@
---
-title: " About dbt_project.yml context variables"
+title: " About dbt_project.yml context"
sidebar_label: "dbt_project.yml context"
id: "dbt-project-yml-context"
-description: "The context variables and methods are available when configuring resources in the dbt_project.yml file."
+description: "The context methods and variables available when configuring resources in the dbt_project.yml file."
---
-The following context variables and methods are available when configuring
+The following context methods and variables are available when configuring
resources in the `dbt_project.yml` file. This applies to the `models:`, `seeds:`, and `snapshots:` keys in the `dbt_project.yml` file.
+**Available context methods:**
+- [env_var](/reference/dbt-jinja-functions/env_var)
+- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_)
+
**Available context variables:**
- [target](/reference/dbt-jinja-functions/target)
-- [env_var](/reference/dbt-jinja-functions/env_var)
-- [vars](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_)
- [builtins](/reference/dbt-jinja-functions/builtins)
- [dbt_version](/reference/dbt-jinja-functions/dbt_version)
-
### Example configuration
diff --git a/website/docs/reference/dbt-jinja-functions/dispatch.md b/website/docs/reference/dbt-jinja-functions/dispatch.md
index a165ae59eb0..5dff787219f 100644
--- a/website/docs/reference/dbt-jinja-functions/dispatch.md
+++ b/website/docs/reference/dbt-jinja-functions/dispatch.md
@@ -5,12 +5,6 @@ id: "dispatch"
description: "dbt extends functionality across data platforms using multiple dispatch."
---
-
-
-- **v1.0.0:** The 'packages' argument is fully deprecated. Use `macro_namespace` and project-level `dispatch` config instead.
-
-
-
dbt can extend functionality across [Supported Data Platforms](/docs/supported-data-platforms) through a system of [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch). Because SQL syntax, data types, and DDL / DML support vary across adapters, dbt can define and call generic functional macros, and then "dispatch" that macro to the appropriate implementation for the current adapter.
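+
+As a minimal sketch (the `concat_ws` macro and its adapter implementations here are hypothetical), a dispatched macro pairs a generic entry point with a `default__` implementation and optional adapter-prefixed overrides:
+
+```sql
+{% macro concat_ws(fields) %}
+    {{ return(adapter.dispatch('concat_ws')(fields)) }}
+{% endmacro %}
+
+{% macro default__concat_ws(fields) %}
+    concat({{ fields | join(', ') }})
+{% endmacro %}
+
+{% macro redshift__concat_ws(fields) %}
+    {{ fields | join(' || ') }}
+{% endmacro %}
+```
+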
## Syntax diff --git a/website/docs/reference/dbt-jinja-functions/env_var.md b/website/docs/reference/dbt-jinja-functions/env_var.md index a5e9df82415..f4cc05cec0f 100644 --- a/website/docs/reference/dbt-jinja-functions/env_var.md +++ b/website/docs/reference/dbt-jinja-functions/env_var.md @@ -58,12 +58,6 @@ models: ### Secrets - - - **v1.0.0:** Restricted use of secret env vars to `profiles.yml` and `packages.yml` - - - For certain configurations, you can use "secret" env vars. Any env var named with the prefix `DBT_ENV_SECRET_` will be: - Available for use in `profiles.yml` + `packages.yml`, via the same `env_var()` function - Disallowed everywhere else, including `dbt_project.yml` and model SQL, to prevent accidentally writing these secret values to the or metadata artifacts @@ -82,12 +76,6 @@ host: "www.{{ env_var('DBT_ENV_SECRET_HOST_DOMAIN') }}.com/{{ env_var('DBT_ENV_S ### Custom metadata - - - - **v0.19.0:** Introduced `DBT_ENV_CUSTOM_ENV_` prefix and artifact `metadata.env` - - - Any env var named with the prefix `DBT_ENV_CUSTOM_ENV_` will be included in two places, with its prefix-stripped name as the key: - [dbt artifacts](/reference/artifacts/dbt-artifacts#common-metadata): `metadata` -> `env` - [events and structured logs](/reference/events-logging#info-fields): `info` -> `extra` diff --git a/website/docs/reference/dbt-jinja-functions/graph.md b/website/docs/reference/dbt-jinja-functions/graph.md index 3b3b4d1cb88..491b7836f45 100644 --- a/website/docs/reference/dbt-jinja-functions/graph.md +++ b/website/docs/reference/dbt-jinja-functions/graph.md @@ -99,7 +99,7 @@ representations of those nodes. A simplified example might look like: }, "exposures": { "exposure.my_project.traffic_dashboard": { - "unique_id": "source.my_project.traffic_dashboard", + "unique_id": "exposure.my_project.traffic_dashboard", "type": "dashboard", "maturity": "high", "path": "models/path/to/schema.yml", diff --git a/website/docs/reference/dbt-jinja-functions/log.md b/website/docs/reference/dbt-jinja-functions/log.md index ec4533ea621..30e68f8c21d 100644 --- a/website/docs/reference/dbt-jinja-functions/log.md +++ b/website/docs/reference/dbt-jinja-functions/log.md @@ -12,7 +12,34 @@ __Args__: Logs a line to either the log file or stdout. -([Source on GitHub](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/context/base.py#L432)) +
      + Code source + Refer to GitHub or the following code as a source:

      + +```python + def log(msg: str, info: bool = False) -> str: + """Logs a line to either the log file or stdout. + + :param msg: The message to log + :param info: If `False`, write to the log file. If `True`, write to + both the log file and stdout. + + > macros/my_log_macro.sql + + {% macro some_macro(arg1, arg2) %} + {{ log("Running some_macro: " ~ arg1 ~ ", " ~ arg2) }} + {% endmacro %}" + """ + if info: + fire_event(JinjaLogInfo(msg=msg, node_info=get_node_info())) + else: + fire_event(JinjaLogDebug(msg=msg, node_info=get_node_info())) + return "" +``` + + + +
      ```sql diff --git a/website/docs/reference/dbt-jinja-functions/model.md b/website/docs/reference/dbt-jinja-functions/model.md index e967debd01f..9ccf0759470 100644 --- a/website/docs/reference/dbt-jinja-functions/model.md +++ b/website/docs/reference/dbt-jinja-functions/model.md @@ -52,15 +52,9 @@ To view the structure of `models` and their definitions: Use the following table to understand how the versioning pattern works and match the Manifest version with the dbt version: -| dbt version | Manifest version | -| ----------- | ---------------- | -| `v1.5` | [Manifest v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html) -| `v1.4` | [Manifest v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html) -| `v1.3` | [Manifest v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html) -| `v1.2` | [Manifest v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html) -| `v1.1` | [Manifest v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html) - +import ManifestVersions from '/snippets/_manifest-versions.md'; + ## Related docs diff --git a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md index ff0f7c1ef33..32cd8ca10ff 100644 --- a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md +++ b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md @@ -100,12 +100,6 @@ on-run-end: ## Results - - -* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification. - - - The `results` variable contains a list of [Result objects](/reference/dbt-classes#result-objects) with one element per resource that executed in the dbt job. The Result object provides access within the Jinja on-run-end context to the information that will populate the [run results JSON artifact](/reference/artifacts/run-results-json). Example usage: diff --git a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md index 037a129476e..2a6390c3d12 100644 --- a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md +++ b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md @@ -1,16 +1,16 @@ --- -title: "About profiles.yml context variable" +title: "About profiles.yml context" sidebar_label: "profiles.yml context" id: "profiles-yml-context" -description: "Use these context variables to configure resources in `profiles.yml` file." +description: "Use these context methods to configure resources in `profiles.yml` file." --- -The following context variables and methods are available when configuring +The following context methods are available when configuring resources in the `profiles.yml` file. 
-**Available context variables:** +**Available context methods:** - [env_var](/reference/dbt-jinja-functions/env_var) -- [vars](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) +- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) ### Example usage diff --git a/website/docs/reference/dbt-jinja-functions/project_name.md b/website/docs/reference/dbt-jinja-functions/project_name.md index 38717aa16c3..7f76c5a4800 100644 --- a/website/docs/reference/dbt-jinja-functions/project_name.md +++ b/website/docs/reference/dbt-jinja-functions/project_name.md @@ -5,8 +5,6 @@ id: "project_name" description: "Read this guide to understand the project_name Jinja function in dbt." --- -New in 0.16.0 - The `project_name` context variable returns the `name` for the root-level project which is being run by dbt. This variable can be used to defer execution to a root-level project macro if one exists. diff --git a/website/docs/reference/dbt-jinja-functions/ref.md b/website/docs/reference/dbt-jinja-functions/ref.md index c500bb934ab..fda5992e234 100644 --- a/website/docs/reference/dbt-jinja-functions/ref.md +++ b/website/docs/reference/dbt-jinja-functions/ref.md @@ -29,11 +29,8 @@ from {{ref('model_a')}} `ref()` is, under the hood, actually doing two important things. First, it is interpolating the schema into your model file to allow you to change your deployment schema via configuration. Second, it is using these references between models to automatically build the dependency graph. This will enable dbt to deploy models in the correct order when using `dbt run`. -:::info New in 0.9.0 - -The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes at the [{{ this }}](/reference/dbt-jinja-functions/this) variable. - -::: +The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes as the [{{ this }} variable](/reference/dbt-jinja-functions/this). + - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`. ## Advanced ref usage @@ -73,7 +70,7 @@ select * from {{ ref('model_name') }} ### Two-argument variant -There is also a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity. +You can also use a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity. When using two arguments with projects (not packages), you also need to set [cross project dependencies](/docs/collaborate/govern/project-dependencies). ```sql select * from {{ ref('project_or_package', 'model_name') }} diff --git a/website/docs/reference/dbt-jinja-functions/run_query.md b/website/docs/reference/dbt-jinja-functions/run_query.md index cdd65a7b4dc..87970e024ed 100644 --- a/website/docs/reference/dbt-jinja-functions/run_query.md +++ b/website/docs/reference/dbt-jinja-functions/run_query.md @@ -15,7 +15,7 @@ Returns a [Table](https://agate.readthedocs.io/page/api/table.html) object with **Note:** The `run_query` macro will not begin a transaction automatically - if you wish to run your query inside of a transaction, please use `begin` and `commit ` statements as appropriate. :::info Using run_query for the first time? 
-Check out the section of the Getting Started guide on [using Jinja](/guides/advanced/using-jinja#dynamically-retrieve-the-list-of-payment-methods) for an example of working with the results of the `run_query` macro! +Check out the section of the Getting Started guide on [using Jinja](/guides/using-jinja#dynamically-retrieve-the-list-of-payment-methods) for an example of working with the results of the `run_query` macro! ::: **Example Usage:** diff --git a/website/docs/reference/dbt-jinja-functions/selected_resources.md b/website/docs/reference/dbt-jinja-functions/selected_resources.md index 80c4250b8d5..a927ec317ae 100644 --- a/website/docs/reference/dbt-jinja-functions/selected_resources.md +++ b/website/docs/reference/dbt-jinja-functions/selected_resources.md @@ -30,6 +30,8 @@ For a given run it will look like: ["model.my_project.model1", "model.my_project.model2", "snapshot.my_project.my_snapshot"] ``` +Each value corresponds to a key in the `nodes` object within the [graph](/reference/dbt-jinja-functions/graph) context variable. + It can be used in macros in a `pre-hook`, `post-hook`, `on-run-start` or `on-run-end` to evaluate what nodes are selected and trigger different logic whether a particular node is selected or not. diff --git a/website/docs/reference/dbt-jinja-functions/source.md b/website/docs/reference/dbt-jinja-functions/source.md index 2d73e79f09c..59317a79e3d 100644 --- a/website/docs/reference/dbt-jinja-functions/source.md +++ b/website/docs/reference/dbt-jinja-functions/source.md @@ -16,6 +16,7 @@ This function: - Creates dependencies between a source and the current model, which is useful for documentation and model selection - Compiles to the full object name in the database + ## Related guides - [Using sources](/docs/build/sources) diff --git a/website/docs/reference/dbt-jinja-functions/statement-blocks.md b/website/docs/reference/dbt-jinja-functions/statement-blocks.md index 1ad4f099aa3..2829ad3fe14 100644 --- a/website/docs/reference/dbt-jinja-functions/statement-blocks.md +++ b/website/docs/reference/dbt-jinja-functions/statement-blocks.md @@ -41,12 +41,6 @@ Once the statement block has executed, the result set is accessible via the `loa - `data`: Pythonic representation of data returned by query (arrays, tuples, dictionaries). - `table`: [Agate](https://agate.readthedocs.io/page/api/table.html) table representation of data returned by query. - - -* `v0.19.0`: The `response` structured object replaced a `status` string that contained similar information. - - - For the above statement, that could look like: diff --git a/website/docs/reference/dbt-jinja-functions/target.md b/website/docs/reference/dbt-jinja-functions/target.md index 7d6627c5a4b..e7d08db592f 100644 --- a/website/docs/reference/dbt-jinja-functions/target.md +++ b/website/docs/reference/dbt-jinja-functions/target.md @@ -7,7 +7,7 @@ description: "Contains information about your connection to the warehouse." `target` contains information about your connection to the warehouse. -* **dbt CLI:** These values are based on the target defined in your [`profiles.yml` file](/docs/core/connect-data-platform/profiles.yml) +* **dbt Core:** These values are based on the target defined in your [`profiles.yml` file](/docs/core/connect-data-platform/profiles.yml) * **dbt Cloud Scheduler:** * `target.name` is defined per job as described [here](/docs/build/custom-target-names). * For all other attributes, the values are defined by the deployment connection. 
To check these values, click **Deploy** from the upper left and select **Environments**. Then, select the relevant deployment environment, and click **Settings**. diff --git a/website/docs/reference/dbt-jinja-functions/this.md b/website/docs/reference/dbt-jinja-functions/this.md index 9065c660cb0..f9f2961b08f 100644 --- a/website/docs/reference/dbt-jinja-functions/this.md +++ b/website/docs/reference/dbt-jinja-functions/this.md @@ -3,13 +3,18 @@ title: "about this" sidebar_label: "this" id: "this" description: "Represents the current model in the database." +keywords: + - relation, relation object, this function, this jinja, this.database, this.schema, this.identifier +meta: + label: 'this' --- `this` is the database representation of the current model. It is useful when: - Defining a `where` statement within [incremental models](/docs/build/incremental-models) - Using [pre or post hooks](/reference/resource-configs/pre-hook-post-hook) -`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected. +`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected. + - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`. `this` can be thought of as equivalent to `ref('')`, and is a neat way to avoid circular dependencies. @@ -17,24 +22,6 @@ description: "Represents the current model in the database." - - -### Grant permissions on a model in a post-hook - - - -```yaml -models: - project-name: - +post-hook: - - "grant select on {{ this }} to db_reader" -``` - - - - - - ### Configuring incremental models @@ -54,3 +41,7 @@ from raw_app_data.events ``` + + + + \ No newline at end of file diff --git a/website/docs/reference/dbt_project.yml.md b/website/docs/reference/dbt_project.yml.md index 59541a81256..34af0f696c7 100644 --- a/website/docs/reference/dbt_project.yml.md +++ b/website/docs/reference/dbt_project.yml.md @@ -1,10 +1,5 @@ - -- **v1.0.0:** The default config name for `data-paths` is now [`seed-paths`](/reference/project-configs/seed-paths), `source-paths` is now [`model-paths`](/reference/project-configs/model-paths) and `modules-path` is now [`packages-install-path`](/reference/project-configs/packages-install-path). - - - -Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate on your project. +Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate your project. @@ -16,6 +11,8 @@ By default, dbt will look for `dbt_project.yml` in your current working director By default, dbt will look for `dbt_project.yml` in your current working directory and its parents, but you can set a different directory using the `--project-dir` flag or the `DBT_PROJECT_DIR` environment variable. +Starting from dbt v1.5 and higher, you can specify your dbt Cloud project ID in the `dbt_project.yml` file using `project-id` under the `dbt-cloud` config. To find your project ID, check your dbt Cloud project URL, such as `https://cloud.getdbt.com/11/projects/123456`, where the project ID is `123456`. + The following is a list of all available configurations in the `dbt_project.yml` file. 
@@ -24,6 +21,9 @@ The following is a list of all available configurations in the `dbt_project.yml` dbt uses YAML in a few different places. If you're new to YAML, it would be worth taking the time to learn how arrays, dictionaries and strings are represented. ::: + + + ```yml @@ -53,17 +53,27 @@ dbt uses YAML in a few different places. If you're new to YAML, it would be wort [require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range] +[dbt-cloud](/docs/cloud/cloud-cli-installation): + [project-id](/docs/cloud/configure-cloud-cli#configure-the-dbt-cloud-cli): project_id # Required + [defer-env-id](/docs/cloud/about-cloud-develop-defer#defer-in-dbt-cloud-cli): environment_id # Optional + [quoting](/reference/project-configs/quoting): database: true | false schema: true | false identifier: true | false +metrics: + + models: [](/reference/model-configs) seeds: [](/reference/seed-configs) +semantic-models: + + snapshots: [](/reference/snapshot-configs) @@ -84,6 +94,150 @@ vars: search_order: [packagename] [restrict-access](/docs/collaborate/govern/model-access): true | false + ``` + + + + + + +```yml +[name](/reference/project-configs/name): string + +[config-version](/reference/project-configs/config-version): 2 +[version](/reference/project-configs/version): version + +[profile](/reference/project-configs/profile): profilename + +[model-paths](/reference/project-configs/model-paths): [directorypath] +[seed-paths](/reference/project-configs/seed-paths): [directorypath] +[test-paths](/reference/project-configs/test-paths): [directorypath] +[analysis-paths](/reference/project-configs/analysis-paths): [directorypath] +[macro-paths](/reference/project-configs/macro-paths): [directorypath] +[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath] +[docs-paths](/reference/project-configs/docs-paths): [directorypath] +[asset-paths](/reference/project-configs/asset-paths): [directorypath] + +[target-path](/reference/project-configs/target-path): directorypath +[log-path](/reference/project-configs/log-path): directorypath +[packages-install-path](/reference/project-configs/packages-install-path): directorypath + +[clean-targets](/reference/project-configs/clean-targets): [directorypath] + +[query-comment](/reference/project-configs/query-comment): string + +[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range] + +[dbt-cloud](/docs/cloud/cloud-cli-installation): + [project-id](/docs/cloud/configure-cloud-cli#configure-the-dbt-cloud-cli): project_id # Required + [defer-env-id](/docs/cloud/about-cloud-develop-defer#defer-in-dbt-cloud-cli): environment_id # Optional + +[quoting](/reference/project-configs/quoting): + database: true | false + schema: true | false + identifier: true | false + +models: + [](/reference/model-configs) + +seeds: + [](/reference/seed-configs) + +snapshots: + [](/reference/snapshot-configs) + +sources: + [](source-configs) + +tests: + [](/reference/test-configs) + +vars: + [](/docs/build/project-variables) + +[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] +[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] + +[dispatch](/reference/project-configs/dispatch-config): + - macro_namespace: packagename + search_order: [packagename] + +[restrict-access](/docs/collaborate/govern/model-access): true | false + +``` + + + + + + + + + +```yml +[name](/reference/project-configs/name): 
string + +[config-version](/reference/project-configs/config-version): 2 +[version](/reference/project-configs/version): version + +[profile](/reference/project-configs/profile): profilename + +[model-paths](/reference/project-configs/model-paths): [directorypath] +[seed-paths](/reference/project-configs/seed-paths): [directorypath] +[test-paths](/reference/project-configs/test-paths): [directorypath] +[analysis-paths](/reference/project-configs/analysis-paths): [directorypath] +[macro-paths](/reference/project-configs/macro-paths): [directorypath] +[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath] +[docs-paths](/reference/project-configs/docs-paths): [directorypath] +[asset-paths](/reference/project-configs/asset-paths): [directorypath] + +[target-path](/reference/project-configs/target-path): directorypath +[log-path](/reference/project-configs/log-path): directorypath +[packages-install-path](/reference/project-configs/packages-install-path): directorypath + +[clean-targets](/reference/project-configs/clean-targets): [directorypath] + +[query-comment](/reference/project-configs/query-comment): string + +[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range] + +[quoting](/reference/project-configs/quoting): + database: true | false + schema: true | false + identifier: true | false + +models: + [](/reference/model-configs) + +seeds: + [](/reference/seed-configs) + +snapshots: + [](/reference/snapshot-configs) + +sources: + [](source-configs) + +tests: + [](/reference/test-configs) + +vars: + [](/docs/build/project-variables) + +[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] +[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] + +[dispatch](/reference/project-configs/dispatch-config): + - macro_namespace: packagename + search_order: [packagename] + +[restrict-access](/docs/collaborate/govern/model-access): true | false + +``` + + + + diff --git a/website/docs/reference/events-logging.md b/website/docs/reference/events-logging.md index dec1dafcb8e..ffdeb7bb752 100644 --- a/website/docs/reference/events-logging.md +++ b/website/docs/reference/events-logging.md @@ -4,7 +4,7 @@ title: "Events and logs" As dbt runs, it generates events. The most common way to see those events is as log messages, written in real time to two places: - The command line terminal (`stdout`), to provide interactive feedback while running dbt. -- The debug log file (`logs/dbt.log`), to enable detailed [debugging of errors](/guides/best-practices/debugging-errors) when they occur. The text-formatted log messages in this file include all `DEBUG`-level events, as well as contextual information, such as log level and thread name. The location of this file can be configured via [the `log_path` config](/reference/project-configs/log-path). +- The debug log file (`logs/dbt.log`), to enable detailed [debugging of errors](/guides/debug-errors) when they occur. The text-formatted log messages in this file include all `DEBUG`-level events, as well as contextual information, such as log level and thread name. The location of this file can be configured via [the `log_path` config](/reference/project-configs/log-path). 
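+
+For reference, a minimal `dbt_project.yml` usually sets only a handful of these keys. The following is a sketch that uses placeholder project, profile, and folder names:
+
+```yml
+name: my_project
+version: '1.0.0'
+config-version: 2
+
+profile: my_profile
+
+model-paths: ["models"]
+seed-paths: ["seeds"]
+
+clean-targets: ["target", "dbt_packages"]
+
+models:
+  my_project:
+    +materialized: view
+    staging:
+      +materialized: table
+```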
diff --git a/website/docs/reference/exposure-properties.md b/website/docs/reference/exposure-properties.md index aaed2a20a09..0bd4cf771af 100644 --- a/website/docs/reference/exposure-properties.md +++ b/website/docs/reference/exposure-properties.md @@ -8,7 +8,11 @@ description: "Read this guide to understand exposure properties in dbt." - [Declaring resource properties](/reference/configs-and-properties) ## Overview -Exposures are defined in `.yml` files nested under an `exposures:` key. You may define `exposures` in YAML files that also define define `sources` or `models`. + +import PropsCallout from '/snippets/_config-prop-callout.md'; + +Exposures are defined in `properties.yml` files nested under an `exposures:` key. You may define `exposures` in YAML files that also define `sources` or `models`.
      + You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `models/` directory. diff --git a/website/docs/reference/global-configs/about-global-configs.md b/website/docs/reference/global-configs/about-global-configs.md index 42819cdac8f..9d1691812b5 100644 --- a/website/docs/reference/global-configs/about-global-configs.md +++ b/website/docs/reference/global-configs/about-global-configs.md @@ -8,4 +8,11 @@ Global configs enable you to fine-tune _how_ dbt runs projects on your machine Global configs control things like the visual output of logs, the manner in which dbt parses your project, and what to do when dbt finds a version mismatch or a failing model. These configs are "global" because they are available for all dbt commands, and because they can be set for all projects running on the same machine or in the same environment. -Starting in v1.0, you can set global configs in three places. When all three are set, command line flags take precedence, then environment variables, and last yaml configs (usually `profiles.yml`). \ No newline at end of file +### Global config precedence + +Starting in v1.0, you can set global configs in three places. dbt will evaluate the configs in the following order: +1. [user config](https://docs.getdbt.com/reference/global-configs/yaml-configurations) +1. [environment variable](https://docs.getdbt.com/reference/global-configs/environment-variable-configs) +1. [CLI flag](https://docs.getdbt.com/reference/global-configs/command-line-flags) + +Each config is prioritized over the previous one. For example, if all three are provided, then the CLI flag takes precedence. diff --git a/website/docs/reference/global-configs/cache.md b/website/docs/reference/global-configs/cache.md index db4eabd14b7..a605e1e70f3 100644 --- a/website/docs/reference/global-configs/cache.md +++ b/website/docs/reference/global-configs/cache.md @@ -17,7 +17,7 @@ There are two ways to optionally modify this behavior: For example, to quickly compile a model that requires no database metadata or introspective queries: ```text -dbt --skip-populate-cache compile --select my_model_name +dbt --no-populate-cache compile --select my_model_name ``` @@ -31,7 +31,7 @@ dbt --cache-selected-only run --select salesforce
      - + ### Cache database objects for selected resource @@ -63,4 +63,4 @@ config: - \ No newline at end of file + diff --git a/website/docs/reference/global-configs/command-line-flags.md b/website/docs/reference/global-configs/command-line-flags.md index 6496c92da6d..fbe89ce28f1 100644 --- a/website/docs/reference/global-configs/command-line-flags.md +++ b/website/docs/reference/global-configs/command-line-flags.md @@ -4,60 +4,95 @@ id: "command-line-flags" sidebar: "Command line flags" --- -Command line (CLI) flags immediately follow `dbt` and precede your subcommand. When set, CLI flags override environment variables and profile configs. +For consistency, command-line interface (CLI) flags should come right after the `dbt` prefix and its subcommands. This includes "global" flags (supported for all commands). When set, CLI flags override environment variables and profile configs. -Use this non-boolean config structure, replacing `` with the config you are enabling or disabling, `` with the new setting for the config, and `` with the command this config applies to: +For example, instead of using: + +```bash +dbt --no-populate-cache run +``` + +You should use: + +```bash +dbt run --no-populate-cache +``` + +Historically, passing flags (such as "global flags") _before_ the subcommand is a legacy functionality that dbt Labs can remove at any time. We do not support using the same flag before and after the subcommand. + +## Using boolean and non-boolean flags + +You can construct your commands with boolean flags to enable or disable or with non-boolean flags that use specific values, such as strings. + + + + + +Use this non-boolean config structure: +- Replacing `` with the command this config applies to. +- `` with the config you are enabling or disabling, and +- `` with the new setting for the config. ```text -$ --= + --= ``` -Non-boolean config examples: +### Example ```text -dbt --printer-width=80 run -dbt --indirect-selection=eager test +dbt run --printer-width=80 +dbt test --indirect-selection=eager ``` -To turn on boolean configs, you would use the `--` CLI flag, and a `--no-` CLI flag to turn off boolean configs, replacing `` with the config you are enabling or disabling and `` with the command this config applies to. + + + + +To enable or disable boolean configs: +- Use `` this config applies to. +- Followed by `--` to turn it on, or `--no-` to turn it off. +- Replace `` with the config you are enabling or disabling -Boolean config structure: ```text -dbt -- -dbt --no- +dbt -- +dbt --no- ``` -Boolean config example: +### Example ```text -dbt --version-check run -dbt --no-version-check run +dbt run --version-check +dbt run --no-version-check ``` - \ No newline at end of file + + + + + diff --git a/website/docs/reference/global-configs/logs.md b/website/docs/reference/global-configs/logs.md index f5f1b3f814b..8c819193fc6 100644 --- a/website/docs/reference/global-configs/logs.md +++ b/website/docs/reference/global-configs/logs.md @@ -14,6 +14,9 @@ The `LOG_FORMAT` config specifies how dbt's logs should be formatted. If the val dbt --log-format json run {"code": "A001", "data": {"v": "=1.0.0"}, "invocation_id": "1193e449-4b7a-4eb1-8e8e-047a8b3b7973", "level": "info", "log_version": 1, "msg": "Running with dbt=1.0.0", "node_info": {}, "pid": 35098, "thread_name": "MainThread", "ts": "2021-12-03T10:46:59.928217Z", "type": "log_line"} ``` + + + To set the `LOG_FORMAT_FILE` type output for the file without impacting the console log format, use the `log-format-file` flag. 
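+
+For example (a sketch; pick whichever formats suit your workflow), you can keep the default console output and write JSON to the log file only:
+
+```text
+dbt --log-format-file json run
+```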
@@ -37,8 +40,6 @@ See [structured logging](/reference/events-logging#structured-logging) for more ::: - - ### Log Level @@ -124,7 +125,16 @@ dbt --quiet run ### Color -You can set the color preferences for the file logs only using the `--use-colors-file / --no-use-colors-file` flags. +You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags. + + + +```yaml +config: + use_colors_file: False +``` + + ```text dbt --use-colors-file run diff --git a/website/docs/reference/global-configs/print-output.md b/website/docs/reference/global-configs/print-output.md index 83280677229..112b92b546f 100644 --- a/website/docs/reference/global-configs/print-output.md +++ b/website/docs/reference/global-configs/print-output.md @@ -74,13 +74,24 @@ config: use_colors: False ``` + + ```text dbt --use-colors run dbt --no-use-colors run ``` -You can set the color preferences for the file logs only using the `--use-colors-file / --no-use-colors-file` flags. +You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags. + + + +```yaml +config: + use_colors_file: False +``` + + ```text dbt --use-colors-file run @@ -88,5 +99,3 @@ dbt --no-use-colors-file run ``` - - \ No newline at end of file diff --git a/website/docs/reference/global-configs/usage-stats.md b/website/docs/reference/global-configs/usage-stats.md index ea02fe0bb59..1f9492f4a43 100644 --- a/website/docs/reference/global-configs/usage-stats.md +++ b/website/docs/reference/global-configs/usage-stats.md @@ -8,14 +8,14 @@ We want to build the best version of dbt possible, and a crucial part of that is Usage statistics are fired when dbt is invoked and when models are run. These events contain basic platform information (OS + python version) and metadata such as whether the invocation succeeded, how long it took, an anonymized hash key representing the raw model content, and number of nodes that were run. You can see all the event definitions in [`tracking.py`](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/tracking.py). -By default this is turned on – you can opt out of event tracking at any time by adding the following to your `profiles.yml` file: +By default, this is enabled. dbt Core users can opt out of event tracking at any time by adding the following to your `profiles.yml` file: ```yaml config: send_anonymous_usage_stats: False ``` -You can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables). +dbt Core users can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables). `DO_NOT_TRACK=1` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=False` `DO_NOT_TRACK=0` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=True` diff --git a/website/docs/reference/macro-properties.md b/website/docs/reference/macro-properties.md index 91ba52de9ca..91a616ded0d 100644 --- a/website/docs/reference/macro-properties.md +++ b/website/docs/reference/macro-properties.md @@ -1,10 +1,13 @@ --- title: Macro properties +id: macro-properties --- -Macro properties can be declared in `.yml` files. +import PropsCallout from '/snippets/_config-prop-callout.md'; -You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders. 
+Macro properties can be declared in any `properties.yml` file. + +You can name these files `whatever_you_want.yml` and nest them arbitrarily deep in sub-folders. @@ -27,9 +30,3 @@ macros: ``` - - - -* `v0.16.0`: The ability to declare macro properties was introduced. - - diff --git a/website/docs/reference/model-properties.md b/website/docs/reference/model-properties.md index 730432c88af..63adc1f0d63 100644 --- a/website/docs/reference/model-properties.md +++ b/website/docs/reference/model-properties.md @@ -18,7 +18,7 @@ models: show: true | false [latest_version](/reference/resource-properties/latest_version): [deprecation_date](/reference/resource-properties/deprecation_date): - [access](/reference/resource-properties/access): private | protected | public + [access](/reference/resource-configs/access): private | protected | public [config](/reference/resource-properties/config): [](/reference/model-configs): [constraints](/reference/resource-properties/constraints): @@ -46,7 +46,7 @@ models: [description](/reference/resource-properties/description): [docs](/reference/resource-configs/docs): show: true | false - [access](/reference/resource-properties/access): private | protected | public + [access](/reference/resource-configs/access): private | protected | public [constraints](/reference/resource-properties/constraints): - [config](/reference/resource-properties/config): diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md index 6079e53793a..03c3b2aac12 100644 --- a/website/docs/reference/node-selection/defer.md +++ b/website/docs/reference/node-selection/defer.md @@ -2,13 +2,6 @@ title: "Defer" --- - - -- **v0.18.0**: Introduced `--defer` and `--state` flags as beta features. -- **v0.19.0**: Changed `--defer` to use the current environment's resource, if it exists, and only fall back to the other environment's resource if the first does not. Also added support for `dbt test --defer`. - - - Defer is a powerful feature that makes it possible to run a subset of models or tests in a [sandbox environment](/docs/environments-in-dbt) without having to first build their upstream parents. This can save time and computational resources when you want to test a small number of models in a large project. Defer requires that a manifest from a previous dbt invocation be passed to the `--state` flag or env var. Together with the `state:` selection method, these features enable "Slim CI". Read more about [state](/reference/node-selection/syntax#about-node-selection). @@ -24,16 +17,16 @@ It is possible to use separate state for `state:modified` and `--defer`, by pass ### Usage ```shell -$ dbt run --select [...] --defer --state path/to/artifacts -$ dbt test --select [...] --defer --state path/to/artifacts +dbt run --select [...] --defer --state path/to/artifacts +dbt test --select [...] --defer --state path/to/artifacts ``` ```shell -$ dbt run --models [...] --defer --state path/to/artifacts -$ dbt test --models [...] --defer --state path/to/artifacts +dbt run --models [...] --defer --state path/to/artifacts +dbt test --models [...] --defer --state path/to/artifacts ``` @@ -108,7 +101,7 @@ I want to test my changes. Nothing exists in my development schema, `dev_alice`. 
```shell -$ dbt run --select model_b +dbt run --select "model_b" ``` @@ -135,7 +128,7 @@ Unless I had previously run `model_a` into this development environment, `dev_al ```shell -$ dbt run --select model_b --defer --state prod-run-artifacts +dbt run --select "model_b" --defer --state prod-run-artifacts ``` @@ -193,7 +186,7 @@ models: ```shell -dbt test --select model_b +dbt test --select "model_b" ``` @@ -218,7 +211,7 @@ The `relationships` test requires both `model_a` and `model_b`. Because I did no ```shell -dbt test --select model_b --defer --state prod-run-artifacts +dbt test --select "model_b" --defer --state prod-run-artifacts ``` diff --git a/website/docs/reference/node-selection/exclude.md b/website/docs/reference/node-selection/exclude.md index 9ad4bd1cc0e..d2c140d1bb5 100644 --- a/website/docs/reference/node-selection/exclude.md +++ b/website/docs/reference/node-selection/exclude.md @@ -7,19 +7,19 @@ sidebar_label: "Exclude" dbt provides an `--exclude` flag with the same semantics as `--select`. Models specified with the `--exclude` flag will be removed from the set of models selected with `--select`. ```bash -$ dbt run --select my_package.*+ --exclude my_package.a_big_model+ # select all models in my_package and their children except a_big_model and its children +dbt run --select "my_package".*+ --exclude "my_package.a_big_model+" # select all models in my_package and their children except a_big_model and its children ``` Exclude a specific resource by its name or lineage: ```bash # test -$ dbt test --exclude not_null_orders_order_id # test all models except the not_null_orders_order_id test -$ dbt test --exclude orders # test all models except tests associated with the orders model +dbt test --exclude "not_null_orders_order_id" # test all models except the not_null_orders_order_id test +dbt test --exclude "orders" # test all models except tests associated with the orders model # seed -$ dbt seed --exclude account_parent_mappings # load all seeds except account_parent_mappings +dbt seed --exclude "account_parent_mappings" # load all seeds except account_parent_mappings # snapshot -$ dbt snapshot --exclude snap_order_statuses # execute all snapshots except snap_order_statuses +dbt snapshot --exclude "snap_order_statuses" # execute all snapshots except snap_order_statuses ``` diff --git a/website/docs/reference/node-selection/graph-operators.md b/website/docs/reference/node-selection/graph-operators.md index 1e7c88fadfc..8cba43e1b52 100644 --- a/website/docs/reference/node-selection/graph-operators.md +++ b/website/docs/reference/node-selection/graph-operators.md @@ -7,9 +7,9 @@ If placed at the front of the model selector, `+` will select all parents of the ```bash - $ dbt run --select my_model+ # select my_model and all children - $ dbt run --select +my_model # select my_model and all parents - $ dbt run --select +my_model+ # select my_model, and all of its parents and children +dbt run --select "my_model+" # select my_model and all children +dbt run --select "+my_model" # select my_model and all parents +dbt run --select "+my_model+" # select my_model, and all of its parents and children ``` @@ -20,9 +20,9 @@ to step through. 
```bash - $ dbt run --select my_model+1 # select my_model and its first-degree children - $ dbt run --select 2+my_model # select my_model, its first-degree parents, and its second-degree parents ("grandparents") - $ dbt run --select 3+my_model+4 # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree +dbt run --select "my_model+1" # select my_model and its first-degree children +dbt run --select "2+my_model" # select my_model, its first-degree parents, and its second-degree parents ("grandparents") +dbt run --select "3+my_model+4" # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree ``` @@ -32,14 +32,5 @@ The `@` operator is similar to `+`, but will also include _the parents of the ch ```bash -$ dbt run --models @my_model # select my_model, its children, and the parents of its children +dbt run --models @my_model # select my_model, its children, and the parents of its children ``` - -### The "star" operator -The `*` operator matches all models within a package or directory. - - - ```bash - $ dbt run --select snowplow.* # run all of the models in the snowplow package - $ dbt run --select finance.base.* # run all of the models in models/finance/base - ``` diff --git a/website/docs/reference/node-selection/methods.md b/website/docs/reference/node-selection/methods.md index ff86d60c06a..e29612e3401 100644 --- a/website/docs/reference/node-selection/methods.md +++ b/website/docs/reference/node-selection/methods.md @@ -34,8 +34,8 @@ The `tag:` method is used to select models that match a specified [tag](/referen ```bash - $ dbt run --select tag:nightly # run all models with the `nightly` tag - ``` +dbt run --select "tag:nightly" # run all models with the `nightly` tag +``` ### The "source" method @@ -43,16 +43,22 @@ The `source` method is used to select models that select from a specified [sourc ```bash - $ dbt run --select source:snowplow+ # run all models that select from Snowplow sources - ``` +dbt run --select "source:snowplow+" # run all models that select from Snowplow sources +``` ### The "resource_type" method -Use the `resource_type` method to select nodes of a particular type (`model`, `source`, `exposure`, etc). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list). +Use the `resource_type` method to select nodes of a particular type (`model`, `test`, `exposure`, and so on). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list). ```bash - $ dbt build --select resource_type:exposure # build all resources upstream of exposures - $ dbt list --select resource_type:test # list all tests in your project - ``` +dbt build --select "resource_type:exposure" # build all resources upstream of exposures +dbt list --select "resource_type:test" # list all tests in your project +``` + +Note: This method doesn't work for sources, so use the [`--resource-type`](/reference/commands/list) option of the list command instead: + + ```bash +dbt list --resource-type source +``` ### The "path" method The `path` method is used to select models/sources defined at or under a specific path. @@ -63,30 +69,39 @@ selectors unambiguous. 
```bash # These two selectors are equivalent - dbt run --select path:models/staging/github - dbt run --select models/staging/github + dbt run --select "path:models/staging/github" + dbt run --select "models/staging/github" # These two selectors are equivalent - dbt run --select path:models/staging/github/stg_issues.sql - dbt run --select models/staging/github/stg_issues.sql + dbt run --select "path:models/staging/github/stg_issues.sql" + dbt run --select "models/staging/github/stg_issues.sql" ``` -### The "file" or "fqn" method -The `file` or `fqn` method can be used to select a model by its filename, including the file extension (`.sql`). +### The "file" method +The `file` method can be used to select a model by its filename, including the file extension (`.sql`). ```bash # These are equivalent -dbt run --select file:some_model.sql -dbt run --select some_model.sql -dbt run --select some_model -dbt run --select fqn:some_model # fqn is an abbreviation for "fully qualified name" +dbt run --select "file:some_model.sql" +dbt run --select "some_model.sql" +dbt run --select "some_model" ``` +### The "fqn" method + +The `fqn` method is used to select nodes based off their "fully qualified names" (FQN) within the dbt graph. The default output of [`dbt list`](/reference/commands/list) is a listing of FQN. + +```bash +dbt run --select "fqn:some_model" +dbt run --select "fqn:your_project.some_model" +dbt run --select "fqn:some_package.some_other_model" +``` + ### The "package" method The `package` method is used to select models defined within the root project @@ -96,10 +111,10 @@ selectors unambiguous. ```bash # These three selectors are equivalent - dbt run --select package:snowplow - dbt run --select snowplow - dbt run --select snowplow.* - ``` + dbt run --select "package:snowplow" + dbt run --select "snowplow" + dbt run --select "snowplow.*" +``` ### The "config" method @@ -109,10 +124,10 @@ The `config` method is used to select models that match a specified [node config ```bash - $ dbt run --select config.materialized:incremental # run all models that are materialized incrementally - $ dbt run --select config.schema:audit # run all models that are created in the `audit` schema - $ dbt run --select config.cluster_by:geo_country # run all models clustered by `geo_country` - ``` +dbt run --select "config.materialized:incremental" # run all models that are materialized incrementally +dbt run --select "config.schema:audit" # run all models that are created in the `audit` schema +dbt run --select "config.cluster_by:geo_country" # run all models clustered by `geo_country` +``` @@ -120,7 +135,8 @@ The `config` method is used to select models that match a specified [node config While most config values are strings, you can also use the `config` method to match boolean configs, dictionary keys, and values in lists. For example, given a model with the following configurations: -``` + +```bash {{ config( materialized = 'incremental', unique_key = ['column_a', 'column_b'], @@ -133,27 +149,24 @@ select ... 
You can select using any of the following: ```bash -$ dbt ls -s config.materialized:incremental -$ dbt ls -s config.unique_key:column_a -$ dbt ls -s config.grants.select:reporter -$ dbt ls -s config.transient:true +dbt ls -s config.materialized:incremental +dbt ls -s config.unique_key:column_a +dbt ls -s config.grants.select:reporter +dbt ls -s config.transient:true ``` ### The "test_type" method - -In v1.0.0, test types were renamed: "singular" (instead of "data") and "generic" (instead of "schema") - The `test_type` method is used to select tests based on their type, `singular` or `generic`: - ```bash - $ dbt test --select test_type:generic # run all generic tests - $ dbt test --select test_type:singular # run all singular tests - ``` +```bash +dbt test --select "test_type:generic" # run all generic tests +dbt test --select "test_type:singular" # run all singular tests +``` ### The "test_name" method @@ -164,10 +177,10 @@ that defines it. For more information about how generic tests are defined, read ```bash - $ dbt test --select test_name:unique # run all instances of the `unique` test - $ dbt test --select test_name:equality # run all instances of the `dbt_utils.equality` test - $ dbt test --select test_name:range_min_max # run all instances of a custom schema test defined in the local project, `range_min_max` - ``` +dbt test --select "test_name:unique" # run all instances of the `unique` test +dbt test --select "test_name:equality" # run all instances of the `dbt_utils.equality` test +dbt test --select "test_name:range_min_max" # run all instances of a custom schema test defined in the local project, `range_min_max` +``` ### The "state" method @@ -192,9 +205,9 @@ The `state` method is used to select nodes by comparing them against a previous ```bash - $ dbt test --select state:new # run all tests on new models + and new tests on old models - $ dbt run --select state:modified # run all models that have been modified - $ dbt ls --select state:modified # list all modified nodes (not just models) +dbt test --select "state:new " # run all tests on new models + and new tests on old models +dbt run --select "state:modified" # run all models that have been modified +dbt ls --select "state:modified" # list all modified nodes (not just models) ``` @@ -224,41 +237,32 @@ The `exposure` method is used to select parent resources of a specified [exposur ```bash - $ dbt run --select +exposure:weekly_kpis # run all models that feed into the weekly_kpis exposure - $ dbt test --select +exposure:* # test all resources upstream of all exposures - $ dbt ls --select +exposure:* --resource-type source # list all sources upstream of all exposures - ``` +dbt run --select "+exposure:weekly_kpis" # run all models that feed into the weekly_kpis exposure +dbt test --select "+exposure:*" # test all resources upstream of all exposures +dbt ls --select "+exposure:*" --resource-type source # list all sources upstream of all exposures +``` ### The "metric" method -New in v1.0.0 The `metric` method is used to select parent resources of a specified [metric](/docs/build/metrics). Use in conjunction with the `+` operator. 
```bash -$ dbt build --select +metric:weekly_active_users # build all resources upstream of weekly_active_users metric -$ dbt ls --select +metric:* --resource-type source # list all source tables upstream of all metrics +dbt build --select "+metric:weekly_active_users" # build all resources upstream of weekly_active_users metric +dbt ls --select "+metric:*" --resource-type source # list all source tables upstream of all metrics ``` ### The "result" method -New in v1.0.0 The `result` method is related to the `state` method described above and can be used to select resources based on their result status from a prior run. Note that one of the dbt commands [`run`, `test`, `build`, `seed`] must have been performed in order to create the result on which a result selector operates. You can use `result` selectors in conjunction with the `+` operator. ```bash -$ dbt run --select result:error --state path/to/artifacts # run all models that generated errors on the prior invocation of dbt run -$ dbt test --select result:fail --state path/to/artifacts # run all tests that failed on the prior invocation of dbt test -$ dbt build --select 1+result:fail --state path/to/artifacts # run all the models associated with failed tests from the prior invocation of dbt build -$ dbt seed --select result:error --state path/to/artifacts # run all seeds that generated errors on the prior invocation of dbt seed. +dbt run --select "result:error" --state path/to/artifacts # run all models that generated errors on the prior invocation of dbt run +dbt test --select "result:fail" --state path/to/artifacts # run all tests that failed on the prior invocation of dbt test +dbt build --select "1+result:fail" --state path/to/artifacts # run all the models associated with failed tests from the prior invocation of dbt build +dbt seed --select "result:error" --state path/to/artifacts # run all seeds that generated errors on the prior invocation of dbt seed. ``` ### The "source_status" method - - -Supported in v1.1 or newer. - - - - Supported in v1.1 or higher. @@ -273,8 +277,8 @@ After issuing one of the above commands, you can reference the source freshness ```bash # You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. -$ dbt source freshness # must be run again to compare current to previous state -$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts +dbt source freshness # must be run again to compare current to previous state +dbt build --select "source_status:fresher+" --state path/to/prod/artifacts ``` @@ -283,16 +287,13 @@ $ dbt build --select source_status:fresher+ --state path/to/prod/artifacts ```bash # You can also set the DBT_STATE environment variable instead of the --state flag. -$ dbt source freshness # must be run again to compare current to previous state -$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts +dbt source freshness # must be run again to compare current to previous state +dbt build --select "source_status:fresher+" --state path/to/prod/artifacts ``` - - - ### The "group" method @@ -305,9 +306,9 @@ Supported in v1.5 or newer. The `group` method is used to select models defined within a [group](/reference/resource-configs/group). - ```bash - dbt run --select group:finance # run all models that belong to the finance group. - ``` +```bash +dbt run --select "group:finance" # run all models that belong to the finance group. +``` @@ -321,12 +322,12 @@ Supported in v1.5 or newer. 
-The `access` method selects models based on their [access](/reference/resource-properties/access) property. +The `access` method selects models based on their [access](/reference/resource-configs/access) property. ```bash -dbt list --select access:public # list all public models -dbt list --select access:private # list all private models -dbt list --select access:protected # list all protected models +dbt list --select "access:public" # list all public models +dbt list --select "access:private" # list all private models +dbt list --select "access:protected" # list all protected models ``` @@ -344,11 +345,26 @@ Supported in v1.5 or newer. The `version` method selects [versioned models](/docs/collaborate/govern/model-versions) based on their [version identifier](/reference/resource-properties/versions) and [latest version](/reference/resource-properties/latest_version). ```bash -dbt list --select version:latest # only 'latest' versions -dbt list --select version:prerelease # versions newer than the 'latest' version +dbt list --select "version:latest" # only 'latest' versions +dbt list --select "version:prerelease" # versions newer than the 'latest' version dbt list --select version:old # versions older than the 'latest' version -dbt list --select version:none # models that are *not* versioned +dbt list --select "version:none" # models that are *not* versioned ``` + +### The "semantic_model" method + +Supported in v1.6 or newer. + + + +The `semantic_model` method selects [semantic models](/docs/build/semantic-models). + +```bash +dbt list --select "semantic_model:*" # list all semantic models +dbt list --select "+semantic_model:orders" # list your semantic model named "orders" and all upstream resources +``` + + \ No newline at end of file diff --git a/website/docs/reference/node-selection/putting-it-together.md b/website/docs/reference/node-selection/putting-it-together.md index 8faf02e6cc9..48fc5188b32 100644 --- a/website/docs/reference/node-selection/putting-it-together.md +++ b/website/docs/reference/node-selection/putting-it-together.md @@ -4,16 +4,16 @@ title: "Putting it together" ```bash - $ dbt run --select my_package.*+ # select all models in my_package and their children - $ dbt run --select +some_model+ # select some_model and all parents and children +dbt run --select "my_package.*+" # select all models in my_package and their children +dbt run --select "+some_model+" # select some_model and all parents and children - $ dbt run --select tag:nightly+ # select "nightly" models and all children - $ dbt run --select +tag:nightly+ # select "nightly" models and all parents and children +dbt run --select "tag:nightly+" # select "nightly" models and all children +dbt run --select "+tag:nightly+" # select "nightly" models and all parents and children - $ dbt run --select @source:snowplow # build all models that select from snowplow sources, plus their parents +dbt run --select "@source:snowplow" # build all models that select from snowplow sources, plus their parents - $ dbt test --select config.incremental_strategy:insert_overwrite,test_name:unique # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy - ``` +dbt test --select "config.incremental_strategy:insert_overwrite,test_name:unique" # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy +``` @@ -22,8 +22,8 @@ and feed exports, while _excluding_ the biggest incremental models (and one othe ```bash - $ dbt run --select 
@source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing - ``` +dbt run --select "@source:snowplow,tag:nightly models/export" --exclude "package:snowplow,config.materialized:incremental export_performance_timing" +``` This command selects all models that: diff --git a/website/docs/reference/node-selection/set-operators.md b/website/docs/reference/node-selection/set-operators.md index 7d6b6c2411c..af399b9cad5 100644 --- a/website/docs/reference/node-selection/set-operators.md +++ b/website/docs/reference/node-selection/set-operators.md @@ -11,7 +11,7 @@ Run snowplow_sessions, all ancestors of snowplow_sessions, fct_orders, and all a ```bash - $ dbt run --select +snowplow_sessions +fct_orders +dbt run --select "+snowplow_sessions +fct_orders" ``` ### Intersections @@ -22,15 +22,15 @@ Run all the common ancestors of snowplow_sessions and fct_orders: ```bash - $ dbt run --select +snowplow_sessions,+fct_orders - ``` +dbt run --select "+snowplow_sessions,+fct_orders" +``` Run all the common descendents of stg_invoices and stg_accounts: ```bash - $ dbt run --select stg_invoices+,stg_accounts+ +dbt run --select "stg_invoices+,stg_accounts+" ``` @@ -38,5 +38,5 @@ Run models that are in the marts/finance subdirectory *and* tagged nightly: ```bash - $ dbt run --select marts.finance,tag:nightly - ``` +dbt run --select "marts.finance,tag:nightly" +``` diff --git a/website/docs/reference/node-selection/state-comparison-caveats.md b/website/docs/reference/node-selection/state-comparison-caveats.md index 6ae156fddcf..73947c80a66 100644 --- a/website/docs/reference/node-selection/state-comparison-caveats.md +++ b/website/docs/reference/node-selection/state-comparison-caveats.md @@ -27,8 +27,8 @@ The command `dbt test -s state:modified` will include both: As long as you're adding or changing tests at the same time that you're adding or changing the resources (models, seeds, snapshots) they select from, all should work the way you expect with "simple" state selection: ```shell -$ dbt run -s state:modified -$ dbt test -s state:modified +dbt run -s "state:modified" +dbt test -s "state:modified" ``` This can get complicated, however. If you add a new test without modifying its underlying model, or add a test that selects from a new model and an old unmodified one, you may need to test a model without having first run it. @@ -36,8 +36,8 @@ This can get complicated, however. If you add a new test without modifying its u In v0.18.0, you needed to handle this by building the unmodified models needed for modified tests: ```shell -$ dbt run -s state:modified @state:modified,1+test_type:data -$ dbt test -s state:modified +dbt run -s "state:modified @state:modified,1+test_type:data" +dbt test -s "state:modified" ``` In v0.19.0, dbt added support for deferring upstream references when testing. If a test selects from a model that doesn't exist as a database object in your current environment, dbt will look to the other environment instead—the one defined in your state manifest. This enables you to use "simple" state selection without risk of query failure, but it may have some surprising consequences for tests with multiple parents. For instance, if you have a `relationships` test that depends on one modified model and one unmodified model, the test query will select from data "across" two different environments. 
If you limit or sample your data in development and CI, it may not make much sense to test for referential integrity, knowing there's a good chance of mismatch. @@ -45,8 +45,8 @@ In v0.19.0, dbt added support for deferring upstream references when testing. If If you're a frequent user of `relationships` tests or data tests, or frequently find yourself adding tests without modifying their underlying models, consider tweaking the selection criteria of your CI job. For instance: ```shell -$ dbt run -s state:modified -$ dbt test -s state:modified --exclude test_name:relationships +dbt run -s "state:modified" +dbt test -s "state:modified" --exclude "test_name:relationships" ``` ### False positives @@ -58,14 +58,7 @@ State comparison works by identifying discrepancies between two manifests. Thos dbt will do its best to capture *only* changes that are the result of modifications made in development. In projects with intricate env-aware logic, dbt will err on the side of running too many models (i.e. false positives). Over the next several versions of dbt, we're working on: - iterative improvements to dbt's built-in detective abilities -- better options for more complex projects, in the form of more-specific subselectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704)) - - - -- v0.18.0: All env-aware logic results in false positives during state comparison, when comparing against a manifest generated with a different target. -- v0.19.0: dbt stores and compares unrendered Jinja expressions for configurations, allowing it to see past env-aware logic in `dbt_project.yml`. - - +- better options for more complex projects, in the form of more-specific sub-selectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704)) State comparison is now able to detect env-aware config in `dbt_project.yml`. For instance, this target-based config would register as a modification in v0.18.0, but in v0.19.0 it no longer will: diff --git a/website/docs/reference/node-selection/syntax.md b/website/docs/reference/node-selection/syntax.md index 1a43a32e2bc..d0ea4a9acd8 100644 --- a/website/docs/reference/node-selection/syntax.md +++ b/website/docs/reference/node-selection/syntax.md @@ -14,6 +14,7 @@ dbt's node selection syntax makes it possible to run only specific resources in | [compile](/reference/commands/compile) | `--select`, `--exclude`, `--selector`, `--inline` | | [freshness](/reference/commands/source) | `--select`, `--exclude`, `--selector` | | [build](/reference/commands/build) | `--select`, `--exclude`, `--selector`, `--resource-type`, `--defer` | +| [docs generate](/reference/commands/cmd-docs) | `--select`, `--exclude`, `--selector` | :::info Nodes and resources @@ -24,6 +25,8 @@ We use the terms " By default, `dbt run` executes _all_ of the models in the dependency graph; `dbt seed` creates all seeds, `dbt snapshot` performs every snapshot. The `--select` flag is used to specify a subset of nodes to execute. +To follow [POSIX standards](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap12.html) and make things easier to understand, we recommend CLI users use quotes when passing arguments to the `--select` or `--exclude` option (including single or multiple space-delimited, or comma-delimited arguments). Not using quotes might not work reliably on all operating systems, terminals, and user interfaces. For example, `dbt run --select "my_dbt_project_name"` runs all models in your project. + ### How does selection work? 1. 
dbt gathers all the resources that are matched by one or more of the `--select` criteria, in the order of selection methods (e.g. `tag:`), then graph operators (e.g. `+`), then finally set operators ([unions](/reference/node-selection/set-operators#unions), [intersections](/reference/node-selection/set-operators#intersections), [exclusions](/reference/node-selection/exclude)). @@ -51,28 +54,28 @@ Examples: ```bash - $ dbt run --select my_dbt_project_name # runs all models in your project - $ dbt run --select my_dbt_model # runs a specific model - $ dbt run --select path.to.my.models # runs all models in a specific directory - $ dbt run --select my_package.some_model # run a specific model in a specific package - $ dbt run --select tag:nightly # run models with the "nightly" tag - $ dbt run --select path/to/models # run models contained in path/to/models - $ dbt run --select path/to/my_model.sql # run a specific model by its path +dbt run --select "my_dbt_project_name" # runs all models in your project +dbt run --select "my_dbt_model" # runs a specific model +dbt run --select "path.to.my.models" # runs all models in a specific directory +dbt run --select "my_package.some_model" # run a specific model in a specific package +dbt run --select "tag:nightly" # run models with the "nightly" tag +dbt run --select "path/to/models" # run models contained in path/to/models +dbt run --select "path/to/my_model.sql" # run a specific model by its path ``` dbt supports a shorthand language for defining subsets of nodes. This language uses the characters `+`, `@`, `*`, and `,`. ```bash - # multiple arguments can be provided to --select - $ dbt run --select my_first_model my_second_model +# multiple arguments can be provided to --select + dbt run --select "my_first_model my_second_model" - # these arguments can be projects, models, directory paths, tags, or sources - $ dbt run --select tag:nightly my_model finance.base.* +# these arguments can be projects, models, directory paths, tags, or sources +dbt run --select "tag:nightly my_model finance.base.*" - # use methods and intersections for more complex selectors - $ dbt run --select path:marts/finance,tag:nightly,config.materialized:table - ``` +# use methods and intersections for more complex selectors +dbt run --select "path:marts/finance,tag:nightly,config.materialized:table" +``` As your selection logic gets more complex, and becomes unwieldly to type out as command-line arguments, consider using a [yaml selector](/reference/node-selection/yaml-selectors). You can use a predefined definition with the `--selector` flag. @@ -93,7 +96,7 @@ by comparing code in the current project against the state manifest. - [Deferring](/reference/node-selection/defer) to another environment, whereby dbt can identify upstream, unselected resources that don't exist in your current environment and instead "defer" their references to the environment provided by the state manifest. - The [`dbt clone` command](/reference/commands/clone), whereby dbt can clone nodes based on their location in the manifest provided to the `--state` flag. -Together, the `state:` selector and deferral enable ["slim CI"](/guides/legacy/best-practices#run-only-modified-models-to-test-changes-slim-ci). We expect to add more features in future releases that can leverage artifacts passed to the `--state` flag. +Together, the `state:` selector and deferral enable ["slim CI"](/best-practices/best-practice-workflows#run-only-modified-models-to-test-changes-slim-ci). 
We expect to add more features in future releases that can leverage artifacts passed to the `--state` flag. ### Establishing state @@ -150,7 +153,7 @@ After issuing one of the above commands, you can reference the results by adding ```bash # You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. -$ dbt run --select result: --defer --state path/to/prod/artifacts +dbt run --select "result:" --defer --state path/to/prod/artifacts ``` The available options depend on the resource (node) type: @@ -169,19 +172,11 @@ The available options depend on the resource (node) type: The state and result selectors can also be combined in a single invocation of dbt to capture errors from a previous run OR any new or modified models. ```bash -$ dbt run --select result:+ state:modified+ --defer --state ./ +dbt run --select "result:+ state:modified+" --defer --state ./ ``` ### Fresh rebuilds - - -Only supported by v1.1 or newer. - - - - - Only supported by v1.1 or newer. When a job is selected, dbt Cloud will surface the artifacts from that job's most recent successful run. dbt will then use those artifacts to determine the set of fresh sources. In your job commands, you can signal to dbt to run and test only on these fresher sources and their children by including the `source_status:fresher+` argument. This requires both previous and current state to have the `sources.json` artifact be available. Or plainly said, both job states need to run `dbt source freshness`. As example: ```bash # Command step order dbt source freshness -dbt build --select source_status:fresher+ +dbt build --select "source_status:fresher+" ``` - -For more example commands, refer to [Pro-tips for workflows](/guides/legacy/best-practices.md#pro-tips-for-workflows). +For more example commands, refer to [Pro-tips for workflows](/best-practices/best-practice-workflows#pro-tips-for-workflows). ### The "source_status" status - - -Only supported by v1.1 or newer. - - - - Only supported by v1.1 or newer. @@ -218,7 +205,6 @@ After issuing one of the above commands, you can reference the source freshness ```bash # You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. -$ dbt source freshness # must be run again to compare current to previous state -$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts +dbt source freshness # must be run again to compare current to previous state +dbt build --select "source_status:fresher+" --state path/to/prod/artifacts ``` - diff --git a/website/docs/reference/node-selection/test-selection-examples.md b/website/docs/reference/node-selection/test-selection-examples.md index 85141c8cd01..feb3898c230 100644 --- a/website/docs/reference/node-selection/test-selection-examples.md +++ b/website/docs/reference/node-selection/test-selection-examples.md @@ -11,34 +11,22 @@ Like all resource types, tests can be selected **directly**, by methods and oper Unlike other resource types, tests can also be selected **indirectly**. If a selection method or operator includes a test's parent(s), the test will also be selected. [See below](#indirect-selection) for more details. - - - `v1.0.0`: Renamed the `--greedy` flag/property to `indirect_selection`, and set its default back to "eager" (pre-v0.20). You can achieve the "cautious" behavior introduced in v0.20 by setting the flag/property to `cautious`. - - - Test selection is powerful, and we know it can be tricky. 
To that end, we've included lots of examples below: ### Direct selection - - -`v1.0.0`: Renamed test types: "generic" (formerly "schema") and "singular" (formerly "data"). Removed support for the `--schema` and `--data` flags. - - - Run generic tests only: ```bash - $ dbt test --select test_type:generic + dbt test --select "test_type:generic" ``` Run singular tests only: ```bash - $ dbt test --select test_type:singular + dbt test --select "test_type:singular" ``` In both cases, `test_type` checks a property of the test itself. These are forms of "direct" test selection. @@ -99,8 +87,8 @@ By default, a test will run when ANY parent is selected; we call this "eager" in In this mode, any test that depends on unbuilt resources will raise an error. ```shell -$ dbt test --select orders -$ dbt build --select orders +dbt test --select "orders" +dbt build --select "orders" ```
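One practical aside the examples above don't show: the indirect selection mode can usually be set once rather than repeated as a flag on every command. A minimal sketch, assuming the `DBT_INDIRECT_SELECTION` environment variable (the env-var counterpart of `--indirect-selection`) is available in your dbt version:

```shell
# Assumption: DBT_INDIRECT_SELECTION mirrors the --indirect-selection flag;
# accepted values follow the modes documented in this file.
DBT_INDIRECT_SELECTION=cautious dbt build --select "orders"
```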
      @@ -114,8 +102,10 @@ It will only include tests whose references are each within the selected nodes. Put another way, it will prevent tests from running if one or more of its parents is unselected. ```shell -$ dbt test --select orders --indirect-selection=cautious -$ dbt build --select orders --indirect-selection=cautious + +dbt test --select "orders" --indirect-selection=cautious +dbt build --select "orders" --indirect-selection=cautious + ``` @@ -134,8 +124,8 @@ By default, a test will run when ANY parent is selected; we call this "eager" in In this mode, any test that depends on unbuilt resources will raise an error. ```shell -$ dbt test --select orders -$ dbt build --select orders +dbt test --select "orders" +dbt build --select "orders" ``` @@ -149,8 +139,10 @@ It will only include tests whose references are each within the selected nodes. Put another way, it will prevent tests from running if one or more of its parents is unselected. ```shell -$ dbt test --select orders --indirect-selection=cautious -$ dbt build --select orders --indirect-selection=cautious + +dbt test --select "orders" --indirect-selection=cautious +dbt build --select "orders" --indirect-selection=cautious + ``` @@ -164,8 +156,9 @@ It will only include tests whose references are each within the selected nodes ( This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input). ```shell -$ dbt test --select orders --indirect-selection=buildable -$ dbt build --select orders --indirect-selection=buildable +dbt test --select "orders" --indirect-selection=buildable +dbt build --select "orders" --indirect-selection=buildable + ``` @@ -184,8 +177,8 @@ By default, a test will run when ANY parent is selected; we call this "eager" in In this mode, any test that depends on unbuilt resources will raise an error. ```shell -$ dbt test --select orders -$ dbt build --select orders +dbt test --select "orders" +dbt build --select "orders" ``` @@ -199,8 +192,9 @@ It will only include tests whose references are each within the selected nodes. Put another way, it will prevent tests from running if one or more of its parents is unselected. ```shell -$ dbt test --select orders --indirect-selection=cautious -$ dbt build --select orders --indirect-selection=cautious +dbt test --select "orders" --indirect-selection=cautious +dbt build --select "orders" --indirect-selection=cautious + ``` @@ -214,8 +208,8 @@ It will only include tests whose references are each within the selected nodes ( This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input). ```shell -$ dbt test --select orders --indirect-selection=buildable -$ dbt build --select orders --indirect-selection=buildable +dbt test --select "orders" --indirect-selection=buildable +dbt build --select "orders" --indirect-selection=buildable ``` @@ -225,8 +219,10 @@ $ dbt build --select orders --indirect-selection=buildable This mode will only include tests whose references are each within the selected nodes and will ignore all tests from attached nodes. 
```shell -$ dbt test --select orders --indirect-selection=empty -$ dbt build --select orders --indirect-selection=empty + +dbt test --select "orders" --indirect-selection=empty +dbt build --select "orders" --indirect-selection=empty + ``` @@ -246,22 +242,25 @@ The following examples should feel somewhat familiar if you're used to executing ```bash # Run tests on a model (indirect selection) - $ dbt test --select customers + dbt test --select "customers" + + # Run tests on two or more specific models (indirect selection) + dbt test --select "customers orders" # Run tests on all models in the models/staging/jaffle_shop directory (indirect selection) - $ dbt test --select staging.jaffle_shop + dbt test --select "staging.jaffle_shop" # Run tests downstream of a model (note this will select those tests directly!) - $ dbt test --select stg_customers+ + dbt test --select "stg_customers+" # Run tests upstream of a model (indirect selection) - $ dbt test --select +stg_customers + dbt test --select "+stg_customers" # Run tests on all models with a particular tag (direct + indirect) - $ dbt test --select tag:my_model_tag + dbt test --select "tag:my_model_tag" # Run tests on all models with a particular materialization (indirect selection) - $ dbt test --select config.materialized:table + dbt test --select "config.materialized:table" ``` @@ -270,16 +269,20 @@ The following examples should feel somewhat familiar if you're used to executing ```bash # tests on all sources - $ dbt test --select source:* + + dbt test --select "source:*" # tests on one source - $ dbt test --select source:jaffle_shop + dbt test --select "source:jaffle_shop" + + # tests on two or more specific sources + dbt test --select "source:jaffle_shop source:raffle_bakery" # tests on one source table - $ dbt test --select source:jaffle_shop.customers + dbt test --select "source:jaffle_shop.customers" # tests on everything _except_ sources - $ dbt test --exclude source:* + dbt test --exclude "source:*" ``` ### More complex selection @@ -288,10 +291,12 @@ Through the combination of direct and indirect selection, there are many ways to ```bash - $ dbt test --select assert_total_payment_amount_is_positive # directly select the test by name - $ dbt test --select payments,test_type:singular # indirect selection, v1.2 - $ dbt test --select payments,test_type:data # indirect selection, v0.18.0 - $ dbt test --select payments --data # indirect selection, earlier versions + + dbt test --select "assert_total_payment_amount_is_positive" # directly select the test by name + dbt test --select "payments,test_type:singular" # indirect selection, v1.2 + dbt test --select "payments,test_type:data" # indirect selection, v0.18.0 + dbt test --select "payments" --data # indirect selection, earlier versions + ``` @@ -300,13 +305,14 @@ Through the combination of direct and indirect selection, there are many ways to ```bash # Run tests on all models with a particular materialization - $ dbt test --select config.materialized:table + dbt test --select "config.materialized:table" # Run tests on all seeds, which use the 'seed' materialization - $ dbt test --select config.materialized:seed + dbt test --select "config.materialized:seed" # Run tests on all snapshots, which use the 'snapshot' materialization - $ dbt test --select config.materialized:snapshot + dbt test --select "config.materialized:snapshot" + ``` Note that this functionality may change in future versions of dbt. 
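One more illustrative combination, offered here as a sketch rather than taken from the file above: selection methods compose with intersections for tests exactly as they do for models, which can narrow indirect selection further.

```bash
# hypothetical example: run only generic tests attached to models tagged "nightly"
dbt test --select "tag:nightly,test_type:generic"
```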
@@ -324,8 +330,8 @@ models: - name: orders columns: - name: order_id - tests: tags: [my_column_tag] + tests: - unique ``` @@ -334,7 +340,8 @@ models: ```bash - $ dbt test --select tag:my_column_tag + dbt test --select "tag:my_column_tag" + ``` Currently, tests "inherit" tags applied to columns, sources, and source tables. They do _not_ inherit tags applied to models, seeds, or snapshots. In all likelihood, those tests would still be selected indirectly, because the tag selects its parent. This is a subtle distinction, and it may change in future versions of dbt. @@ -362,5 +369,6 @@ models: ```bash - $ dbt test --select tag:my_test_tag + dbt test --select "tag:my_test_tag" + ``` diff --git a/website/docs/reference/node-selection/yaml-selectors.md b/website/docs/reference/node-selection/yaml-selectors.md index 78342e32779..1e3f8d8d1e2 100644 --- a/website/docs/reference/node-selection/yaml-selectors.md +++ b/website/docs/reference/node-selection/yaml-selectors.md @@ -34,6 +34,7 @@ Each `definition` is comprised of one or more arguments, which can be one of the Use the `union` and `intersection` operator-equivalent keywords to organize multiple arguments. ### CLI-style + ```yml definition: 'tag:nightly' @@ -42,6 +43,7 @@ definition: This simple syntax supports use of the `+`, `@`, and `*` [graph](/reference/node-selection/graph-operators) operators, but it does not support [set](/reference/node-selection/set-operators) operators or `exclude`. ### Key-value + ```yml definition: tag: nightly @@ -317,7 +319,7 @@ selectors: Then in our job definition: ```bash -$ dbt run --selector nightly_diet_snowplow +dbt run --selector nightly_diet_snowplow ``` ## Default @@ -325,6 +327,7 @@ $ dbt run --selector nightly_diet_snowplow Selectors may define a boolean `default` property. If a selector has `default: true`, dbt will use this selector's criteria when tasks do not define their own selection criteria. Let's say we define a default selector that only selects resources defined in our root project: + ```yml selectors: - name: root_project_only @@ -338,16 +341,18 @@ selectors: ``` If I run an "unqualified" command, dbt will use the selection criteria defined in `root_project_only`—that is, dbt will only build / freshness check / generate compiled SQL for resources defined in my root project. + ``` -$ dbt build -$ dbt source freshness -$ dbt docs generate +dbt build +dbt source freshness +dbt docs generate ``` If I run a command that defines its own selection criteria (via `--select`, `--exclude`, or `--selector`), dbt will ignore the default selector and use the flag criteria instead. It will not try to combine the two. -``` -$ dbt run --select model_a -$ dbt run --exclude model_a + +```bash +dbt run --select "model_a" +dbt run --exclude model_a ``` Only one selector may set `default: true` for a given invocation; otherwise, dbt will return an error. You may use a Jinja expression to adjust the value of `default` depending on the environment, however: diff --git a/website/docs/reference/programmatic-invocations.md b/website/docs/reference/programmatic-invocations.md index 8bd9bf84047..dfd5bae09e6 100644 --- a/website/docs/reference/programmatic-invocations.md +++ b/website/docs/reference/programmatic-invocations.md @@ -2,7 +2,7 @@ title: "Programmatic invocations" --- -In v1.5, dbt-core added support for programmatic invocations. The intent is to expose the existing dbt CLI via a Python entry point, such that top-level commands are callable from within a Python script or application. 
+In v1.5, dbt-core added support for programmatic invocations. The intent is to expose the existing dbt Core CLI via a Python entry point, such that top-level commands are callable from within a Python script or application. The entry point is a `dbtRunner` class, which allows you to `invoke` the same commands as on the CLI. @@ -30,7 +30,7 @@ Each command returns a `dbtRunnerResult` object, which has three attributes: - `result`: If the command completed (successfully or with handled errors), its result(s). Return type varies by command. - `exception`: If the dbt invocation encountered an unhandled error and did not complete, the exception it encountered. -There is a 1:1 correspondence between [CLI exit codes](reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation: +There is a 1:1 correspondence between [CLI exit codes](/reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation: | Scenario | CLI Exit Code | `success` | `result` | `exception` | |---------------------------------------------------------------------------------------------|--------------:|-----------|-------------------|-------------| diff --git a/website/docs/reference/project-configs/asset-paths.md b/website/docs/reference/project-configs/asset-paths.md index 97204923cb9..1fb3cf9f260 100644 --- a/website/docs/reference/project-configs/asset-paths.md +++ b/website/docs/reference/project-configs/asset-paths.md @@ -15,12 +15,6 @@ asset-paths: [directorypath] ## Definition Optionally specify a custom list of directories to copy to the `target` directory as part of the `docs generate` command. This is useful for rendering images in your repository in your project documentation. - - -* `v0.18.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details. - - - ## Default By default, dbt will not copy any additional files as part of docs generate, i.e. `asset-paths: []` diff --git a/website/docs/reference/project-configs/clean-targets.md b/website/docs/reference/project-configs/clean-targets.md index 119630b00b1..9b464840723 100644 --- a/website/docs/reference/project-configs/clean-targets.md +++ b/website/docs/reference/project-configs/clean-targets.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [target_path] --- - - -- **v1.0.0:** The `modules-path` has been updated to be [`packages-install-path`](/reference/project-configs/packages-install-path). The default value has also been updated to be `dbt-packages` from `dbt-modules`. - - - ```yml diff --git a/website/docs/reference/project-configs/config-version.md b/website/docs/reference/project-configs/config-version.md index 20947c03d62..804caf1328f 100644 --- a/website/docs/reference/project-configs/config-version.md +++ b/website/docs/reference/project-configs/config-version.md @@ -20,12 +20,7 @@ config-version: 2 ## Definition Specify your `dbt_project.yml` as using the v2 structure. - - -* `v0.17.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details. -* `v1.5.0`: This configuration was made optional. - - + This configuration is optional. ## Default Without this configuration, dbt will assume your `dbt_project.yml` uses the version 1 syntax, which was deprecated in dbt v0.19.0. 
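Circling back to the programmatic-invocations change earlier in this diff, here is a minimal sketch of the `dbtRunner` entry point described there, assuming dbt-core 1.5 or later (the selector value and printed fields are illustrative, not prescribed by the doc):

```python
from dbt.cli.main import dbtRunner, dbtRunnerResult

# create a runner and invoke a command, just as you would on the CLI
dbt = dbtRunner()
res: dbtRunnerResult = dbt.invoke(["run", "--select", "tag:nightly"])

# the three attributes described above: success, result, exception
if res.success:
    for r in res.result:  # result type varies by command; `run` yields node results
        print(f"{r.node.name}: {r.status}")
else:
    print(res.exception)
```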
diff --git a/website/docs/reference/project-configs/log-path.md b/website/docs/reference/project-configs/log-path.md index daab17c5f10..29cad35d120 100644 --- a/website/docs/reference/project-configs/log-path.md +++ b/website/docs/reference/project-configs/log-path.md @@ -47,12 +47,21 @@ The precedence order is: CLI flag > env var > `dbt_project.yml` ## Examples -### Write logs to a subdirectory named `dbt_logs` instead of `logs` +### Specify subdirectory using the project config file ```yml log-path: dbt_logs ``` - + + + +### Specify subdirectory from the command line + +```bash +dbt --log-path dbt_logs run +``` + + diff --git a/website/docs/reference/project-configs/model-paths.md b/website/docs/reference/project-configs/model-paths.md index 2129747af27..a0652432787 100644 --- a/website/docs/reference/project-configs/model-paths.md +++ b/website/docs/reference/project-configs/model-paths.md @@ -2,11 +2,6 @@ datatype: [directorypath] default_value: [models] --- - - -- **v1.0.0:** The config `source-paths` has been deprecated in favor of `model-paths`. - - diff --git a/website/docs/reference/project-configs/on-run-start-on-run-end.md b/website/docs/reference/project-configs/on-run-start-on-run-end.md index 2c5cde4c0c2..e1a3d7b761a 100644 --- a/website/docs/reference/project-configs/on-run-start-on-run-end.md +++ b/website/docs/reference/project-configs/on-run-start-on-run-end.md @@ -4,6 +4,8 @@ description: "Read this guide to understand the on-run-start and on-run-end conf datatype: sql-statement | [sql-statement] --- +import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md'; + ```yml @@ -15,14 +17,8 @@ on-run-end: sql-statement | [sql-statement] ## Definition -A SQL statement (or list of SQL statements) to be run at the start, or end, of the following commands: -- `dbt run` -- `dbt test` -- `dbt seed` -- `dbt snapshot` -- `dbt build` -- `dbt compile` -- `dbt docs generate` + +A SQL statement (or list of SQL statements) to be run at the start or end of the following commands: `on-run-start` and `on-run-end` hooks can also call macros that return SQL statements @@ -33,34 +29,6 @@ A SQL statement (or list of SQL statements) to be run at the start, or end, of t - - -### Grant privileges at the end of a run - - - -```yml -on-run-end: "grant select on all tables in schema {{ target.schema }} group transformer" - -``` - - - -### Grant multiple privileges at the end of a run - - - -```yml -on-run-end: - - "grant usage on schema {{ target.schema }} to group reporter" - - "grant select on all tables in schema {{ target.schema }} group reporter" - -``` - - - - - ### Grant privileges on all schemas that dbt uses at the end of a run This leverages the [schemas](/reference/dbt-jinja-functions/schemas) variable that is only available in an `on-run-end` hook. diff --git a/website/docs/reference/project-configs/packages-install-path.md b/website/docs/reference/project-configs/packages-install-path.md index 98142305357..157c630fd36 100644 --- a/website/docs/reference/project-configs/packages-install-path.md +++ b/website/docs/reference/project-configs/packages-install-path.md @@ -3,12 +3,6 @@ datatype: directorypath default_value: dbt_packages --- - - -- **v1.0.0:** The default config has changed from `modules-path` to `packages-install-path` with a new default value of `dbt_packages`. 
- - - ```yml diff --git a/website/docs/reference/project-configs/query-comment.md b/website/docs/reference/project-configs/query-comment.md index 4d72bd4fcff..b1a73605e55 100644 --- a/website/docs/reference/project-configs/query-comment.md +++ b/website/docs/reference/project-configs/query-comment.md @@ -30,14 +30,6 @@ A string to inject as a comment in each query that dbt runs against your databas The `query-comment` configuration can also call a macro that returns a string. - - -* `v0.15.0`: The `query-comment` configuration was introduced -* `v0.16.1`: Dictionary syntax introduced to allow comments to be appended -* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels - - - ## Default By default, dbt will insert a comment at the top of your query containing the information including the dbt version, profile and target names, and node ids for the resources it runs. For example: @@ -149,13 +141,6 @@ select ... ### BigQuery: include query comment items as job labels - - - -* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels - - - If `query-comment.job-label` is set to true, dbt will include the query comment items, if a dictionary, or the comment string, as job labels on the query it executes. These will be included in addition to labels specified in the [BigQuery-specific config](/reference/project-configs/query-comment#bigquery-include-query-comment-items-as-job-labels). diff --git a/website/docs/reference/project-configs/quoting.md b/website/docs/reference/project-configs/quoting.md index 92968ace1bd..821b920188c 100644 --- a/website/docs/reference/project-configs/quoting.md +++ b/website/docs/reference/project-configs/quoting.md @@ -28,13 +28,6 @@ Note that for BigQuery quoting configuration, `database` and `schema` should be ::: - - -* `v0.10.1`: This configuration was introduced with a default value of `true` for each adapter. -* `v0.11.0`: The default quoting config on Snowflake changed from `true` to `false` - - - ## Default The default values vary by database. diff --git a/website/docs/reference/project-configs/require-dbt-version.md b/website/docs/reference/project-configs/require-dbt-version.md index 892495dde45..85a502bff60 100644 --- a/website/docs/reference/project-configs/require-dbt-version.md +++ b/website/docs/reference/project-configs/require-dbt-version.md @@ -19,12 +19,6 @@ When you set this configuration, dbt sends a helpful error message for any user If this configuration is not specified, no version check will occur. - - -* `v0.13.0`: This configuration was introduced - - - :::info YAML Quoting This configuration needs to be interpolated by the YAML parser as a string. As such, you should quote the value of the configuration, taking care to avoid whitespace. For example: diff --git a/website/docs/reference/project-configs/seed-paths.md b/website/docs/reference/project-configs/seed-paths.md index 92f7c5aa91f..614bda62cd2 100644 --- a/website/docs/reference/project-configs/seed-paths.md +++ b/website/docs/reference/project-configs/seed-paths.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [data] --- - - -- **v1.0.0:** The config `data-paths` has been deprecated in favor of `seed-paths`. 
- - - ```yml diff --git a/website/docs/reference/project-configs/snapshot-paths.md b/website/docs/reference/project-configs/snapshot-paths.md index a623d48b20f..81b2759609d 100644 --- a/website/docs/reference/project-configs/snapshot-paths.md +++ b/website/docs/reference/project-configs/snapshot-paths.md @@ -14,12 +14,6 @@ snapshot-paths: [directorypath] ## Definition Optionally specify a custom list of directories where [snapshots](/docs/build/snapshots) are located. Note that you cannot co-locate models and snapshots. - - -* `v0.14.0`: Snapshots were introduced - - - ## Default By default, dbt will search for snapshots in the `snapshots` directory, i.e. `snapshot-paths: ["snapshots"]` diff --git a/website/docs/reference/project-configs/target-path.md b/website/docs/reference/project-configs/target-path.md index 54458efe512..fddc5a93c5e 100644 --- a/website/docs/reference/project-configs/target-path.md +++ b/website/docs/reference/project-configs/target-path.md @@ -48,12 +48,22 @@ The precedence order is: CLI flag > env var > `dbt_project.yml` ## Examples -### Use a subdirectory named `compiled` for compiled files +### Specify subdirectory using the project config file ```yml -target-path: "compiled" +target-path: "compiled_files" ``` + + + +### Specify subdirectory from the command line + +```bash +dbt run --target-path compiled_files +``` + + \ No newline at end of file diff --git a/website/docs/reference/project-configs/test-paths.md b/website/docs/reference/project-configs/test-paths.md index e3f3cd2ccce..e3d0e0b76fa 100644 --- a/website/docs/reference/project-configs/test-paths.md +++ b/website/docs/reference/project-configs/test-paths.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [test] --- - - -* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory - - - ```yml diff --git a/website/docs/reference/project-configs/version.md b/website/docs/reference/project-configs/version.md index 4c128727445..890ad8542a7 100644 --- a/website/docs/reference/project-configs/version.md +++ b/website/docs/reference/project-configs/version.md @@ -1,17 +1,24 @@ --- datatype: version required: True +keyword: project version, project versioning, dbt project versioning --- - +import VersionsCallout from '/snippets/_version-callout.md'; -dbt projects have two distinct types of the `version` tags. This field has a different meaning depending on its location. + + + +dbt projects have two distinct types of `version` tags. This field has a different meaning depending on its location. ## `dbt_project.yml` versions -The version tag in a `dbt_project` file represents the version of your dbt project. Starting in version 1.5, `version` in the `dbt_project.yml` is an *optional parameter*. If specified, the version must be in a [semantic version](https://semver.org/) format, e.g. `1.0.0`. The default value if not specified is `None`. +The version tag in a `dbt_project` file represents the version of your dbt project. + +Starting in dbt version 1.5, `version` in the `dbt_project.yml` is an *optional parameter*. If used, the version must be in a [semantic version](https://semver.org/) format, such as `1.0.0`. The default value is `None` if not specified. For users on dbt version 1.4 or lower, this tag is required, though it isn't currently used meaningfully by dbt. For more on Core versions, see [About dbt Core versions](/docs/dbt-versions/core). 
+ ```yml @@ -24,9 +31,9 @@ version: version A version tag in a `.yml` property file provides the control tag, which informs how dbt processes property files. -Starting from version 1.5, dbt will no longer require this configuration in your resource `.yml` files. If you want to know more about why this tag was previously required, you can refer to the [property file FAQs](reference/configs-and-properties#faqs). +Starting from version 1.5, dbt will no longer require this configuration in your resource `.yml` files. If you want to know more about why this tag was previously required, you can refer to the [FAQs](#faqs). For users on dbt version 1.4 or lower, this tag is required, -For more on property files, see their general [documentation](reference/configs-and-properties#where-can-i-define-properties) on the same page. +For more on property files, see their general [documentation](/reference/configs-and-properties#where-can-i-define-properties) on the same page. +## FAQS - - - - -dbt projects have two distinct types of `version` tags. This field has a different meaning depending on its location. - -## `dbt_project.yml` versions - -The version tag in a `dbt_project` file represents the version of your dbt project and **is a required parameter**. However, it isn't currently used in a meaningful way by dbt. The version must follow a [semantic version](https://semver.org/) format, such as 1.0.0. For more information about dbt Core versions, refer to [About dbt Core versions](/docs/dbt-versions/core). - - -```yml -version: version -``` - - - -## `.yml` property file versions - -A version tag in a `.yml` property file provides the control tag, which informs how dbt processes property files. For more on why we require this tag, see property file [FAQs](reference/configs-and-properties#faqs). - -For more on property files, see their general [documentation](reference/configs-and-properties#where-can-i-define-properties) on the same page. - - - -```yml -version: 2 # Only 2 is accepted by current and recent versions of dbt. - -models: - ... -``` - - - - + diff --git a/website/docs/reference/references-overview.md b/website/docs/reference/references-overview.md index 16afd01607c..91a228b6c3e 100644 --- a/website/docs/reference/references-overview.md +++ b/website/docs/reference/references-overview.md @@ -4,6 +4,8 @@ id: "references-overview" sidebar_label: "About References" description: "Connect dbt to any data platform in dbt Cloud or dbt Core, using a dedicated adapter plugin" hide_table_of_contents: true +pagination_next: null +pagination_prev: null --- The References section contains reference materials for developing with dbt, which includes dbt Cloud and dbt Core. @@ -49,9 +51,27 @@ Learn how to add more configurations to your dbt project or adapter, use propert icon="computer"/> + + + + + + diff --git a/website/docs/reference/resource-configs/access.md b/website/docs/reference/resource-configs/access.md new file mode 100644 index 00000000000..da50e48d2f0 --- /dev/null +++ b/website/docs/reference/resource-configs/access.md @@ -0,0 +1,97 @@ +--- +resource_types: [models] +datatype: access +--- + + + +```yml +version: 2 + +models: + - name: model_name + access: private | protected | public +``` + + + + + +Access modifiers may be applied to models one-by-one in YAML properties. In v1.5 and v1.6, you are unable to configure `access` for multiple models at once. Upgrade to v1.7 for additional configuration options. 
A group or subfolder contains models with varying access levels, so when you designate a model with `access: public`, make sure you intend for this behavior. + + + + + +You can apply access modifiers in config files, including `the dbt_project.yml`, or to models one-by-one in YAML properties. Applying access configs to a subfolder modifies the default for all models in that subfolder, so make sure you intend for this behavior. When setting individual model access, a group or subfolder might contain a variety of access levels, so when you designate a model with `access: public` make sure you intend for this behavior. + +There are multiple approaches to configuring access: + +In the model configs of `dbt_project.yml``: + +```yaml +models: + - name: my_public_model + access: public # Older method, still supported + +``` +Or (but not both) + +```yaml +models: + - name: my_public_model + config: + access: public # newly supported in v1.7 + +``` + +In a subfolder: +```yaml +models: + my_project_name: + subfolder_name: + +group: + +access: private # sets default for all models in this subfolder +``` + +In the model.sql file: + +```sql +-- models/my_public_model.sql + +{{ config(access = "public") }} + +select ... +``` + + + +## Definition +The access level of the model you are declaring properties for. + +Some models (not all) are designed to be referenced through the [ref](/reference/dbt-jinja-functions/ref) function across [groups](/docs/build/groups). + +| Access | Referenceable by | +|-----------|-------------------------------| +| private | same group | +| protected | same project/package | +| public | any group, package or project | + +If you try to reference a model outside of its supported access, you will see an error: + +```shell +dbt run -s marketing_model +... +dbt.exceptions.DbtReferenceError: Parsing Error + Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, + which is not allowed because the referenced node is private to the finance group. +``` + +## Default + +By default, all models are "protected." This means that other models in the same project can reference them. + +## Related docs + +* [Model Access](/docs/collaborate/govern/model-access#groups) +* [Group configuration](/reference/resource-configs/group) diff --git a/website/docs/reference/resource-configs/alias.md b/website/docs/reference/resource-configs/alias.md index 40da45ebcd1..6b7588ecaf7 100644 --- a/website/docs/reference/resource-configs/alias.md +++ b/website/docs/reference/resource-configs/alias.md @@ -1,33 +1,50 @@ --- resource_types: [models, seeds, snapshots, tests] -description: "Read this guide to understand the alias configuration in dbt." +description: "Aliasing a resource lets you give it a custom name in the database instead of using the filename." datatype: string --- -:::caution Heads up! -This is a work in progress document. While this configuration applies to multiple resource types, the documentation has only been written for seeds. -::: + + -## Definition +Specify a custom alias for a model in your `dbt_project.yml` file or config block. -Optionally specify a custom alias for a [model](/docs/build/models) or [seed](/docs/build/seeds). +For example, if you have a model that calculates `sales_total` and want to give it a more user-friendly alias, you can alias it like this: -When dbt creates a relation (/) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. 
`analytics.finance.payments` + -The standard behavior of dbt is: -* If a custom alias is _not_ specified, the identifier of the relation is the resource name (i.e. the filename). -* If a custom alias is specified, the identifier of the relation is the `{{ alias }}` value. +```yml +models: + your_project: + sales_total: + +alias: sales_dashboard +``` + -To learn more about changing the way that dbt generates a relation's `identifier`, read [Using Aliases](/docs/build/custom-aliases). +This would return `analytics.finance.sales_dashboard` in the database, instead of the default `analytics.finance.sales_total`. + + + -## Usage -### Seeds -Configure a seed's alias in your `dbt_project.yml` file. +Configure a seed's alias in your `dbt_project.yml` file or config block. -The seed at `seeds/country_codes.csv` will be built as a named `country_mappings`. +For example, if you have a seed that represents `product_categories` and want to alias it as `categories_data`, you would alias like this: + + + +```yml +seeds: + your_project: + product_categories: + +alias: categories_data +``` + +This would return the name `analytics.finance.categories_data` in the database. + +In the following second example, the seed at `seeds/country_codes.csv` will be built as a named `country_mappings`. @@ -40,3 +57,68 @@ seeds: ``` + + + + + + +Configure a seed's alias in your `dbt_project.yml` file or config block. + +For example, if you have a snapshot that represents `your_snapshot` and want to alias it as `updated_at_id`, you would alias like this: + + + +```yml +snapshots: + - name: your_snapshot + config: + target_database: analytics + target_schema: finance + unique_key: id + strategy: timestamp + updated_at: updated_at + alias: your_snapshot +``` + +This would return the name `analytics.finance.your_snapshot` in the database. + + + + + + +Configure a test's alias in your `schema.yml` file or config block. + +For example, to add a unique test to the `order_id` column and give it an alias `unique_order_id_test` to identify this specific test, you would alias like this: + + + +```yml +models: + - name: orders + columns: + - name: order_id + tests: + - unique + alias: unique_order_id_test +``` + +When using `--store-failures`, this would return the name `analytics.finance.orders_order_id_unique_order_id_test` in the database. + + + + + +## Definition + +Optionally specify a custom alias for a [model](/docs/build/models), [tests](/docs/build/tests), [snapshots](/docs/build/snapshots), or [seed](/docs/build/seeds). + +When dbt creates a relation (/) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments` + +The standard behavior of dbt is: +* If a custom alias is _not_ specified, the identifier of the relation is the resource name (i.e. the filename). +* If a custom alias is specified, the identifier of the relation is the `{{ alias }}` value. + +To learn more about changing the way that dbt generates a relation's `identifier`, read [Using Aliases](/docs/build/custom-aliases). + diff --git a/website/docs/reference/resource-configs/bigquery-configs.md b/website/docs/reference/resource-configs/bigquery-configs.md index c425fd5b94b..ffbaa37c059 100644 --- a/website/docs/reference/resource-configs/bigquery-configs.md +++ b/website/docs/reference/resource-configs/bigquery-configs.md @@ -21,26 +21,6 @@ This will allow you to read and write from multiple BigQuery projects. 
Same for ### Partition clause - - -Before dbt v0.16.0, the `partition_by` configuration was supplied as string. While -the string specification syntax is still supported in dbt v0.16.0, it has been -deprecated and will be removed in a future release. **Note:** partitioning configs -using a range bucket *must* be supplied using the dictionary-style configuration as of -dbt v0.16.0. - -Example usage for versions of dbt < 0.16.0: - -```sql --- Partitioning by a timestamp field -{{ config( materialized='table', partition_by="date(created_at)" ) }} - --- Partitioning by a date field -{{ config( materialized='table', partition_by="created_date" ) }} -``` - - - BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#pruning_limiting_partitions) when partitions are filtered using literal values (so selecting partitions using a won't improve performance). The `partition_by` config can be supplied as a dictionary with the following format: @@ -61,7 +41,6 @@ The `partition_by` config can be supplied as a dictionary with the following for ``` #### Partitioning by a date or timestamp -Partitioning by hour, month or year is new in v0.19.0 When using a `datetime` or `timestamp` column to partition data, you can create partitions with a granularity of hour, day, month, or year. A `date` column supports granularity of day, month and year. Daily partitioning is the default for all column types. @@ -266,12 +245,6 @@ as ( #### Additional partition configs - - - - **v0.20.0:** Introduced `require_partition_filter` and `partition_expiration_days` - - - If your model has `partition_by` configured, you may optionally specify two additional configurations: - `require_partition_filter` (boolean): If set to `true`, anyone querying this model _must_ specify a partition filter, otherwise their query will fail. This is recommended for very large tables with obvious partitioning schemes, such as event streams grouped by day. Note that this will affect other dbt models or tests that try to select from this model, too. @@ -367,11 +340,7 @@ dbt supports the specification of BigQuery labels for the tables and BigQuery key-value pair entries for labels larger than 63 characters are truncated. **Configuring labels in a model file** @@ -445,7 +414,7 @@ models: columns: - name: field policy_tags: - - 'projects//locations//taxonomies//policyTags/' + - 'projects//locations//taxonomies//policyTags/' ``` @@ -489,12 +458,6 @@ strategy is selected. ### The `insert_overwrite` strategy - - - - **v0.16.0:** Introduced `insert_overwrite` incremental strategy - - - The `insert_overwrite` strategy generates a merge statement that replaces entire partitions in the destination table. **Note:** this configuration requires that the model is configured with a [Partition clause](#partition-clause). The `merge` statement that dbt generates @@ -587,12 +550,6 @@ _today_ and _yesterday_ every day that it is run. It is the fastest and cheapest way to incrementally update a table using dbt. 
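A rough sketch of that static approach (the model name, partition column, and partition values are assumptions), pinning the partitions to replace in the config:

```sql
-- models/sessions.sql — column and source names assumed
{% set partitions_to_replace = [
    "timestamp(current_date)",
    "timestamp(date_sub(current_date, interval 1 day))"
] %}

{{ config(
    materialized = 'incremental',
    incremental_strategy = 'insert_overwrite',
    partition_by = {'field': 'session_start', 'data_type': 'timestamp'},
    partitions = partitions_to_replace
) }}

select * from {{ ref('events') }}

{% if is_incremental() %}
-- only scan (and replace) the partitions listed above
where timestamp_trunc(session_start, day) in ({{ partitions_to_replace | join(',') }})
{% endif %}
```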
If we wanted this to run more dynamically— let’s say, always for the past 3 days—we could leverage dbt’s baked-in [datetime macros](https://github.com/dbt-labs/dbt-core/blob/dev/octavius-catto/core/dbt/include/global_project/macros/etc/datetime.sql) and write a few of our own. - - - - **v0.19.0:** With the advent of truncated timestamp partitions in BigQuery, `timestamp`-type partitions are now treated as timestamps instead of dates for the purposes of filtering. Update `partitions_to_replace` accordingly. - - - Think of this as "full control" mode. You must ensure that expressions or literal values in the the `partitions` config have proper quoting when templated, and that they match the `partition_by.data_type` (`timestamp`, `datetime`, `date`, or `int64`). Otherwise, the filter in the incremental `merge` statement will raise an error. #### Dynamic partitions @@ -685,7 +642,6 @@ from {{ ref('events') }} ## Controlling table expiration -New in v0.18.0 By default, dbt-created tables never expire. You can configure certain model(s) to expire after a set number of hours by setting `hours_to_expiration`. @@ -721,8 +677,6 @@ select ... ## Authorized Views -New in v0.18.0 - If the `grant_access_to` config is specified for a model materialized as a view, dbt will grant the view model access to select from the list of datasets provided. See [BQ docs on authorized views](https://cloud.google.com/bigquery/docs/share-access-views) @@ -764,48 +718,3 @@ Views with this configuration will be able to select from objects in `project_1. The `grant_access_to` config is not thread-safe when multiple views need to be authorized for the same dataset. The initial `dbt run` operation after a new `grant_access_to` config is added should therefore be executed in a single thread. Subsequent runs using the same configuration will not attempt to re-apply existing access grants, and can make use of multiple threads. - - - -## Materialized view - -The BigQuery adapter supports [materialized views](https://cloud.google.com/bigquery/docs/materialized-views-intro) and refreshes them for every subsequent `dbt run` you execute. For more information, see [Refresh Materialized Views](https://cloud.google.com/bigquery/docs/materialized-views-manage#refresh) in the Google docs. - -Materialized views support the optional configuration `on_configuration_change` with the following values: -- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. 
The following changes can be applied without the need to rebuild the materialized view: - - enable_refresh - - refresh_interval_minutes - - max_staleness -- `skip` — allows runs to continue while also providing a warning that the model was skipped -- `fail` — forces runs to fail if a change is detected in a materialized view - -You can create a materialized view by editing _one_ of these files: -- the SQL file for your model -- the `dbt_project.yml` configuration file - -The following examples create a materialized view: - - - -```sql -{{ - config( - materialized = 'materialized_view', - on_configuration_change = 'apply', - ) -}} -``` - - - - - - -```yaml -models: - path: - materialized: materialized_view -``` - - - diff --git a/website/docs/reference/resource-configs/contract.md b/website/docs/reference/resource-configs/contract.md index 66072fc8b89..ccc10099a12 100644 --- a/website/docs/reference/resource-configs/contract.md +++ b/website/docs/reference/resource-configs/contract.md @@ -23,11 +23,34 @@ When the `contract` configuration is enforced, dbt will ensure that your model's This is to ensure that the people querying your model downstream—both inside and outside dbt—have a predictable and consistent set of columns to use in their analyses. Even a subtle change in data type, such as from `boolean` (`true`/`false`) to `integer` (`0`/`1`), could cause queries to fail in surprising ways. + + The `data_type` defined in your YAML file must match a data type your data platform recognizes. dbt does not do any type aliasing itself. If your data platform recognizes both `int` and `integer` as corresponding to the same type, then they will return a match. -When dbt is comparing data types, it will not compare granular details such as size, precision, or scale. We don't think you should sweat the difference between `varchar(256)` and `varchar(257)`, because it doesn't really affect the experience of downstream queriers. If you need a more-precise assertion, it's always possible to accomplish by [writing or using a custom test](/guides/best-practices/writing-custom-generic-tests). + + + + +dbt uses built-in type aliasing for the `data_type` defined in your YAML. For example, you can specify `string` in your contract, and on Postgres/Redshift, dbt will convert it to `text`. If dbt doesn't recognize the `data_type` name among its known aliases, it will pass it through as-is. This is enabled by default, but you can opt-out by setting `alias_types` to `false`. + +Example for disabling: + +```yml + +models: + - name: my_model + config: + contract: + enforced: true + alias_types: false # true by default + +``` + + + +When dbt compares data types, it will not compare granular details such as size, precision, or scale. We don't think you should sweat the difference between `varchar(256)` and `varchar(257)`, because it doesn't really affect the experience of downstream queriers. You can accomplish a more-precise assertion by [writing or using a custom test](/best-practices/writing-custom-generic-tests). -That said, on certain data platforms, you will need to specify a varchar size or numeric scale if you do not want it to revert to the default. This is most relevant for the `numeric` type on Snowflake, which defaults to a precision of 38 and a scale of 0 (zero digits after the decimal, such as rounded to an integer). To avoid this implicit coercion, specify your `data_type` with a nonzero scale, like `numeric(38, 6)`. 
+Note that you need to specify a varchar size or numeric scale, otherwise dbt relies on default values. For example, if a `numeric` type defaults to a precision of 38 and a scale of 0, then the numeric column stores 0 digits to the right of the decimal (it only stores whole numbers), which might cause it to fail contract enforcement. To avoid this implicit coercion, specify your `data_type` with a nonzero scale, like `numeric(38, 6)`. dbt Core 1.7 and higher provides a warning if you don't specify precision and scale when providing a numeric data type. ## Example @@ -47,6 +70,8 @@ models: - type: not_null - name: customer_name data_type: string + - name: non_integer + data_type: numeric(38,3) ``` @@ -95,32 +120,3 @@ Imagine: - The result is a delta between the yaml-defined contract, and the actual table in the database - which means the contract is now incorrect! Why `append_new_columns`, rather than `sync_all_columns`? Because removing existing columns is a breaking change for contracted models! - -### Detecting breaking changes - -When you use the `state:modified` selection method in Slim CI, dbt will detect changes to model contracts, and raise an error if any of those changes could be breaking for downstream consumers. - -Breaking changes include: -- Removing an existing column -- Changing the `data_type` of an existing column -- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) - -``` -Breaking Change to Contract Error in model sometable (models/sometable.sql) - While comparing to previous project state, dbt detected a breaking change to an enforced contract. - - The contract's enforcement has been disabled. - - Columns were removed: - - order_name - - Columns with data_type changes: - - order_id (number -> int) - - Consider making an additive (non-breaking) change instead, if possible. - Otherwise, create a new model version: https://docs.getdbt.com/docs/collaborate/govern/model-versions -``` - -Additive changes are **not** considered breaking: -- Adding a new column to a contracted model -- Adding new `constraints` to an existing column in a contracted model diff --git a/website/docs/reference/resource-configs/database.md b/website/docs/reference/resource-configs/database.md index 9b7cd1b39d3..7d91358ff01 100644 --- a/website/docs/reference/resource-configs/database.md +++ b/website/docs/reference/resource-configs/database.md @@ -2,45 +2,86 @@ sidebar_label: "database" resource_types: [models, seeds, tests] datatype: string -description: "Read this guide to understand the database configuration in dbt." +description: "Override the default database when dbt creates resources in your data platform." --- -:::caution Heads up! -This is a work in progress document. While this configuration applies to multiple resource types, the documentation has only been written for seeds. + + -::: +Specify a custom database for a model in your `dbt_project.yml` file. -## Definition +For example, if you have a model that you want to load into a database other than the target database, you can configure it like this: -Optionally specify a custom database for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)). + -When dbt creates a relation (/) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. 
`analytics.finance.payments` +```yml +models: + your_project: + sales_metrics: + +database: reporting +``` + -The standard behavior of dbt is: -* If a custom database is _not_ specified, the database of the relation is the target database (`{{ target.database }}`). -* If a custom database is specified, the database of the relation is the `{{ database }}` value. +This would result in the generated relation being located in the `reporting` database, so the full relation name would be `reporting.finance.sales_metrics` instead of the default target database. + -To learn more about changing the way that dbt generates a relation's `database`, read [Using Custom Databases](/docs/build/custom-databases) + - +Configure a database in your `dbt_project.yml` file. -* `v0.13.0`: Support for the `database` config is added -* `v0.16.0`: The `generate_database_name` macro was added to control how the `database` config is used by dbt +For example, to load a seed into a database called `staging` instead of the target database, you can configure it like this: - - -## Usage -### Load seeds into the RAW database ```yml seeds: - +database: RAW + your_project: + product_categories: + +database: staging +``` + +This would result in the generated relation being located in the `staging` database, so the full relation name would be `staging.finance.product_categories`. + + + + + +Configure a database in your `dbt_project.yml` file. + +For example, to load a test into a database called `reporting` instead of the target database, you can configure it like this: + + + +```yml +tests: + - my_not_null_test: + column_name: order_id + type: not_null + +database: reporting ``` +This would result in the generated relation being located in the `reporting` database, so the full relation name would be `reporting.finance.my_not_null_test`. + + + + + +## Definition + +Optionally specify a custom database for a [model](/docs/build/sql-models), [seed](/docs/build/seeds), or [tests](/docs/build/tests). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)). + +When dbt creates a relation (/) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments` + +The standard behavior of dbt is: +* If a custom database is _not_ specified, the database of the relation is the target database (`{{ target.database }}`). +* If a custom database is specified, the database of the relation is the `{{ database }}` value. + +To learn more about changing the way that dbt generates a relation's `database`, read [Using Custom Databases](/docs/build/custom-databases) + + ## Warehouse specific information * BigQuery: `project` and `database` are interchangeable -* Redshift: Cross-database queries are not possible in Redshift. As such, dbt will return a Database Error if you use 'Database A' for a seed file and try to `{{ ref() }}` that seed file (or its database object) in a model using 'Database B'. This error would only be found at runtime. 
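As a companion to the YAML examples above, a minimal sketch of the same override from a model's config block (the database and model names are assumptions):

```sql
-- models/sales_metrics.sql — database name assumed
{{ config(database='reporting') }}

select 1 as id
```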
+ diff --git a/website/docs/reference/resource-configs/databricks-configs.md b/website/docs/reference/resource-configs/databricks-configs.md index 41b0bfcc5ea..65c6607cdcd 100644 --- a/website/docs/reference/resource-configs/databricks-configs.md +++ b/website/docs/reference/resource-configs/databricks-configs.md @@ -7,20 +7,41 @@ id: "databricks-configs" When materializing a model as `table`, you may include several optional configs that are specific to the dbt-databricks plugin, in addition to the standard [model configs](/reference/model-configs). -| Option | Description | Required? | Example | -|---------|------------------------------------------------------------------------------------------------------------------------------------|-------------------------|--------------------------| -| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta`| -| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` | -| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | -| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | -| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | + + +| Option | Description | Required? | Example | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|----------------| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta` | +| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` | +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | + + + + + + +| Option | Description | Required? | Model Support | Example | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|---------------|----------------| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` | +| location_root | The created table uses the specified directory to store its data. 
The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` | +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | SQL, Python | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` | + + + ## Incremental models -dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values: +dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of four values: - **`append`** (default): Insert new records without updating or overwriting any existing data. - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the with new data. If no `partition_by` is specified, overwrite the entire table with new data. - - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`; update old records, insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`, updating old records, and inserting new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + - **`replace_where`** (Delta file format only): Match records based on `incremental_predicates`, replacing all records that match the predicates from the existing table with records matching the predicates from the new data. (If no `incremental_predicates` are specified, all new data is inserted, similar to `append`.) Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. @@ -120,7 +141,7 @@ select date_day, count(*) as users -from events +from new_events group by 1 ``` @@ -247,6 +268,96 @@ merge into analytics.merge_incremental as DBT_INTERNAL_DEST
      +### The `replace_where` strategy + +The `replace_where` incremental strategy requires: +- `file_format: delta` +- Databricks Runtime 12.0 and above + +dbt will run an [atomic `replace where` statement](https://docs.databricks.com/en/delta/selective-overwrite.html#arbitrary-selective-overwrite-with-replacewhere) which selectively overwrites data matching one or more `incremental_predicates` specified as a string or array. Only rows matching the predicates will be inserted. If no `incremental_predicates` are specified, dbt will perform an atomic insert, as with `append`. + +:::caution + +`replace_where` inserts data into columns in the order provided, rather than by column name. If you reorder columns and the data is compatible with the existing schema, you may silently insert values into an unexpected column. If the incoming data is incompatible with the existing schema, you will instead receive an error. + +::: + + + + + + +```sql +{{ config( + materialized='incremental', + file_format='delta', + incremental_strategy = 'replace_where' + incremental_predicates = 'user_id >= 10000' # Never replace users with ids < 10000 +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + + + + + + + +```sql +create temporary view replace_where__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + user_id, + max(date_day) as last_seen + + from events + group by 1 + +; + +insert into analytics.replace_where_incremental + replace where user_id >= 10000 + table `replace_where__dbt_tmp` +``` + + + + + + + ## Persisting model descriptions Relation-level docs persistence is supported in dbt v0.17.0. For more @@ -280,3 +391,73 @@ snapshots: ``` + + + +## Materialized views and streaming tables + +Starting with version 1.6.0, the dbt-databricks adapter supports [materialized views](https://docs.databricks.com/en/sql/user/materialized-views.html) and [streaming tables](https://docs.databricks.com/en/sql/load-data-streaming-table.html), as alternatives to incremental tables that are powered by [Delta Live Tables](https://docs.databricks.com/en/delta-live-tables/index.html). +See [What are Delta Live Tables?](https://docs.databricks.com/en/delta-live-tables/index.html#what-are-delta-live-tables-datasets) for more information and use cases. +These features are still in preview, and the support in the dbt-databricks adapter should, for now, be considered _experimental_. +In order to adopt these materialization strategies, you will need a workspace that is enabled for Unity Catalog and serverless SQL Warehouses. + + + +```sql +{{ config( + materialized = 'materialized_view' + ) }} +``` + + + +or + + + +```sql +{{ config( + materialized = 'streaming_table' + ) }} +``` + + + +When dbt detects a pre-existing relation of one of these types, it issues a `REFRESH` [command](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-refresh-full.html). + +### Limitations + +As mentioned above, support for these materializations in the Databricks adapter is still limited. +At this time the following configuration options are not available: + +* Specifying a refresh schedule for these materializations +* Specifying `on_configuration_change` settings. 
+ +Additionally, if you change the model definition of your materialized view or streaming table, you will need to drop the materialization in your warehouse directly before running dbt again; otherwise, you will get a refresh error. + +We plan to address these limitations during the 1.7.x timeframe. + + +## Setting table properties +[Table properties](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-tblproperties.html) can be set with your configuration for tables or views using `tblproperties`: + + + +```sql +{{ config( + tblproperties={ + 'delta.autoOptimize.optimizeWrite' : 'true', + 'delta.autoOptimize.autoCompact' : 'true' + } + ) }} +``` + + + +:::caution + +These properties are sent directly to Databricks without validation in dbt, so be thoughtful with how you use this feature. You will need to do a full refresh of incremental materializations if you change their `tblproperties`. + +::: + +One application of this feature is making `delta` tables compatible with `iceberg` readers using the [Universal Format](https://docs.databricks.com/en/delta/uniform.html). diff --git a/website/docs/reference/resource-configs/delimiter.md b/website/docs/reference/resource-configs/delimiter.md new file mode 100644 index 00000000000..58d6ba8344a --- /dev/null +++ b/website/docs/reference/resource-configs/delimiter.md @@ -0,0 +1,126 @@ +--- +resource_types: [seeds] +datatype: +default_value: "," +--- + +## Definition + +You can use this optional seed configuration to customize how you separate values in a [seed](/docs/build/seeds) by providing the one-character string. + +* The delimiter defaults to a comma when not specified. +* Explicitly set the `delimiter` configuration value if you want seed files to use a different delimiter, such as "|" or ";". + +:::info New in 1.7! + +Delimiter is new functionality available beginning with dbt Core v1.7. + +::: + + +## Usage + +Specify a delimiter in your `dbt_project.yml` file to customize the global separator for all seed values: + + + +```yml +seeds: + : + +delimiter: "|" # default project delimiter for seeds will be "|" + : + +delimiter: "," # delimiter for seeds in seed_subdirectory will be "," +``` + + + + +Or use a custom delimiter to override the values for a specific seed: + + + +```yml +version: 2 + +seeds: + - name: + config: + delimiter: "|" +``` + + + +## Examples +For a project with: + +* `name: jaffle_shop` in the `dbt_project.yml` file +* `seed-paths: ["seeds"]` in the `dbt_project.yml` file + +### Use a custom delimiter to override global values + +You can set a default behavior for all seeds with an exception for one seed, `seed_a`, which uses a comma: + + + +```yml +seeds: + jaffle_shop: + +delimiter: "|" # default delimiter for seeds in jaffle_shop project will be "|" + seed_a: + +delimiter: "," # delimiter for seed_a will be "," +``` + + + +Your corresponding seed files would be formatted like this: + + + +```text +col_a|col_b|col_c +1|2|3 +4|5|6 +... +``` + + + + + +```text +name,id +luna,1 +doug,2 +... +``` + + + +Or you can configure custom behavior for one seed. The `country_codes` uses the ";" delimiter: + + + +```yml +version: 2 + +seeds: + - name: country_codes + config: + delimiter: ";" +``` + + + +The `country_codes` seed file would be formatted like this: + + + +```text +country_code;country_name +US;United States +CA;Canada +GB;United Kingdom +... 
+``` + + diff --git a/website/docs/reference/resource-configs/docs.md b/website/docs/reference/resource-configs/docs.md index d94b975683d..c5e35dd64f4 100644 --- a/website/docs/reference/resource-configs/docs.md +++ b/website/docs/reference/resource-configs/docs.md @@ -17,10 +17,12 @@ default_value: {show: true} { label: 'Macros', value: 'macros', }, ] }> + + ```yml version: 2 @@ -28,7 +30,7 @@ models: - name: model_name docs: show: true | false - + node_color: "black" ``` @@ -52,9 +54,7 @@ seeds: - name: seed_name docs: show: true | false - ``` - @@ -70,9 +70,7 @@ snapshots: - name: snapshot_name docs: show: true | false - ``` - @@ -89,7 +87,6 @@ analyses: docs: show: true | false ``` - @@ -109,26 +106,20 @@ macros: - name: macro_name docs: show: true | false - ``` - +Also refer to [macro properties](/reference/macro-properties). + ## Definition -The docs field can be used to provide documentation-specific configuration to models. The only currently supported docs attribute is shown, which controls whether or not models are shown in the auto-generated documentation website. +The docs field can be used to provide documentation-specific configuration to models. It supports the doc attribute `show`, which controls whether or not models are shown in the auto-generated documentation website. It also supports `node_color` for some node types. **Note:** hidden models will still appear in the dbt DAG visualization but will be identified as "hidden.” - - -* `v0.16.0`: This property was added - - - ## Default The default value for `show` is `true`. @@ -173,7 +164,7 @@ models: ## Custom node colors -The `docs` attribute now supports `node_color` to customize the node color in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed. +The `docs` attribute now supports `node_color` to customize the display color of some node types in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed. `node_color` hiearchy: @@ -182,7 +173,7 @@ The `docs` attribute now supports `node_color` to customize the node color in th ## Examples -Add custom node colors to models within subdirectories based on hex codes or a plain color name. +Add custom `node_colors` to models that support it within subdirectories based on hex codes or a plain color name. ![Example](../../../../website/static/img/node_color_example.png) diff --git a/website/docs/reference/resource-configs/enabled.md b/website/docs/reference/resource-configs/enabled.md index 03d1598c931..552777c5c81 100644 --- a/website/docs/reference/resource-configs/enabled.md +++ b/website/docs/reference/resource-configs/enabled.md @@ -15,10 +15,22 @@ default_value: true { label: 'Sources', value: 'sources', }, { label: 'Metrics', value: 'metrics', }, { label: 'Exposures', value: 'exposures', }, + { label: 'Semantic models', value: 'semantic models', }, ] }> + + +```yml +models: + [](/reference/resource-configs/resource-path): + +enabled: true | false + +``` + + + ```sql @@ -34,10 +46,15 @@ select ... + + + + + ```yml -models: +seeds: [](/reference/resource-configs/resource-path): +enabled: true | false @@ -47,13 +64,12 @@ models: - - + ```yml -seeds: +snapshots: [](/reference/resource-configs/resource-path): +enabled: true | false @@ -61,10 +77,6 @@ seeds: - - - - ```sql @@ -82,10 +94,14 @@ select ... 
+ + + + ```yml -snapshots: +tests: [](/reference/resource-configs/resource-path): +enabled: true | false @@ -93,10 +109,6 @@ snapshots: - - - - ```sql @@ -124,17 +136,6 @@ select ... - - -```yml -tests: - [](/reference/resource-configs/resource-path): - +enabled: true | false - -``` - - - @@ -150,7 +151,6 @@ sources: - @@ -170,7 +170,6 @@ sources: - @@ -252,10 +251,45 @@ exposures: + + + + +Support for disabling semantic models has been added in dbt Core v1.7 + + + + + + + +```yaml +semantic-models: + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)enabled: true | false +``` + + + + + +```yaml +semantic_models: + - name: [] + [config](/reference/resource-properties/config): + enabled: true | false +``` + + + + + + + ## Definition -An optional configuration for disabling models, seeds, snapshots, and tests. +An optional configuration for enabling or disabling a resource. * Default: true diff --git a/website/docs/reference/resource-configs/grants.md b/website/docs/reference/resource-configs/grants.md index 8ef726788dc..3a65672fa5e 100644 --- a/website/docs/reference/resource-configs/grants.md +++ b/website/docs/reference/resource-configs/grants.md @@ -121,7 +121,7 @@ For example: ```yml models: - +grants: + +grants: # In this case the + is not optional, you must include it for your project to parse. select: ['user_a', 'user_b'] ``` @@ -243,6 +243,7 @@ models: - Databricks automatically enables `grants` on SQL endpoints. For interactive clusters, admins should enable grant functionality using these two setup steps in the Databricks documentation: - [Enable table access control for your workspace](https://docs.databricks.com/administration-guide/access-control/table-acl.html) - [Enable table access control for a cluster](https://docs.databricks.com/security/access-control/table-acls/table-acl.html) +- In order to grant `READ_METADATA` or `USAGE`, use [post-hooks](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook) diff --git a/website/docs/reference/resource-configs/group.md b/website/docs/reference/resource-configs/group.md index dd73d99edff..a71935013c4 100644 --- a/website/docs/reference/resource-configs/group.md +++ b/website/docs/reference/resource-configs/group.md @@ -16,6 +16,7 @@ This functionality is new in v1.5. { label: 'Tests', value: 'tests', }, { label: 'Analyses', value: 'analyses', }, { label: 'Metrics', value: 'metrics', }, + { label: 'Semantic models', value: 'semantic models', }, ] }> @@ -28,28 +29,29 @@ Support for grouping models was added in dbt Core v1.5 - - - + ```yml -version: 2 - models: - - name: model_name - group: finance + + [](resource-path): + +group: GROUP_NAME + ``` + - + ```yml +version: 2 + models: - [](resource-path): - +group: finance -``` + - name: MODEL_NAME + group: GROUP +``` @@ -58,7 +60,7 @@ models: ```sql {{ config( - group='finance' + group='GROUP_NAME' ) }} select ... @@ -67,6 +69,8 @@ select ... + + @@ -79,14 +83,12 @@ Support for grouping seeds was added in dbt Core v1.5 - - ```yml models: [](resource-path): - +group: finance + +group: GROUP_NAME ``` @@ -95,12 +97,14 @@ models: ```yml seeds: - - name: [] - group: finance + - name: [SEED_NAME] + group: GROUP_NAME ``` + + @@ -114,14 +118,12 @@ Support for grouping snapshots was added in dbt Core v1.5 - - ```yml snapshots: [](resource-path): - +group: finance + +group: GROUP_NAME ``` @@ -132,7 +134,7 @@ snapshots: {% snapshot [snapshot_name](snapshot_name) %} {{ config( - group='finance' + group='GROUP_NAME' ) }} select ... 
@@ -142,6 +144,8 @@ select ... + + @@ -155,14 +159,12 @@ Support for grouping tests was added in dbt Core v1.5 - - ```yml tests: [](resource-path): - +group: finance + +group: GROUP_NAME ``` @@ -177,7 +179,7 @@ version: 2 tests: - : config: - group: finance + group: GROUP_NAME ``` @@ -188,7 +190,7 @@ version: 2 {% test () %} {{ config( - group='finance' + group='GROUP_NAME' ) }} select ... @@ -203,12 +205,14 @@ select ... ```sql {{ config( - group='finance' + group='GROUP_NAME' ) }} ``` + + @@ -219,8 +223,8 @@ select ... version: 2 analyses: - - name: - group: finance + - name: ANALYSIS_NAME + group: GROUP_NAME ``` @@ -238,14 +242,12 @@ Support for grouping metrics was added in dbt Core v1.5 - - ```yaml metrics: [](resource-path): - [+](plus-prefix)group: finance + [+](plus-prefix)group: GROUP_NAME ``` @@ -256,19 +258,61 @@ metrics: version: 2 metrics: - - name: [] - group: finance + - name: [METRIC_NAME] + group: GROUP_NAME + +``` + + + + + + + + + + + + +Support for grouping semantic models has been added in dbt Core v1.7. + + + + + + + +```yaml + +semantic-models: + [](resource-path): + [+](plus-prefix)group: GROUP_NAME ``` + + +```yaml + +semantic_models: + - name: SEMANTIC_MODEL_NAME + group: GROUP_NAME + + +``` + + + + + ## Definition -An optional configuration for grouping models, analysis, snapshots, tests, and metrics. When a resource is grouped, dbt will allow it to reference private models within the same group. +An optional configuration for assigning a group to a resource. When a resource is grouped, dbt will allow it to reference private models within the same group. For more details on reference access between resources in groups, check out [model access](/docs/collaborate/govern/model-access#groups). diff --git a/website/docs/reference/resource-configs/invalidate_hard_deletes.md b/website/docs/reference/resource-configs/invalidate_hard_deletes.md index 3e9f13b738d..ba5b37c5d71 100644 --- a/website/docs/reference/resource-configs/invalidate_hard_deletes.md +++ b/website/docs/reference/resource-configs/invalidate_hard_deletes.md @@ -4,7 +4,6 @@ description: "Invalidate_hard_deletes - Read this in-depth guide to learn about datatype: column_name --- -New in v0.19.0 ```jinja2 diff --git a/website/docs/reference/resource-configs/materialize-configs.md b/website/docs/reference/resource-configs/materialize-configs.md index 1338647a2a6..6976aa84061 100644 --- a/website/docs/reference/resource-configs/materialize-configs.md +++ b/website/docs/reference/resource-configs/materialize-configs.md @@ -8,11 +8,9 @@ id: "materialize-configs" ### Clusters - -- **v1.2.0:** Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). +Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). - The default [cluster](https://materialize.com/docs/overview/key-concepts/#clusters) that is used to maintain materialized views or indexes can be configured in your [profile](/docs/core/connect-data-platform/profiles.yml) using the `cluster` connection parameter. To override the cluster that is used for specific models (or groups of models), use the `cluster` configuration parameter. 
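For instance, a sketch of overriding the cluster for a group of models in `dbt_project.yml` (the project, folder, and cluster names are assumptions):

```yml
# dbt_project.yml (fragment) — names assumed
models:
  my_project:
    marts:
      +cluster: quickstart   # overrides the cluster set in profiles.yml for these models
```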
@@ -45,11 +43,7 @@ Materialize, at its core, is a real-time database that delivers incremental view ### Indexes - - -- **v1.2.0:** Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). - - +Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). Like in any standard relational database, you can use [indexes](https://materialize.com/docs/overview/key-concepts/#indexes) to optimize query performance in Materialize. Improvements can be significant, reducing response times down to single-digit milliseconds. @@ -85,12 +79,6 @@ select ... ### Tests - - -- **v1.1.1:** Provide support for storing the results of a test query in a materialized view using the `store_failures` config. - - - If you set the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures), dbt will create a materialized view for each configured test that can keep track of failures over time. By default, test views are created in a schema suffixed with `dbt_test__audit`. To specify a custom suffix, use the `schema` config. diff --git a/website/docs/reference/resource-configs/meta.md b/website/docs/reference/resource-configs/meta.md index 18cc13ae969..bc0c0c7c041 100644 --- a/website/docs/reference/resource-configs/meta.md +++ b/website/docs/reference/resource-configs/meta.md @@ -4,12 +4,6 @@ datatype: "{}" default_value: {} --- - - -* `v0.21.0`: `meta` is now a config that can be set in `dbt_project.yml` and as a `config` YAML property for some resource types. It is applied hierarchically and merges on a per-key basis. - - - @@ -59,11 +55,13 @@ version: 2 sources: - name: model_name - meta: {} + config: + meta: {} tables: - name: table_name - meta: {} + config: + meta: {} columns: - name: column_name @@ -152,7 +150,6 @@ macros: arguments: - name: argument_name - meta: {} ``` @@ -177,12 +174,40 @@ exposures: + + + + +Support for grouping semantic models was added in dbt Core v1.7 + + + + + + + +```yml +semantic_models: + - name: semantic_model_name + config: + meta: {} + +``` + + + +The `meta` config can also be defined under the `semantic-models` config block in `dbt_project.yml`. See [configs and properties](/reference/configs-and-properties) for details. + + + + + ## Definition The `meta` field can be used to set metadata for a resource. This metadata is compiled into the `manifest.json` file generated by dbt, and is viewable in the auto-generated documentation. -Depending on the resource you're configuring, `meta` may be available within the `config` property, or as a top-level key. (For backwards compatibility, `meta` is always supported as a top-level key, though without the capabilities of config inheritance.) +Depending on the resource you're configuring, `meta` may be available within the `config` property, and/or as a top-level key. (For backwards compatibility, `meta` is often (but not always) supported as a top-level key, though without the capabilities of config inheritance.) ## Examples @@ -252,4 +277,20 @@ seeds: select 1 as id ``` +
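A hedged sketch of setting `meta` from a model's config block instead (the keys and values are assumptions):

```sql
-- models/orders.sql — meta keys assumed
{{ config(
    meta = {'owner': '@alice', 'contains_pii': true}
) }}

select 1 as id
```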
      + +### Assign owner in the dbt_project.yml as a config property + + + +```yml +models: + jaffle_shop: + materialized: table + config: + meta: + owner: "@alice" +``` + + diff --git a/website/docs/reference/resource-configs/no-configs.md b/website/docs/reference/resource-configs/no-configs.md index 5a4ba4eaaa2..5eec26917c8 100644 --- a/website/docs/reference/resource-configs/no-configs.md +++ b/website/docs/reference/resource-configs/no-configs.md @@ -8,4 +8,4 @@ If you were guided to this page from a data platform setup article, it most like - Setting up the profile is the only action the end-user needs to take on the data platform, or - The subsequent actions the end-user needs to take are not currently documented -If you'd like to contribute to data platform-specifc configuration information, refer to [Documenting a new adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) \ No newline at end of file +If you'd like to contribute to data platform-specific configuration information, refer to [Documenting a new adapter](/guides/adapter-creation) diff --git a/website/docs/reference/resource-configs/persist_docs.md b/website/docs/reference/resource-configs/persist_docs.md index 6facf3945cb..15b1e0bdb40 100644 --- a/website/docs/reference/resource-configs/persist_docs.md +++ b/website/docs/reference/resource-configs/persist_docs.md @@ -112,13 +112,6 @@ column and relation comments in the database. By default, documentation persistence is disabled, but it can be enabled for specific resources or groups of resources as needed. - - - - Support for this config on Redshift, Postgres, and Snowflake is new in 0.17.0 - - Support for column-level docs persistence is new for all databases in 0.17.0 - - - ## Support The `persist_docs` config is supported on the most widely used dbt adapters: @@ -151,12 +144,6 @@ Some known issues and limitations: - - -- Column names that must be quoted, such as column names containing special characters, will cause runtime errors if column-level `persist_docs` is enabled. This is fixed in v1.2. - - - diff --git a/website/docs/reference/resource-configs/plus-prefix.md b/website/docs/reference/resource-configs/plus-prefix.md index d8c54aa8e70..c1adbc0286a 100644 --- a/website/docs/reference/resource-configs/plus-prefix.md +++ b/website/docs/reference/resource-configs/plus-prefix.md @@ -5,7 +5,7 @@ title: Using the + prefix The `+` prefix is a dbt syntax feature, introduced in dbt v0.17.0, which helps disambiguate between [resource paths](/reference/resource-configs/resource-path) and configs in `dbt_project.yml` files. -It is only compatible with `dbt_project.yml` files that use [`config-version](/reference/project-configs/config-version): 2` +It is not compatible with `dbt_project.yml` files that use [`config-version`](/reference/project-configs/config-version) 1. 
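As a quick sketch of the disambiguation (project and folder names are assumptions): keys carrying a `+` are configs, while bare keys are resource paths.

```yml
# dbt_project.yml (fragment) — names assumed
models:
  jaffle_shop:             # resource path (project)
    +materialized: view    # config applied to all models in the project
    marts:                 # resource path (subdirectory)
      +materialized: table # config applied to models in marts/
```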
For example: diff --git a/website/docs/reference/resource-configs/postgres-configs.md b/website/docs/reference/resource-configs/postgres-configs.md index eb9108ad431..97a695ee12e 100644 --- a/website/docs/reference/resource-configs/postgres-configs.md +++ b/website/docs/reference/resource-configs/postgres-configs.md @@ -8,20 +8,25 @@ id: "postgres-configs" In dbt-postgres, the following incremental materialization strategies are supported: + + - `append` (default) -- `merge` - `delete+insert` + -## Performance Optimizations + -### Unlogged +- `append` (default) +- `merge` +- `delete+insert` - + - - **v0.14.1:** Introduced native support for `unlogged` config - +## Performance optimizations + +### Unlogged "Unlogged" tables can be considerably faster than ordinary tables, as they are not written to the write-ahead log nor replicated to read replicas. They are also considerably less safe than ordinary tables. See [Postgres docs](https://www.postgresql.org/docs/current/sql-createtable.html#SQL-CREATETABLE-UNLOGGED) for details. @@ -48,13 +53,7 @@ models: While Postgres works reasonably well for datasets smaller than about 10m rows, database tuning is sometimes required. It's important to create indexes for columns that are commonly used in joins or where clauses. - - - - **v0.20.0:** Introduced native support for `indexes` config - - - -Table models, incremental models, seeds, and snapshots may have a list of `indexes` defined. Each Postgres index can have three components: +Table models, incremental models, seeds, snapshots, and materialized views may have a list of `indexes` defined. Each Postgres index can have three components: - `columns` (list, required): one or more columns on which the index is defined - `unique` (boolean, optional): whether the index should be [declared unique](https://www.postgresql.org/docs/9.4/indexes-unique.html) - `type` (string, optional): a supported [index type](https://www.postgresql.org/docs/current/indexes-types.html) (B-tree, Hash, GIN, etc) @@ -107,45 +106,35 @@ models: -## Materialized view +## Materialized views -The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html) and refreshes them for every subsequent `dbt run` you execute. For more information, see [Refresh Materialized Views](https://www.postgresql.org/docs/15/sql-refreshmaterializedview.html) in the Postgres docs. +The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html). +Indexes are the only configuration that is specific to `dbt-postgres`. +The remaining configuration follows the general [materialized view](/docs/build/materializations#materialized-view) configuration. +There are also some limitations that we hope to address in the next version. -Materialized views support the optional configuration `on_configuration_change` with the following values: -- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. 
The following index action can be applied without the need to rebuild the materialized view: - - Added - - Dropped - - Updated -- `skip` — allows runs to continue while also providing a warning that the model was skipped -- `fail` — forces runs to fail if a change is detected in a materialized view +### Monitored configuration changes -You can create a materialized view by editing _one_ of these files: -- the SQL file for your model -- the `dbt_project.yml` configuration file +The settings below are monitored for changes applicable to `on_configuration_change`. -The following examples create a materialized view: +#### Indexes - +Index changes (`CREATE`, `DROP`) can be applied without the need to rebuild the materialized view. +This differs from a table model, where the table needs to be dropped and re-created to update the indexes. +If the `indexes` portion of the `config` block is updated, the changes will be detected and applied +directly to the materialized view in place. -```sql -{{ - config( - materialized = 'materialized_view', - on_configuration_change = 'apply', - ) -}} -``` +### Limitations - +#### Changing materialization to and from "materialized_view" +Swapping an already materialized model to a materialized view, and vice versa, is not supported. +The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`. +Running with `--full-refresh` flag will not work to drop the existing table or view and create the materialized view (and vice versa). +This would only need to be done once as the existing object would then be a materialized view. - - -```yaml -models: - path: - materialized: materialized_view -``` - +For example,`my_model`, has already been materialized as a table in the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="materialized_view"`, they will get an error. +The solution is to execute `DROP TABLE my_model` on the data warehouse before trying the model again. diff --git a/website/docs/reference/resource-configs/pre-hook-post-hook.md b/website/docs/reference/resource-configs/pre-hook-post-hook.md index 1660c50049b..de652bff088 100644 --- a/website/docs/reference/resource-configs/pre-hook-post-hook.md +++ b/website/docs/reference/resource-configs/pre-hook-post-hook.md @@ -115,13 +115,6 @@ Pre- and post-hooks can also call macros that return SQL statements. If your mac dbt aims to provide all the boilerplate SQL you need (DDL, DML, and DCL) via out-of-the-box functionality, which you can configure quickly and concisely. In some cases, there may be SQL that you want or need to run, specific to functionality in your data platform, which dbt does not (yet) offer as a built-in feature. In those cases, you can write the exact SQL you need, using dbt's compilation context, and pass it into a `pre-` or `post-` hook to run before or after your model, seed, or snapshot. - - -* `v0.12.2`: The `post_hook` alias for config blocks was introduced. Prior to this, users needed to use the alternative config syntax to apply pre- and post-hooks. 
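For instance, a minimal sketch of passing custom SQL into a hook from a model's config block (the `analyze` statement is an assumption about what your warehouse supports):

```sql
-- models/my_model.sql — hook statement assumed
{{ config(
    post_hook = ["analyze {{ this }}"]
) }}

select 1 as id
```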
- - - - ## Examples @@ -167,69 +160,6 @@ See: [Apache Spark docs on `ANALYZE TABLE`](https://spark.apache.org/docs/latest - - -### Grant privileges on a model - - - -```yml - -models: - +post-hook: "grant select on {{ this }} to group reporter" - -``` - - - -### Grant multiple privileges on a model - - - -```yml - -models: - +post-hook: - - "grant select on {{ this }} to group reporter" - - "grant select on {{ this }} to group transformer" - -``` - - - -### Call a macro to grant privileges on a model - - - -```yml - -models: - +post-hook: "{{ grant_select(this) }}" - -``` - - - - -### Grant privileges on a directory of models - - - -```yml - -models: - jaffle_shop: # this is the project name - marts: - marketing: - # this will be applied to all models in marts/marketing/ - +post-hook: "{{ grant_select(this) }}" - -``` - - - - - ### Additional examples We've compiled some more in-depth examples [here](/docs/build/hooks-operations#additional-examples). @@ -245,13 +175,17 @@ If multiple instances of any hooks are defined, dbt will run each hook using the ### Transaction behavior -If you're using an adapter that makes use of transactions (namely Postgres or Redshift), it's worth noting that by default hooks are executed inside of the same transaction as your model being created. +If you're using an adapter that uses transactions (namely Postgres or Redshift), it's worth noting that by default hooks are executed inside of the same transaction as your model being created. There may be occasions where you need to run these hooks _outside_ of a transaction, for example: -* You want to run a `VACUUM` in a `post-hook`, however this cannot be executed within a transaction ([Redshift docs](https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html#r_VACUUM_usage_notes)) -* You want to insert a record into an audit at the start of a run, and do not want that statement rolled back if the model creation fails. +* You want to run a `VACUUM` in a `post-hook`, however, this cannot be executed within a transaction ([Redshift docs](https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html#r_VACUUM_usage_notes)) +* You want to insert a record into an audit at the start of a run and do not want that statement rolled back if the model creation fails. + +To achieve this behavior, you can use one of the following syntaxes: + - Important note: Do not use this syntax if you are using a database where dbt does not support transactions. This includes databases like Snowflake, BigQuery, and Spark or Databricks. -To achieve this, you can use one of the following syntaxes. (Note: You should NOT use this syntax if using a database where dbt does not use transactions by default, including Snowflake, BigQuery, and Spark/Databricks.) + + #### Config block: use the `before_begin` and `after_commit` helper macros @@ -270,6 +204,9 @@ select ... ```
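+
+To make the two scenarios above concrete, here is a minimal sketch of the helper-macro syntax (the audit table, column names, and the exact statements are illustrative assumptions, not part of the documented examples):
+
+```sql
+{# pre_hook runs before dbt opens the model's transaction; post_hook runs after it commits #}
+{{
+  config(
+    materialized = 'table',
+    pre_hook = before_begin("insert into audit.run_log (model_name, started_at) values ('my_model', getdate())"),
+    post_hook = after_commit("vacuum {{ this }}")
+  )
+}}
+
+select ...
+```
+
+Here, the audit `insert` executes before dbt opens the model's transaction, and the `VACUUM` executes after that transaction commits, matching the two scenarios described above.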
      + + + #### Config block: use a dictionary @@ -294,6 +231,10 @@ select ... + + + + #### `dbt_project.yml`: Use a dictionary @@ -312,3 +253,5 @@ models: ``` + + diff --git a/website/docs/reference/resource-configs/redshift-configs.md b/website/docs/reference/resource-configs/redshift-configs.md index a0ebf7e88df..9bd127a1e1a 100644 --- a/website/docs/reference/resource-configs/redshift-configs.md +++ b/website/docs/reference/resource-configs/redshift-configs.md @@ -14,17 +14,28 @@ To-do: In dbt-redshift, the following incremental materialization strategies are supported: + + +- `append` (default) +- `delete+insert` + + + + + - `append` (default) - `merge` - `delete+insert` -All of these strategies are inheirited via from dbt-postgres. + + +All of these strategies are inherited from dbt-postgres. ## Performance optimizations ### Using sortkey and distkey -Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . Note that these settings will have no effect for models set to `view` or `ephemeral` models. +Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . Note that these settings will have no effect on models set to `view` or `ephemeral` models. - `dist` can have a setting of `all`, `even`, `auto`, or the name of a key. - `sort` accepts a list of sort keys, for example: `['timestamp', 'userid']`. dbt will build the sort key in the same order the fields are supplied. @@ -64,7 +75,7 @@ For more information on distkeys and sortkeys, view Amazon's docs: - [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing a Data Distribution Style](https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html) - [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing Sort Keys](https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html) -## Late Binding Views +## Late binding views Redshift supports views unbound from their dependencies, or [late binding views](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_VIEW.html#late-binding-views). This DDL option "unbinds" a view from the data it selects from. In practice, this means that if upstream views or tables are dropped with a cascade qualifier, the late-binding view does not get dropped as well. @@ -98,42 +109,51 @@ models: -## Materialized view +## Materialized views -The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html) and refreshes them for every subsequent `dbt run` that you execute. For more information, see [Refresh Materialized Views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) in the Redshift docs. +The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html). +Redshift-specific configuration includes the typical `dist`, `sort_type`, `sort`, and `backup`. +For materialized views, there is also the `auto_refresh` setting, which allows Redshift to [automatically refresh](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) the materialized view for you. 
+The remaining configuration follows the general [materialized view](/docs/build/materializations#Materialized-View) configuration. +There are also some limitations that we hope to address in the next version. -Materialized views support the optional configuration `on_configuration_change` with the following values: -- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The `auto_refresh` action can applied without the need to rebuild the materialized view. -- `skip` — allows runs to continue while also providing a warning that the model was skipped -- `fail` — forces runs to fail if a change is detected in a materialized view +### Monitored configuration changes -You can create a materialized view by editing _one_ of these files: -- the SQL file for your model -- the `dbt_project.yml` configuration file +The settings below are monitored for changes applicable to `on_configuration_change`. -The following examples create a materialized view: +#### Dist - +Changes to `dist` will result in a full refresh of the existing materialized view (applied at the time of the next `dbt run` of the model). Redshift requires a materialized view to be +dropped and recreated to apply a change to the `distkey` or `diststyle`. -```sql -{{ - config( - materialized = 'materialized_view', - on_configuration_change = 'apply', - ) -}} -``` +#### Sort type, sort - +Changes to `sort_type` or `sort` will result in a full refresh. Redshift requires a materialized +view to be dropped and recreated to apply a change to the `sortkey` or `sortstyle`. +#### Backup - +Changes to `backup` will result in a full refresh. Redshift requires a materialized +view to be dropped and recreated to apply a change to the `backup` setting. -```yaml -models: - path: - materialized: materialized_view -``` - +#### Auto refresh + +The `auto_refresh` setting can be updated via an `ALTER` statement. This setting effectively toggles +automatic refreshes on or off. The default setting for this config is off (`False`). If this +is the only configuration change for the materialized view, dbt will choose to apply +an `ALTER` statement instead of issuing a full refresh, + +### Limitations + +#### Changing materialization from "materialized_view" to "table" or "view" + +Swapping a materialized view to a table or view is not supported. +You must manually drop the existing materialized view in the data warehouse prior to calling `dbt run`. +Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case. +This would only need to be done once as the existing object would then be a materialized view. + +For example, assume that a materialized view, `my_mv.sql`, has already been materialized to the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="table"`, they will get an error. +The workaround is to execute `DROP MATERIALIZED VIEW my_mv CASCADE` on the data warehouse before trying the model again. diff --git a/website/docs/reference/resource-configs/resource-path.md b/website/docs/reference/resource-configs/resource-path.md index 258b83dcd57..20406f26f2a 100644 --- a/website/docs/reference/resource-configs/resource-path.md +++ b/website/docs/reference/resource-configs/resource-path.md @@ -1,11 +1,28 @@ -The `` nomenclature is used in this documentation when documenting how to configure a model, seed, or snapshot, from your `dbt_project.yml` file. 
It represents the nested dictionary keys that provide the path to either a directory of models, or a single model.
+---
+title: Resource path
+description: "Learn how to use resource paths to configure resource types in dbt."
+id: resource-path
+sidebar_label: "About resource paths"
+---
+
+The `` nomenclature is used in this documentation when documenting how to configure resource types like models, seeds, snapshots, tests, sources, and others, from your `dbt_project.yml` file.
+
+It represents the nested dictionary keys that provide the path to a directory of that resource type, or a single instance of that resource type by name.
+
+```yml
+resource_type:
+  project_name:
+    directory_name:
+      subdirectory_name:
+        instance_of_resource_type (by name):
+          ...
+```

 ## Example

-:::info
-This example is for models, but the same concepts apply for seeds and snapshots.
+The following examples are mostly for models and a source, but the same concepts apply for seeds, snapshots, tests, sources, and other resource types.

-:::
+### Apply config to all models

 To apply a configuration to all models, do not use a ``:

@@ -18,6 +35,8 @@ models:

+### Apply config to all models in your project
+
 To apply a configuration to all models in _your_ project only, use your [project name](/reference/project-configs/name) as the ``:

@@ -32,6 +51,8 @@ models:

+### Apply config to all models in a subdirectory
+
 To apply a configuration to all models in a subdirectory of your project, e.g. `staging`, nest the directory under the project name:

@@ -57,6 +78,8 @@ In the following project, this would apply to models in the `staging/` directory

 ```

+### Apply config to one model
+
 To apply a configuration to one model, nest the full path under the project name. For a model at `/staging/stripe/payments.sql`, this would look like:

@@ -92,3 +115,19 @@ In the following project, this would only apply to the `payments` model:

    └── payments.sql
 ```

+### Apply config to a source nested in a subfolder
+
+To disable a source table nested in a YAML file in a subfolder, you will need to supply the subfolder(s) within the path to that YAML file, as well as the source name and the table name in the `dbt_project.yml` file.

      + The following example shows how to disable a source table nested in a YAML file in a subfolder: + + + + ```yaml + sources: + your_project_name: + subdirectory_name: + source_name: + source_table_name: + +enabled: false + ``` + diff --git a/website/docs/reference/resource-configs/schema.md b/website/docs/reference/resource-configs/schema.md index c976bf6502a..3852ee4e639 100644 --- a/website/docs/reference/resource-configs/schema.md +++ b/website/docs/reference/resource-configs/schema.md @@ -1,14 +1,70 @@ --- sidebar_label: "schema" resource_types: [models, seeds, tests] -description: "Schema - Read this in-depth guide to learn about configurations in dbt." +description: "Override the default schema when dbt creates resources in your data platform." datatype: string --- -:::caution Heads up! -This is a work in progress document. While this configuration applies to multiple resource types, the documentation has only been written for seeds. + + -::: +Specify a custom schema for a group of models in your `dbt_project.yml` file or a [config block](/reference/resource-configs/schema#models). + +For example, if you have a group of marketing-related models and you want to place them in a separate schema called `marketing`, you can configure it like this: + + + +```yml +models: + your_project: + marketing: # Grouping or folder for set of models + +schema: marketing +``` + + +This would result in the generated relations for these models being located in the `marketing` schema, so the full relation names would be `analytics.marketing.model_name`. + + + + +Configure a custom schema in your `dbt_project.yml` file. + +For example, if you have a seed that should be placed in a separate schema called `mappings`, you can configure it like this: + + + +```yml +seeds: + your_project: + product_mappings: + +schema: mappings +``` + +This would result in the generated relation being located in the `mappings` schema, so the full relation name would be `analytics.mappings.product_mappings`. + + + + + +Customize the schema for storing test results in your `dbt_project.yml` file. + +For example, to save test results in a specific schema, you can configure it like this: + + + + +```yml +tests: + +store_failures: true + +schema: test_results +``` + +This would result in the test results being stored in the `test_results` schema. + + + + +Refer to [Usage](#usage) for more examples. ## Definition Optionally specify a custom schema for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a schema for a [snapshot](/docs/build/snapshots), use the [`target_schema` config](/reference/resource-configs/target_schema)). diff --git a/website/docs/reference/resource-configs/severity.md b/website/docs/reference/resource-configs/severity.md index c89c6db0716..25bab9647d6 100644 --- a/website/docs/reference/resource-configs/severity.md +++ b/website/docs/reference/resource-configs/severity.md @@ -6,14 +6,6 @@ resource_types: [tests] datatype: string --- - - -* `v0.14.0`: Introduced `severity` config -* `v0.20.0`: Introduced `error_if` + `warn_if` configs. Enabled configuration of tests from `dbt_project.yml` -* `v0.21.0`: Introduced `config` property for tests - - - Tests return a number of failures—most often, this is the count of rows returned by the test query, but it could be a [custom calculation](/reference/resource-configs/fail_calc). Generally, if the number of failures is nonzero, the test returns an error. 
This makes sense, as test queries are designed to return all the rows you _don't_ want: duplicate records, null values, etc. It's possible to configure tests to return warnings instead of errors, or to make the test status conditional on the number of failures returned. Maybe 1 duplicate record can count as a warning, but 10 duplicate records should count as an error. diff --git a/website/docs/reference/resource-configs/singlestore-configs.md b/website/docs/reference/resource-configs/singlestore-configs.md index f503779f0fc..0c93d557a8b 100644 --- a/website/docs/reference/resource-configs/singlestore-configs.md +++ b/website/docs/reference/resource-configs/singlestore-configs.md @@ -3,13 +3,6 @@ title: "SingleStore configurations" id: "singlestore-configs" --- - - - - - **v1.1.2:** Added support for for `storage_type`, `indexes`, `primary_key`, `sort_key`, `shard_key`, `unique_table_key`, `charset`, `collation` options for creating SingleStore tables. - - - ## Performance Optimizations [SingleStore Physical Database Schema Design documentation](https://docs.singlestore.com/managed-service/en/create-a-database/physical-database-schema-design/concepts-of-physical-database-schema-design.html) is helpful if you want to use specific options (that are described below) in your dbt project. diff --git a/website/docs/reference/resource-configs/snowflake-configs.md b/website/docs/reference/resource-configs/snowflake-configs.md index 42ee3635089..30c7966ec68 100644 --- a/website/docs/reference/resource-configs/snowflake-configs.md +++ b/website/docs/reference/resource-configs/snowflake-configs.md @@ -77,7 +77,7 @@ select ... ``` -In this example, you can set up a query tag to be applied to every query with the model's name. +In this example, you can set up a query tag to be applied to every query with the model's name. ```sql @@ -301,7 +301,7 @@ models: -## Temporary Tables +## Temporary tables Beginning in dbt version 1.3, incremental table merges for Snowflake prefer to utilize a `view` rather than a `temporary table`. The reasoning was to avoid the database write step that a temporary table would initiate and save compile time. @@ -341,3 +341,99 @@ In the configuration format for the model SQL file:
+
+
+
+## Dynamic tables
+
+The Snowflake adapter supports [dynamic tables](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table).
+This materialization is specific to Snowflake, which means that any model configuration that
+would normally come along for the ride from `dbt-core` (e.g. as with a `view`) may not be available
+for dynamic tables. This gap will decrease in future patches and versions.
+While this materialization is specific to Snowflake, it very much follows the implementation
+of [materialized views](/docs/build/materializations#materialized-view).
+In particular, dynamic tables have access to the `on_configuration_change` setting.
+There are also some limitations that we hope to address in the next version.
+
+### Parameters
+
+Dynamic tables in `dbt-snowflake` require the following parameters:
+- `target_lag`
+- `snowflake_warehouse`
+- `on_configuration_change`
+
+To learn more about each parameter and what values it can take, see
+the Snowflake docs page: [`CREATE DYNAMIC TABLE: Parameters`](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table).
+
+### Usage
+
+You can create a dynamic table by editing _one_ of these files:
+
+- the SQL file for your model
+- the `dbt_project.yml` configuration file
+
+The following examples create a dynamic table:
+
+```sql
+{{ config(
+    materialized = 'dynamic_table',
+    snowflake_warehouse = 'snowflake_warehouse',
+    target_lag = '10 minutes',
+) }}
+```
+
+```yaml
+models:
+  path:
+    materialized: dynamic_table
+    snowflake_warehouse: snowflake_warehouse
+    target_lag: '10 minutes'
+```
+
+### Monitored configuration changes
+
+The settings below are monitored for changes applicable to `on_configuration_change`.
+
+#### Target lag
+
+Changes to `target_lag` can be applied by running an `ALTER` statement. Refreshing is essentially
+always on for dynamic tables; this setting changes how frequently the dynamic table is updated.
+
+#### Warehouse
+
+Changes to `snowflake_warehouse` can be applied via an `ALTER` statement.
+
+### Limitations
+
+#### Changing materialization to and from "dynamic_table"
+
+Swapping an already materialized model to a dynamic table, and vice versa, is not supported.
+The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`.
+Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case.
+This would only need to be done once as the existing object would then be a dynamic table.
+
+For example, assume the model below, `my_model`, has already been materialized to the underlying data platform via `dbt run`.
+If the user changes the model's config to `materialized="dynamic_table"`, they will get an error.
+The workaround is to execute `DROP TABLE my_model` on the data warehouse before trying the model again.
+
+```sql
+
+{{ config(
+    materialized="table" # or any model type, e.g. view or incremental
+) }}
+
+```
+
diff --git a/website/docs/reference/resource-configs/spark-configs.md b/website/docs/reference/resource-configs/spark-configs.md
index 95a853107f6..ce3b317f0f1 100644
--- a/website/docs/reference/resource-configs/spark-configs.md
+++ b/website/docs/reference/resource-configs/spark-configs.md
@@ -29,12 +29,6 @@ When materializing a model as `table`, you may include several optional configs

 ## Incremental models

-
-
-
- `dbt-spark==0.19.0`: Added the `append` strategy as default for all platforms, file types, and connection methods.
- - - dbt seeks to offer useful, intuitive modeling abstractions by means of its built-in configurations and materializations. Because there is so much variance between Apache Spark clusters out in the world—not to mention the powerful features offered to Databricks users by the Delta file format and custom runtime—making sense of all the available options is an undertaking in its own right. Alternatively, you can use Apache Iceberg or Apache Hudi file format with Apache Spark runtime for building incremental models. @@ -192,13 +186,6 @@ insert overwrite table analytics.spark_incremental ### The `merge` strategy - - - - `dbt-spark==0.15.3`: Introduced `merge` incremental strategy - - - - **Usage notes:** The `merge` incremental strategy requires: - `file_format: delta, iceberg or hudi` - Databricks Runtime 5.1 and above for delta file format @@ -294,12 +281,6 @@ or `show table extended in [database] like '*'`. ## Always `schema`, never `database` - - - - `dbt-spark==0.17.0` ended use of `database` in all cases. - - - Apache Spark uses the terms "schema" and "database" interchangeably. dbt understands `database` to exist at a higher level than `schema`. As such, you should _never_ use or set `database` as a node config or in the target profile when running dbt-spark. diff --git a/website/docs/reference/resource-configs/starrocks-configs.md b/website/docs/reference/resource-configs/starrocks-configs.md new file mode 100644 index 00000000000..093534515c6 --- /dev/null +++ b/website/docs/reference/resource-configs/starrocks-configs.md @@ -0,0 +1,116 @@ +--- +title: "Starrocks configurations" +id: "starrocks-configs" +description: "Starrocks Configurations - Read this in-depth guide to learn about configurations in dbt." +--- + +## Model Configuration + +A dbt model can be configured using the following syntax: + + + + + + + +```yaml +models: + : + materialized: table // table or view or materialized_view + keys: ['id', 'name', 'some_date'] + table_type: 'PRIMARY' // PRIMARY or DUPLICATE or UNIQUE + distributed_by: ['id'] + buckets: 3 // default 10 + partition_by: ['some_date'] + partition_by_init: ["PARTITION p1 VALUES [('1971-01-01 00:00:00'), ('1991-01-01 00:00:00')),PARTITION p1972 VALUES [('1991-01-01 00:00:00'), ('1999-01-01 00:00:00'))"] + properties: [{"replication_num":"1", "in_memory": "true"}] + refresh_method: 'async' // only for materialized view default manual +``` + + + + + + + +```yaml +models: + - name: + config: + materialized: table // table or view or materialized_view + keys: ['id', 'name', 'some_date'] + table_type: 'PRIMARY' // PRIMARY or DUPLICATE or UNIQUE + distributed_by: ['id'] + buckets: 3 // default 10 + partition_by: ['some_date'] + partition_by_init: ["PARTITION p1 VALUES [('1971-01-01 00:00:00'), ('1991-01-01 00:00:00')),PARTITION p1972 VALUES [('1991-01-01 00:00:00'), ('1999-01-01 00:00:00'))"] + properties: [{"replication_num":"1", "in_memory": "true"}] + refresh_method: 'async' // only for materialized view default manual +``` + + + + + + + +```jinja +{{ config( + materialized = 'table', + keys=['id', 'name', 'some_date'], + table_type='PRIMARY', + distributed_by=['id'], + buckets=3, + partition_by=['some_date'], + .... 
+) }} +``` + + + + +### Configuration Description + +| Option | Description | +|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `materialized` | How the model will be materialized into Starrocks. Supports view, table, incremental, ephemeral, and materialized_view. | +| `keys` | Which columns serve as keys. | +| `table_type` | Table type, supported are PRIMARY or DUPLICATE or UNIQUE. | +| `distributed_by` | Specifies the column of data distribution. If not specified, it defaults to random. | +| `buckets` | The bucket number in one partition. If not specified, it will be automatically inferred. | +| `partition_by` | The partition column list. | +| `partition_by_init` | The partition rule or some real partitions item. | +| `properties` | The table properties configuration of Starrocks. ([Starrocks table properties](https://docs.starrocks.io/en-us/latest/sql-reference/sql-statements/data-definition/CREATE_TABLE#properties)) | +| `refresh_method` | How to refresh materialized views. | + +## Read From Catalog +First you need to add this catalog to starrocks. The following is an example of hive. + +```sql +CREATE EXTERNAL CATALOG `hive_catalog` +PROPERTIES ( + "hive.metastore.uris" = "thrift://127.0.0.1:8087", + "type"="hive" +); +``` +How to add other types of catalogs can be found in the documentation. [Catalog Overview](https://docs.starrocks.io/en-us/latest/data_source/catalog/catalog_overview) Then write the sources.yaml file. +```yaml +sources: + - name: external_example + schema: hive_catalog.hive_db + tables: + - name: hive_table_name +``` +Finally, you might use below marco quote +```jinja +{{ source('external_example', 'hive_table_name') }} +``` \ No newline at end of file diff --git a/website/docs/reference/resource-configs/store_failures.md b/website/docs/reference/resource-configs/store_failures.md index 62ae33ba713..2c596d1cf3e 100644 --- a/website/docs/reference/resource-configs/store_failures.md +++ b/website/docs/reference/resource-configs/store_failures.md @@ -3,21 +3,14 @@ resource_types: [tests] datatype: boolean --- - - -* `v0.20.0`: Introduced `store_failures` config and functionality -* `v0.21.0`: Introduced `config` property for tests - - - -The configured test(s) will store their failures when `dbt test --store-failures` is invoked. +The configured test(s) will store their failures when `dbt test --store-failures` is invoked. If you set this configuration as `false` but [`store_failures_as`](/reference/resource-configs/store_failures_as) is configured, it will be overriden. ## Description Optionally set a test to always or never store its failures in the database. - If specified as `true` or `false`, the `store_failures` config will take precedence over the presence or absence of the `--store-failures` flag. - If the `store_failures` config is `none` or omitted, the resource will use the value of the `--store-failures` flag. -- When true, `store_failures` save all the record(s) that failed the test only if [limit](/reference/resource-configs/limit) is not set or if there are fewer records than the limit. `store_failures` are saved in a new table with the name of the test. By default, `store_failures` use a schema named `dbt_test__audit`, but, you can configure the schema to a different value. 
+- When true, `store_failures` save all the record(s) that failed the test only if [limit](/reference/resource-configs/limit) is not set or if there are fewer records than the limit. `store_failures` are saved in a new table with the name of the test. By default, `store_failures` use a schema named `dbt_test__audit`, but, you can [configure](/reference/resource-configs/schema#tests) the schema to a different value. This logic is encoded in the [`should_store_failures()`](https://github.com/dbt-labs/dbt-core/blob/98c015b7754779793e44e056905614296c6e4527/core/dbt/include/global_project/macros/materializations/helpers.sql#L77) macro. diff --git a/website/docs/reference/resource-configs/store_failures_as.md b/website/docs/reference/resource-configs/store_failures_as.md new file mode 100644 index 00000000000..a9149360089 --- /dev/null +++ b/website/docs/reference/resource-configs/store_failures_as.md @@ -0,0 +1,76 @@ +--- +resource_types: [tests] +id: "store_failures_as" +--- + +For the `test` resource type, `store_failures_as` is an optional config that specifies how test failures should be stored in the database. If [`store_failures`](/reference/resource-configs/store_failures) is also configured, `store_failures_as` takes precedence. + +The three supported values are: + +- `ephemeral` — nothing stored in the database (default) +- `table` — test failures stored as a database table +- `view` — test failures stored as a database view + +You can configure it in all the same places as `store_failures`, including singular tests (.sql files), generic tests (.yml files), and dbt_project.yml. + +### Examples + +#### Singular test + +[Singular test](https://docs.getdbt.com/docs/build/tests#singular-tests) in `tests/singular/check_something.sql` file + +```sql +{{ config(store_failures_as="table") }} + +-- custom singular test +select 1 as id +where 1=0 +``` + +#### Generic test + +[Generic tests](https://docs.getdbt.com/docs/build/tests#generic-tests) in `models/_models.yml` file + +```yaml +models: + - name: my_model + columns: + - name: id + tests: + - not_null: + config: + store_failures_as: view + - unique: + config: + store_failures_as: ephemeral +``` + +#### Project level + +Config in `dbt_project.yml` + +```yaml +name: "my_project" +version: "1.0.0" +config-version: 2 +profile: "sandcastle" + +tests: + my_project: + +store_failures_as: table + my_subfolder_1: + +store_failures_as: view + my_subfolder_2: + +store_failures_as: ephemeral +``` + +### "Clobbering" configs + +As with most other configurations, `store_failures_as` is "clobbered" when applied hierarchically. Whenever a more specific value is available, it will completely replace the less specific value. + +Additional resources: + +- [Test configurations](/reference/test-configs#related-documentation) +- [Test-specific configurations](/reference/test-configs#test-specific-configurations) +- [Configuring directories of models in dbt_project.yml](/reference/model-configs#configuring-directories-of-models-in-dbt_projectyml) +- [Config inheritance](/reference/configs-and-properties#config-inheritance) \ No newline at end of file diff --git a/website/docs/reference/resource-configs/teradata-configs.md b/website/docs/reference/resource-configs/teradata-configs.md index f0f4f1a6f3e..12a8929429d 100644 --- a/website/docs/reference/resource-configs/teradata-configs.md +++ b/website/docs/reference/resource-configs/teradata-configs.md @@ -35,14 +35,21 @@ id: "teradata-configs" ### * `table_kind` - define the table kind. 
Legal values are `MULTISET` (default for ANSI transaction mode required by `dbt-teradata`) and `SET`, e.g.: - ```yaml - {{ - config( - materialized="table", - table_kind="SET" - ) - }} - ``` + * in sql materialization definition file: + ```yaml + {{ + config( + materialized="table", + table_kind="SET" + ) + }} + ``` + * in seed configuration: + ```yaml + seeds: + : + table_kind: "SET" + ``` For details, see [CREATE TABLE documentation](https://docs.teradata.com/r/76g1CuvvQlYBjb2WPIuk3g/B6Js16DRQVwPDjgJ8rz7hg). * `table_option` - defines table options. The config supports multiple statements. The definition below uses the Teradata syntax definition to explain what statements are allowed. Square brackets `[]` denote optional parameters. The pipe symbol `|` separates statements. Use commas to combine multiple statements as shown in the examples below: ``` @@ -87,37 +94,51 @@ id: "teradata-configs" ``` Examples: - - :::info Separators between statements - Note the commas that separate statements in `table_option` config. - ::: - - ```yaml - {{ - config( - materialized="table", - table_option="NO FALLBACK" - ) - }} - ``` - ```yaml - {{ - config( - materialized="table", - table_option="NO FALLBACK, NO JOURNAL" - ) - }} - ``` - ```yaml - {{ - config( - materialized="table", - table_option="NO FALLBACK, NO JOURNAL, CHECKSUM = ON, - NO MERGEBLOCKRATIO, - WITH CONCURRENT ISOLATED LOADING FOR ALL" - ) - }} - ``` + * in sql materialization definition file: + ```yaml + {{ + config( + materialized="table", + table_option="NO FALLBACK" + ) + }} + ``` + ```yaml + {{ + config( + materialized="table", + table_option="NO FALLBACK, NO JOURNAL" + ) + }} + ``` + ```yaml + {{ + config( + materialized="table", + table_option="NO FALLBACK, NO JOURNAL, CHECKSUM = ON, + NO MERGEBLOCKRATIO, + WITH CONCURRENT ISOLATED LOADING FOR ALL" + ) + }} + ``` + * in seed configuration: + ```yaml + seeds: + : + table_option:"NO FALLBACK" + ``` + ```yaml + seeds: + : + table_option:"NO FALLBACK, NO JOURNAL" + ``` + ```yaml + seeds: + : + table_option: "NO FALLBACK, NO JOURNAL, CHECKSUM = ON, + NO MERGEBLOCKRATIO, + WITH CONCURRENT ISOLATED LOADING FOR ALL" + ``` For details, see [CREATE TABLE documentation](https://docs.teradata.com/r/76g1CuvvQlYBjb2WPIuk3g/B6Js16DRQVwPDjgJ8rz7hg). @@ -160,46 +181,67 @@ id: "teradata-configs" ``` Examples: - - :::info Separators between statements - Note, unlike with `table_option` statements, there are no commas between statements in `index` config. - ::: - - ```yaml - {{ - config( - materialized="table", - index="UNIQUE PRIMARY INDEX ( GlobalID )" - ) - }} - ``` - - ```yaml - {{ - config( - materialized="table", - index="PRIMARY INDEX(id) - PARTITION BY RANGE_N(create_date - BETWEEN DATE '2020-01-01' - AND DATE '2021-01-01' - EACH INTERVAL '1' MONTH)" - ) - }} - ``` - - ```yaml - {{ - config( - materialized="table", - index="PRIMARY INDEX(id) - PARTITION BY RANGE_N(create_date - BETWEEN DATE '2020-01-01' - AND DATE '2021-01-01' - EACH INTERVAL '1' MONTH) - INDEX index_attrA (attrA) WITH LOAD IDENTITY" - ) - }} - ``` + * in sql materialization definition file: + ```yaml + {{ + config( + materialized="table", + index="UNIQUE PRIMARY INDEX ( GlobalID )" + ) + }} + ``` + > :information_source: Note, unlike in `table_option`, there are no commas between index statements! 
+ ```yaml + {{ + config( + materialized="table", + index="PRIMARY INDEX(id) + PARTITION BY RANGE_N(create_date + BETWEEN DATE '2020-01-01' + AND DATE '2021-01-01' + EACH INTERVAL '1' MONTH)" + ) + }} + ``` + ```yaml + {{ + config( + materialized="table", + index="PRIMARY INDEX(id) + PARTITION BY RANGE_N(create_date + BETWEEN DATE '2020-01-01' + AND DATE '2021-01-01' + EACH INTERVAL '1' MONTH) + INDEX index_attrA (attrA) WITH LOAD IDENTITY" + ) + }} + ``` + * in seed configuration: + ```yaml + seeds: + : + index: "UNIQUE PRIMARY INDEX ( GlobalID )" + ``` + > :information_source: Note, unlike in `table_option`, there are no commas between index statements! + ```yaml + seeds: + : + index: "PRIMARY INDEX(id) + PARTITION BY RANGE_N(create_date + BETWEEN DATE '2020-01-01' + AND DATE '2021-01-01' + EACH INTERVAL '1' MONTH)" + ``` + ```yaml + seeds: + : + index: "PRIMARY INDEX(id) + PARTITION BY RANGE_N(create_date + BETWEEN DATE '2020-01-01' + AND DATE '2021-01-01' + EACH INTERVAL '1' MONTH) + INDEX index_attrA (attrA) WITH LOAD IDENTITY" + ``` ## Seeds :::info Using seeds to load raw data @@ -220,6 +262,35 @@ Loading CSVs using dbt's seed functionality is not performant for large files. C +use_fastload: true ``` +#### Grants + +Grants are supported in dbt-teradata adapter with release version 1.2.0 and above. You can use grants to manage access to the datasets you're producing with dbt. To implement these permissions, define grants as resource configs on each model, seed, or snapshot. Define the default grants that apply to the entire project in your `dbt_project.yml`, and define model-specific grants within each model's SQL or YAML file. + +for e.g. : + models/schema.yml + ```yaml + models: + - name: model_name + config: + grants: + select: ['user_a', 'user_b'] + ``` + +Another e.g. for adding multiple grants: + + ```yaml + models: + - name: model_name + config: + materialized: table + grants: + select: ["user_b"] + insert: ["user_c"] + ``` +> :information_source: `copy_grants` is not supported in Teradata. + +More on Grants can be found at https://docs.getdbt.com/reference/resource-configs/grants + ## Common Teradata-specific tasks * *collect statistics* - when a table is created or modified significantly, there might be a need to tell Teradata to collect statistics for the optimizer. It can be done using `COLLECT STATISTICS` command. You can perform this step using dbt's `post-hooks`, e.g.: diff --git a/website/docs/reference/resource-configs/upsolver-configs.md b/website/docs/reference/resource-configs/upsolver-configs.md new file mode 100644 index 00000000000..b917ee2cc58 --- /dev/null +++ b/website/docs/reference/resource-configs/upsolver-configs.md @@ -0,0 +1,464 @@ +--- +title: "Upsolver configurations" +id: "upsolver-configs" +description: "Upsolver Configurations - Read this in-depth guide to learn about configurations in dbt." 
+--- + +## Supported Upsolver SQLake functionality + +| COMMAND | STATE | MATERIALIZED | +| ------ | ------ | ------ | +| SQL compute cluster| not supported | - | +| SQL connections| supported | connection | +| SQL copy job | supported | incremental | +| SQL merge job | supported | incremental | +| SQL insert job | supported | incremental | +| SQL materialized views | supported | materializedview | +| Expectations | supported | incremental | + +## Configs materialization + +| Config | Required | Materialization | Description | Example | +| ------ | --------- | --------------- | ---------- | ------- | +| connection_type | Yes | connection | Connection identifier: S3/GLUE_CATALOG/KINESIS | connection_type='S3' | +| connection_options | Yes | connection | Dictionary of options supported by selected connection | connection_options={ 'aws_role': 'aws_role', 'external_id': 'SAMPLES', 'read_only': True } | +| incremental_strategy | No | incremental | Define one of incremental strategies: merge/copy/insert. Default: copy | incremental_strategy='merge' | +| source | No | incremental | Define source to copy from: S3/KAFKA/KINESIS | source = 'S3' | +| target_type | No | incremental | Define target type REDSHIFT/ELASTICSEARCH/S3/SNOWFLAKE/POSTGRES. Default None for Data lake | target_type='Snowflake' | +| target_prefix | False | incremental | Define PREFIX for ELASTICSEARCH target type | target_prefix = 'orders' | +| target_location | False | incremental | Define LOCATION for S3 target type | target_location = 's3://your-bucket-name/path/to/folder/' | +| schema | Yes/No | incremental | Define target schema. Required if target_type, no table created in a metastore connection | schema = 'target_schema' | +| database | Yes/No | incremental | Define target connection. Required if target_type, no table created in a metastore connection | database = 'target_connection' | +| alias | Yes/No | incremental | Define target table. Required if target_type, no table created in a metastore connection | alias = 'target_table' | +| delete_condition | No | incremental | Records that match the ON condition and a delete condition can be deleted | delete_condition='nettotal > 1000' | +| partition_by | No | incremental | List of dictionaries to define partition_by for target metastore table | partition_by=[{'field':'$field_name'}] | +| primary_key | No | incremental | List of dictionaries to define partition_by for target metastore table | primary_key=[{'field':'customer_email', 'type':'string'}] | +| map_columns_by_name | No | incremental | Maps columns from the SELECT statement to the table. Boolean. Default: False | map_columns_by_name=True | +| sync | No | incremental/materializedview | Boolean option to define if job is synchronized or non-msynchronized. Default: False | sync=True | +| options | No | incremental/materializedview | Dictionary of job options | options={ 'START_FROM': 'BEGINNING', 'ADD_MISSING_COLUMNS': True } | + +## SQL connection + +Connections are used to provide Upsolver with the proper credentials to bring your data into SQLake as well as to write out your transformed data to various services. 
More details on ["Upsolver SQL connections"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-connections) +As a dbt model connection is a model with materialized='connection' + +```sql +{{ config( + materialized='connection', + connection_type={ 'S3' | 'GLUE_CATALOG' | 'KINESIS' | 'KAFKA'| 'SNOWFLAKE' }, + connection_options={} + ) +}} +``` + +Running this model will compile CREATE CONNECTION(or ALTER CONNECTION if exists) SQL and send it to Upsolver engine. Name of the connection will be name of the model. + +## SQL copy job + +A COPY FROM job allows you to copy your data from a given source into a table created in a metastore connection. This table then serves as your staging table and can be used with SQLake transformation jobs to write to various target locations. More details on ["Upsolver SQL copy-from"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/copy-from) + +As a dbt model copy job is model with materialized='incremental' + +```sql +{{ config( materialized='incremental', + sync=True|False, + source = 'S3'| 'KAFKA' | ... , + options={ + 'option_name': 'option_value' + }, + partition_by=[{}] + ) +}} +SELECT * FROM {{ ref() }} +``` + +Running this model will compile CREATE TABLE SQL for target type Data lake (or ALTER TABLE if exists) and CREATE COPY JOB(or ALTER COPY JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL insert job + +An INSERT job defines a query that pulls in a set of data based on the given SELECT statement and inserts it into the designated target. This query is then run periodically based on the RUN_INTERVAL defined within the job. More details on ["Upsolver SQL insert"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/insert). + +As a dbt model insert job is model with materialized='incremental' and incremental_strategy='insert' + +```sql +{{ config( materialized='incremental', + sync=True|False, + map_columns_by_name=True|False, + incremental_strategy='insert', + options={ + 'option_name': 'option_value' + }, + primary_key=[{}] + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +HAVING COUNT(DISTINCT orderid::string) ... +``` + +Running this model will compile CREATE TABLE SQL for target type Data lake(or ALTER TABLE if exists) and CREATE INSERT JOB(or ALTER INSERT JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL merge job + +A MERGE job defines a query that pulls in a set of data based on the given SELECT statement and inserts into, replaces, or deletes the data from the designated target based on the job definition. This query is then run periodically based on the RUN_INTERVAL defined within the job. More details on ["Upsolver SQL merge"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/merge). + +As a dbt model merge job is model with materialized='incremental' and incremental_strategy='merge' + +```sql +{{ config( materialized='incremental', + sync=True|False, + map_columns_by_name=True|False, + incremental_strategy='merge', + options={ + 'option_name': 'option_value' + }, + primary_key=[{}] + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +HAVING COUNT ... 
+``` + +Running this model will compile CREATE TABLE SQL for target type Data lake(or ALTER TABLE if exists) and CREATE MERGE JOB(or ALTER MERGE JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL materialized views + +When transforming your data, you may find that you need data from multiple source tables in order to achieve your desired result. +In such a case, you can create a materialized view from one SQLake table in order to join it with your other table (which in this case is considered the main table). More details on ["Upsolver SQL materialized views"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/sql-materialized-views). + +As a dbt model materialized views is model with materialized='materializedview'. + +```sql +{{ config( materialized='materializedview', + sync=True|False, + options={'option_name': 'option_value'} + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +``` + +Running this model will compile CREATE MATERIALIZED VIEW SQL(or ALTER MATERIALIZED VIEW if exists) and send it to Upsolver engine. Name of the materializedview will be name of the model. + +## Expectations/constraints + +Data quality conditions can be added to your job to drop a row or trigger a warning when a column violates a predefined condition. + +```sql +WITH EXPECTATION EXPECT +ON VIOLATION WARN +``` + +Expectations can be implemented with dbt constraints +Supported constraints: check and not_null + +```yaml +models: + - name: + # required + config: + contract: + enforced: true + # model-level constraints + constraints: + - type: check + columns: ['', ''] + expression: "column1 <= column2" + name: + - type: not_null + columns: ['column1', 'column2'] + name: + + columns: + - name: + data_type: string + + # column-level constraints + constraints: + - type: not_null + - type: check + expression: "REGEXP_LIKE(, '^[0-9]{4}[a-z]{5}$')" + name: +``` + +## Projects examples + +> projects examples link: [github.com/dbt-upsolver/examples/](https://github.com/Upsolver/dbt-upsolver/tree/main/examples) + +## Connection options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| aws_role | s3 | True | True | 'aws_role': `''` | +| external_id | s3 | True | True | 'external_id': `''` | +| aws_access_key_id | s3 | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | s3 | True | True | 'aws_secret_access_key_id': `''` | +| path_display_filter | s3 | True | True | 'path_display_filter': `''` | +| path_display_filters | s3 | True | True | 'path_display_filters': (`''`, ...) | +| read_only | s3 | True | True | 'read_only': True/False | +| encryption_kms_key | s3 | True | True | 'encryption_kms_key': `''` | +| encryption_customer_managed_key | s3 | True | True | 'encryption_customer_kms_key': `''` | +| comment | s3 | True | True | 'comment': `''` | +| host | kafka | False | False | 'host': `''` | +| hosts | kafka | False | False | 'hosts': (`''`, ...) 
| +| consumer_properties | kafka | True | True | 'consumer_properties': `''` | +| version | kafka | False | True | 'version': `''` | +| require_static_ip | kafka | True | True | 'require_static_ip': True/False | +| ssl | kafka | True | True | 'ssl': True/False | +| topic_display_filter | kafka | True | True | 'topic_display_filter': `''` | +| topic_display_filters | kafka | True | True | 'topic_display_filter': (`''`, ...) | +| comment | kafka | True | True | 'comment': `''` | +| aws_role | glue_catalog | True | True | 'aws_role': `''` | +| external_id | glue_catalog | True | True | 'external_id': `''` | +| aws_access_key_id | glue_catalog | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | glue_catalog | True | True | 'aws_secret_access_key': `''` | +| default_storage_connection | glue_catalog | False | False | 'default_storage_connection': `''` | +| default_storage_location | glue_catalog | False | False | 'default_storage_location': `''` | +| region | glue_catalog | False | True | 'region': `''` | +| database_display_filter | glue_catalog | True | True | 'database_display_filter': `''` | +| database_display_filters | glue_catalog | True | True | 'database_display_filters': (`''`, ...) | +| comment | glue_catalog | True | True | 'comment': `''` | +| aws_role | kinesis | True | True | 'aws_role': `''` | +| external_id | kinesis | True | True | 'external_id': `''` | +| aws_access_key_id | kinesis | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | kinesis | True | True | 'aws_secret_access_key': `''` | +| region | kinesis | False | False | 'region': `''` | +| read_only | kinesis | False | True | 'read_only': True/False | +| max_writers | kinesis | True | True | 'max_writers': `` | +| stream_display_filter | kinesis | True | True | 'stream_display_filter': `''` | +| stream_display_filters | kinesis | True | True | 'stream_display_filters': (`''`, ...) 
| +| comment | kinesis | True | True | 'comment': `''` | +| connection_string | snowflake | True | False | 'connection_string': `''` | +| user_name | snowflake | True | False | 'user_name': `''` | +| password | snowflake | True | False | 'password': `''` | +| max_concurrent_connections | snowflake | True | True | 'max_concurrent_connections': `` | +| comment | snowflake | True | True | 'comment': `''` | +| connection_string | redshift | True | False | 'connection_string': `''` | +| user_name | redshift | True | False | 'user_name': `''` | +| password | redshift | True | False | 'password': `''` | +| max_concurrent_connections | redshift | True | True | 'max_concurrent_connections': `` | +| comment | redshift | True | True | 'comment': `''` | +| connection_string | mysql | True | False | 'connection_string': `''` | +| user_name | mysql | True | False | 'user_name': `''` | +| password | mysql | True | False | 'password': `''` | +| comment | mysql | True | True | 'comment': `''` | +| connection_string | postgres | True | False | 'connection_string': `''` | +| user_name | postgres | True | False | 'user_name': `''` | +| password | postgres | True | False | 'password': `''` | +| comment | postgres | True | True | 'comment': `''` | +| connection_string | elasticsearch | True | False | 'connection_string': `''` | +| user_name | elasticsearch | True | False | 'user_name': `''` | +| password | elasticsearch | True | False | 'password': `''` | +| comment | elasticsearch | True | True | 'comment': `''` | +| connection_string | mongodb | True | False | 'connection_string': `''` | +| user_name | mongodb | True | False | 'user_name': `''` | +| password | mongodb | True | False | 'password': `''` | +| timeout | mongodb | True | True | 'timeout': "INTERVAL 'N' SECONDS" | +| comment | mongodb | True | True | 'comment': `''` | +| connection_string | mssql | True | False | 'connection_string': `''` | +| user_name | mssql | True | False | 'user_name': `''` | +| password | mssql | True | False | 'password': `''` | +| comment | mssql | True | True | 'comment': `''` | + +## Target options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| globally_unique_keys | datalake | False | True | 'globally_unique_keys': True/False | +| storage_connection | datalake | False | True | 'storage_connection': `''` | +| storage_location | datalake | False | True | 'storage_location': `''` | +| compute_cluster | datalake | True | True | 'compute_cluster': `''` | +| compression | datalake | True | True | 'compression': 'SNAPPY/GZIP' | +| compaction_processes | datalake | True | True | 'compaction_processes': `` | +| disable_compaction | datalake | True | True | 'disable_compaction': True/False | +| retention_date_partition | datalake | False | True | 'retention_date_partition': `''` | +| table_data_retention | datalake | True | True | 'table_data_retention': `''` | +| column_data_retention | datalake | True | True | 'column_data_retention': ({'COLUMN' : `''`,'DURATION': `''`}) | +| comment | datalake | True | True | 'comment': `''` | +| storage_connection | materialized_view | False | True | 'storage_connection': `''` | +| storage_location | materialized_view | False | True | 'storage_location': `''` | +| max_time_travel_duration | materialized_view | True | True | 'max_time_travel_duration': `''` | +| compute_cluster | materialized_view | True | True | 'compute_cluster': `''` | +| column_transformations | snowflake | False | True | 
'column_transformations': {`''` : `''` , ...} | +| deduplicate_with | snowflake | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| exclude_columns | snowflake | False | True | 'exclude_columns': (`''`, ...) | +| create_table_if_missing | snowflake | False | True | 'create_table_if_missing': True/False} | +| run_interval | snowflake | False | True | 'run_interval': `''` | + +## Transformation options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| run_interval | s3 | False | True | 'run_interval': `''` | +| start_from | s3 | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | s3 | True | True | 'end_at': `'/NOW'` | +| compute_cluster | s3 | True | True | 'compute_cluster': `''` | +| comment | s3 | True | True | 'comment': `''` | +| skip_validations | s3 | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | s3 | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | s3 | True | True | 'aggregation_parallelism': `` | +| run_parallelism | s3 | True | True | 'run_parallelism': `` | +| file_format | s3 | False | False | 'file_format': '(type = ``)' | +| compression | s3 | False | True | 'compression': 'SNAPPY/GZIP ...' | +| date_pattern | s3 | False | True | 'date_pattern': `''` | +| output_offset | s3 | False | True | 'output_offset': `''` | +| run_interval | elasticsearch | False | True | 'run_interval': `''` | +| routing_field_name | elasticsearch | True | True | 'routing_field_name': `''` | +| start_from | elasticsearch | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | elasticsearch | True | True | 'end_at': `'/NOW'` | +| compute_cluster | elasticsearch | True | True | 'compute_cluster': `''` | +| skip_validations | elasticsearch | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | elasticsearch | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | elasticsearch | True | True | 'aggregation_parallelism': `` | +| run_parallelism | elasticsearch | True | True | 'run_parallelism': `` | +| bulk_max_size_bytes | elasticsearch | True | True | 'bulk_max_size_bytes': `` | +| index_partition_size | elasticsearch | True | True | 'index_partition_size': 'HOURLY/DAILY ...' | +| comment | elasticsearch | True | True | 'comment': `''` | +| custom_insert_expressions | snowflake | True | True | 'custom_insert_expressions': {'INSERT_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} | +| custom_update_expressions | snowflake | True | True | 'custom_update_expressions': {'UPDATE_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} | +| keep_existing_values_when_null | snowflake | True | True | 'keep_existing_values_when_null': True/False | +| add_missing_columns | snowflake | False | True | 'add_missing_columns': True/False | +| run_interval | snowflake | False | True | 'run_interval': `''` | +| commit_interval | snowflake | True | True | 'commit_interval': `''` | +| start_from | snowflake | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | snowflake | True | True | 'end_at': `'/NOW'` | +| compute_cluster | snowflake | True | True | 'compute_cluster': `''` | +| skip_validations | snowflake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) 
| +| skip_all_validations | snowflake | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | snowflake | True | True | 'aggregation_parallelism': `` | +| run_parallelism | snowflake | True | True | 'run_parallelism': `` | +| comment | snowflake | True | True | 'comment': `''` | +| add_missing_columns | datalake | False | True | 'add_missing_columns': True/False | +| run_interval | datalake | False | True | 'run_interval': `''` | +| start_from | datalake | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | datalake | True | True | 'end_at': `'/NOW'` | +| compute_cluster | datalake | True | True | 'compute_cluster': `''` | +| skip_validations | datalake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | datalake | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | datalake | True | True | 'aggregation_parallelism': `` | +| run_parallelism | datalake | True | True | 'run_parallelism': `` | +| comment | datalake | True | True | 'comment': `''` | +| run_interval | redshift | False | True | 'run_interval': `''` | +| start_from | redshift | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | redshift | True | True | 'end_at': `'/NOW'` | +| compute_cluster | redshift | True | True | 'compute_cluster': `''` | +| skip_validations | redshift | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | redshift | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | redshift | True | True | 'aggregation_parallelism': `` | +| run_parallelism | redshift | True | True | 'run_parallelism': `` | +| skip_failed_files | redshift | False | True | 'skip_failed_files': True/False | +| fail_on_write_error | redshift | False | True | 'fail_on_write_error': True/False | +| comment | redshift | True | True | 'comment': `''` | +| run_interval | postgres | False | True | 'run_interval': `''` | +| start_from | postgres | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | postgres | True | True | 'end_at': `'/NOW'` | +| compute_cluster | postgres | True | True | 'compute_cluster': `''` | +| skip_validations | postgres | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | postgres | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | postgres | True | True | 'aggregation_parallelism': `` | +| run_parallelism | postgres | True | True | 'run_parallelism': `` | +| comment | postgres | True | True | 'comment': `''` | + +## Copy options + +| Option | Storage | Category | Editable | Optional | Config Syntax | +| -------| ---------- | -------- | -------- | -------- | ------------- | +| topic | kafka | source_options | False | False | 'topic': `''` | +| exclude_columns | kafka | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| deduplicate_with | kafka | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| consumer_properties | kafka | job_options | True | True | 'consumer_properties': `''` | +| reader_shards | kafka | job_options | True | True | 'reader_shards': `` | +| store_raw_data | kafka | job_options | False | True | 'store_raw_data': True/False | +| start_from | kafka | job_options | False | True | 'start_from': 'BEGINNING/NOW' | +| end_at | kafka | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | kafka | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | kafka | job_options | True | True | 'run_parallelism': `` | +| content_type | kafka | job_options | True | True | 'content_type': 'AUTO/CSV/...' | +| compression | kafka | job_options | False | True | 'compression': 'AUTO/GZIP/...' | +| column_transformations | kafka | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| commit_interval | kafka | job_options | True | True | 'commit_interval': `''` | +| skip_validations | kafka | job_options | False | True | 'skip_validations': ('MISSING_TOPIC') | +| skip_all_validations | kafka | job_options | False | True | 'skip_all_validations': True/False | +| comment | kafka | job_options | True | True | 'comment': `''` | +| table_include_list | mysql | source_options | True | True | 'table_include_list': (`''`, ...) | +| column_exclude_list | mysql | source_options | True | True | 'column_exclude_list': (`''`, ...) | +| exclude_columns | mysql | job_options | False | True | 'exclude_columns': (`''`, ...) | +| column_transformations | mysql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mysql | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mysql | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mysql | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mysql | job_options | True | True | 'snapshot_parallelism': `` | +| ddl_filters | mysql | job_options | False | True | 'ddl_filters': (`''`, ...) | +| comment | mysql | job_options | True | True | 'comment': `''` | +| table_include_list | postgres | source_options | False | False | 'table_include_list': (`''`, ...) | +| column_exclude_list | postgres | source_options | False | True | 'column_exclude_list': (`''`, ...) | +| heartbeat_table | postgres | job_options | False | True | 'heartbeat_table': `''` | +| skip_snapshots | postgres | job_options | False | True | 'skip_snapshots': True/False | +| publication_name | postgres | job_options | False | False | 'publication_name': `''` | +| end_at | postgres | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | postgres | job_options | True | True | 'compute_cluster': `''` | +| comment | postgres | job_options | True | True | 'comment': `''` | +| parse_json_columns | postgres | job_options | False | False | 'parse_json_columns': True/False | +| column_transformations | postgres | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| snapshot_parallelism | postgres | job_options | True | True | 'snapshot_parallelism': `` | +| exclude_columns | postgres | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| location | s3 | source_options | False | False | 'location': `''` | +| date_pattern | s3 | job_options | False | True | 'date_pattern': `''` | +| file_pattern | s3 | job_options | False | True | 'file_pattern': `''` | +| initial_load_pattern | s3 | job_options | False | True | 'initial_load_pattern': `''` | +| initial_load_prefix | s3 | job_options | False | True | 'initial_load_prefix': `''` | +| delete_files_after_load | s3 | job_options | False | True | 'delete_files_after_load': True/False | +| deduplicate_with | s3 | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| end_at | s3 | job_options | True | True | 'end_at': `'/NOW'` | +| start_from | s3 | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` | +| compute_cluster | s3 | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | s3 | job_options | True | True | 'run_parallelism': `` | +| content_type | s3 | job_options | True | True | 'content_type': 'AUTO/CSV...' | +| compression | s3 | job_options | False | True | 'compression': 'AUTO/GZIP...' | +| comment | s3 | job_options | True | True | 'comment': `''` | +| column_transformations | s3 | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| commit_interval | s3 | job_options | True | True | 'commit_interval': `''` | +| skip_validations | s3 | job_options | False | True | 'skip_validations': ('EMPTY_PATH') | +| skip_all_validations | s3 | job_options | False | True | 'skip_all_validations': True/False | +| exclude_columns | s3 | job_options | False | True | 'exclude_columns': (`''`, ...) | +| stream | kinesis | source_options | False | False | 'stream': `''` | +| reader_shards | kinesis | job_options | True | True | 'reader_shards': `` | +| store_raw_data | kinesis | job_options | False | True | 'store_raw_data': True/False | +| start_from | kinesis | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | kinesis | job_options | False | True | 'end_at': `'/NOW'` | +| compute_cluster | kinesis | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | kinesis | job_options | False | True | 'run_parallelism': `` | +| content_type | kinesis | job_options | True | True | 'content_type': 'AUTO/CSV...' | +| compression | kinesis | job_options | False | True | 'compression': 'AUTO/GZIP...' | +| comment | kinesis | job_options | True | True | 'comment': `''` | +| column_transformations | kinesis | job_options | True | True | 'column_transformations': {`''` : `''` , ...} | +| deduplicate_with | kinesis | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| commit_interval | kinesis | job_options | True | True | 'commit_interval': `''` | +| skip_validations | kinesis | job_options | False | True | 'skip_validations': ('MISSING_STREAM') | +| skip_all_validations | kinesis | job_options | False | True | 'skip_all_validations': True/False | +| exclude_columns | kinesis | job_options | False | True | 'exclude_columns': (`''`, ...) | +| table_include_list | mssql | source_options | True | True | 'table_include_list': (`''`, ...) | +| column_exclude_list | mssql | source_options | True | True | 'column_exclude_list': (`''`, ...) | +| exclude_columns | mssql | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| column_transformations | mssql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mssql | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mssql | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mssql | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mssql | job_options | True | True | 'snapshot_parallelism': `` | +| parse_json_columns | mssql | job_options | False | False | 'parse_json_columns': True/False | +| comment | mssql | job_options | True | True | 'comment': `''` | +| collection_include_list | mongodb | source_options | True | True | 'collection_include_list': (`''`, ...) | +| exclude_columns | mongodb | job_options | False | True | 'exclude_columns': (`''`, ...) | +| column_transformations | mongodb | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mongodb | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mongodb | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mongodb | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mongodb | job_options | True | True | 'snapshot_parallelism': `` | +| comment | mongodb | job_options | True | True | 'comment': `''` | diff --git a/website/docs/reference/resource-configs/where.md b/website/docs/reference/resource-configs/where.md index b0953e6f3d4..dbb3b66e901 100644 --- a/website/docs/reference/resource-configs/where.md +++ b/website/docs/reference/resource-configs/where.md @@ -3,13 +3,6 @@ resource_types: [tests] datatype: string --- - - -* `v0.20.0`: Introduced `where` config -* `v0.21.0`: Introduced `config` property for tests. Reimplemented `where` config with `get_where_subquery` macro - - - ### Definition Filter the resource being tested (model, source, seed, or snapshot). diff --git a/website/docs/reference/resource-properties/access.md b/website/docs/reference/resource-properties/access.md deleted file mode 100644 index 42b9893ed7f..00000000000 --- a/website/docs/reference/resource-properties/access.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -resource_types: [models] -datatype: access -required: no ---- - -:::info New functionality -This functionality is new in v1.5. -::: - - - -```yml -version: 2 - -models: - - name: model_name - access: private | protected | public -``` - - - -Access modifiers may be applied to models one-by-one in YAML properties. It is not currently possible to configure `access` for multiple models at once. A group or subfolder contains models with a variety of access levels, and designating a model with `access: public` should always be a conscious and intentional choice. - -## Definition -The access level of the model you are declaring properties for. - -Some models (not all) are designed to be referenced through the [ref](/reference/dbt-jinja-functions/ref) function across [groups](/docs/build/groups). - -| Access | Referenceable by | -|-----------|-------------------------------| -| private | same group | -| protected | same project/package | -| public | any group, package or project | - -If you try to reference a model outside of its supported access, you will see an error: - -```shell -dbt run -s marketing_model -... -dbt.exceptions.DbtReferenceError: Parsing Error - Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, - which is not allowed because the referenced node is private to the finance group. 
-``` - -## Default - -By default, all models are "protected." This means that other models in the same project can reference them. - -## Related docs - -* [Model Access](/docs/collaborate/govern/model-access#groups) -* [Group configuration](/reference/resource-configs/group) diff --git a/website/docs/reference/resource-properties/config.md b/website/docs/reference/resource-properties/config.md index 32143c1da07..55d2f64d9ff 100644 --- a/website/docs/reference/resource-properties/config.md +++ b/website/docs/reference/resource-properties/config.md @@ -16,6 +16,7 @@ datatype: "{dictionary}" { label: 'Sources', value: 'sources', }, { label: 'Metrics', value: 'metrics', }, { label: 'Exposures', value: 'exposures', }, + { label: 'Semantic models', value: 'semantic models', }, ] }> @@ -108,13 +109,6 @@ version: 2 - - -We have added support for the `config` property on sources in dbt Core v1.1 - - - - @@ -133,8 +127,6 @@ sources: - - @@ -191,6 +183,36 @@ exposures: + + + + +Support for the `config` property on `semantic_models` was added in dbt Core v1.7 + + + + + + + +```yml +version: 2 + +semantic_models: + - name: + config: + enabled: true | false + group: + meta: {dictionary} +``` + + + + + + + +## Definition The `config` property allows you to configure resources at the same time you're defining properties in YAML files. diff --git a/website/docs/reference/resource-properties/constraints.md b/website/docs/reference/resource-properties/constraints.md index 51e10e028ab..9a5d513d99b 100644 --- a/website/docs/reference/resource-properties/constraints.md +++ b/website/docs/reference/resource-properties/constraints.md @@ -20,7 +20,7 @@ Constraints require the declaration and enforcement of a model [contract](/refer Constraints may be defined for a single column, or at the model level for one or more columns. As a general rule, we recommend defining single-column constraints directly on those columns. The structure of a constraint is: -- `type` (required): one of `not_null`, `primary_key`, `foreign_key`, `check`, `custom` +- `type` (required): one of `not_null`, `unique`, `primary_key`, `foreign_key`, `check`, `custom` - `expression`: Free text input to qualify the constraint. Required for certain constraint types, and optional for others. - `name` (optional): Human-friendly name for this constraint. Supported by some data platforms. - `columns` (model-level only): List of column names to apply the constraint over @@ -53,6 +53,9 @@ models: # column-level constraints constraints: - type: not_null + - type: unique + - type: foreign_key + expression: . () - type: ... ``` @@ -100,7 +103,7 @@ models: contract: enforced: true columns: - - name: customer_id + - name: id data_type: int constraints: - type: not_null @@ -350,7 +353,62 @@ models:
      -Expected DDL to enforce constraints: +### Column-level constraint on nested column: + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 'string' as a, + struct( + 1 as id, + 'name' as name, + struct(2 as id, struct('test' as again, '2' as even_more) as another) as double_nested + ) as b +``` + + + + + +```yml +version: 2 + +models: + - name: nested_column_constraints_example + config: + contract: + enforced: true + columns: + - name: a + data_type: string + - name: b.id + data_type: integer + constraints: + - type: not_null + - name: b.name + description: test description + data_type: string + - name: b.double_nested.id + data_type: integer + - name: b.double_nested.another.again + data_type: string + - name: b.double_nested.another.even_more + data_type: integer + constraints: + - type: not_null +``` + + + +### Expected DDL to enforce constraints: + ```sql diff --git a/website/docs/reference/resource-properties/freshness.md b/website/docs/reference/resource-properties/freshness.md index ae39a764cc1..f332f5a1b8f 100644 --- a/website/docs/reference/resource-properties/freshness.md +++ b/website/docs/reference/resource-properties/freshness.md @@ -88,13 +88,6 @@ This is particularly useful if: - You are using Snowflake, Databricks or Spark with large tables, and this results in a performance benefit - - -* `v0.15.0`: This property was introduced - - - - ## Examples ### Complete example diff --git a/website/docs/reference/resource-properties/quote.md b/website/docs/reference/resource-properties/quote.md index 3552d1d3d3a..50bf4c08c40 100644 --- a/website/docs/reference/resource-properties/quote.md +++ b/website/docs/reference/resource-properties/quote.md @@ -115,12 +115,6 @@ analyses: ## Definition The `quote` field can be used to enable or disable quoting for column names. - - -* `v0.16.0`: This configuration was added - - - ## Default The default quoting value is `false` diff --git a/website/docs/reference/resource-properties/tests.md b/website/docs/reference/resource-properties/tests.md index f25e5306542..0fe86ccc57d 100644 --- a/website/docs/reference/resource-properties/tests.md +++ b/website/docs/reference/resource-properties/tests.md @@ -298,9 +298,7 @@ models: -Check out the guide on writing a [custom generic test](/guides/best-practices/writing-custom-generic-tests) for more information. - - +Check out the guide on writing a [custom generic test](/best-practices/writing-custom-generic-tests) for more information. ### Custom test name @@ -438,10 +436,6 @@ $ dbt test **If using [`store_failures`](/reference/resource-configs/store_failures):** dbt uses each test's name as the name of the table in which to store any failing records. If you have defined a custom name for one test, that custom name will also be used for its table of failures. You may optionally configure an [`alias`](/reference/resource-configs/alias) for the test, to separately control both the name of the test (for metadata) and the name of its database table (for storing failures). - - - - ### Alternative format for defining tests When defining a generic test with several arguments and configurations, the YAML can look and feel unwieldy. If you find it easier, you can define the same test properties as top-level keys of a single dictionary, by providing the test name as `test_name` instead. It's totally up to you. 
@@ -470,5 +464,3 @@ models: ``` - - diff --git a/website/docs/reference/resource-properties/versions.md b/website/docs/reference/resource-properties/versions.md index 7e107ff31e3..86e9abf34a8 100644 --- a/website/docs/reference/resource-properties/versions.md +++ b/website/docs/reference/resource-properties/versions.md @@ -2,8 +2,12 @@ resource_types: [models] datatype: list required: no +keyword: governance, model version, model versioning, dbt model versioning --- +import VersionsCallout from '/snippets/_version-callout.md'; + + @@ -61,3 +65,62 @@ Note that the value of `defined_in` and the `alias` configuration of a model are - Follow a consistent naming convention for model versions and aliases. - Use `defined_in` and `alias` only if you have good reason. - Create a view that always points to the latest version of your model. You can automate this for all versioned models in your project with an `on-run-end` hook. For more details, read the full docs on ["Model versions"](/docs/collaborate/govern/model-versions#configuring-database-location-with-alias) + +### Detecting breaking changes + +When you use the `state:modified` selection method in Slim CI, dbt will detect changes to versioned model contracts, and raise an error if any of those changes could be breaking for downstream consumers. + +Breaking changes include: +- Removing an existing column +- Changing the `data_type` of an existing column +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) +- Changing unversioned, contracted models. + - dbt also warns if a model has or had a contract but isn't versioned + + + + + +``` + Breaking Change to Unversioned Contract for contracted_model (models/contracted_models/contracted_model.sql) + While comparing to previous project state, dbt detected a breaking change to an unversioned model. + - Contract enforcement was removed: Previously, this model's configuration included contract: {enforced: true}. It is no longer configured to enforce its contract, and this is a breaking change. + - Columns were removed: + - color + - date_day + - Enforced column level constraints were removed: + - id (ConstraintType.not_null) + - id (ConstraintType.primary_key) + - Enforced model level constraints were removed: + - ConstraintType.check -> ['id'] + - Materialization changed with enforced constraints: + - table -> view +``` + + + + +``` +Breaking Change to Contract Error in model sometable (models/sometable.sql) + While comparing to previous project state, dbt detected a breaking change to an enforced contract. + + The contract's enforcement has been disabled. + + Columns were removed: + - order_name + + Columns with data_type changes: + - order_id (number -> int) + + Consider making an additive (non-breaking) change instead, if possible. 
+ Otherwise, create a new model version: https://docs.getdbt.com/docs/collaborate/govern/model-versions +``` + + + + + +Additive changes are **not** considered breaking: +- Adding a new column to a contracted model +- Adding new `constraints` to an existing column in a contracted model diff --git a/website/docs/reference/seed-configs.md index d74f414cbfe..429aa9444ae 100644 --- a/website/docs/reference/seed-configs.md +++ b/website/docs/reference/seed-configs.md @@ -23,6 +23,7 @@ seeds: [](/reference/resource-configs/resource-path): [+](/reference/resource-configs/plus-prefix)[quote_columns](/reference/resource-configs/quote_columns): true | false [+](/reference/resource-configs/plus-prefix)[column_types](/reference/resource-configs/column_types): {column_name: datatype} + [+](/reference/resource-configs/plus-prefix)[delimiter](/reference/resource-configs/delimiter): ``` @@ -43,6 +44,7 @@ seeds: config: [quote_columns](/reference/resource-configs/quote_columns): true | false [column_types](/reference/resource-configs/column_types): {column_name: datatype} + [delimiter](/reference/resource-configs/delimiter): ``` diff --git a/website/docs/reference/seed-properties.md index d8b72737646..85e7be21ae1 100644 --- a/website/docs/reference/seed-properties.md +++ b/website/docs/reference/seed-properties.md @@ -2,12 +2,6 @@ title: Seed properties --- - - - **v1.0.0:** The default path for [`seed-paths`](/reference/project-configs/seed-paths) (formerly `data-paths`) is now `seeds`. - - - Seed properties can be declared in `.yml` files under a `seed` key. We recommend that you put them in the `seeds/` directory. You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within that directory. @@ -42,9 +36,3 @@ seeds: - name: ... # declare properties of additional seeds ``` - - - -* `v0.16.0`: The ability to declare seed properties was introduced. Prior to this, you could declare seed properties under the `models:` key (confusing, right?). Support for declaring seed properties under a `models:` key will be removed in a future release. - - diff --git a/website/docs/reference/snapshot-properties.md index 48c5328a400..301747e9325 100644 --- a/website/docs/reference/snapshot-properties.md +++ b/website/docs/reference/snapshot-properties.md @@ -40,9 +40,3 @@ snapshots: ``` - - - -* `v0.16.0`: The ability to declare snapshot properties was introduced. - - diff --git a/website/docs/reference/snowflake-permissions.md deleted file mode 100644 index 6a469d12230..00000000000 --- a/website/docs/reference/snowflake-permissions.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "Snowflake Permissions" ---- - -## Example Snowflake permissions - -``` --- NOTE: warehouse_name, database_name, and role_name are placeholders! --- Replace as-needed for your organization's naming convention!
- -grant all on warehouse warehouse_name to role role_name; -grant usage on database database_name to role role_name; -grant create schema on database database_name to role role_name; -grant usage on schema database.an_existing_schema to role role_name; -grant create table on schema database.an_existing_schema to role role_name; -grant create view on schema database.an_existing_schema to role role_name; -grant usage on future schemas in database database_name to role role_name; -grant monitor on future schemas in database database_name to role role_name; -grant select on future tables in database database_name to role role_name; -grant select on future views in database database_name to role role_name; -grant usage on all schemas in database database_name to role role_name; -grant monitor on all schemas in database database_name to role role_name; -grant select on all tables in database database_name to role role_name; -grant select on all views in database database_name to role role_name; -``` diff --git a/website/docs/reference/source-configs.md b/website/docs/reference/source-configs.md index ef428f5934c..7e8a547489a 100644 --- a/website/docs/reference/source-configs.md +++ b/website/docs/reference/source-configs.md @@ -1,5 +1,5 @@ --- -title: "About source configurations" +title: Source configurations description: "Learn how to use source configurations in dbt." id: source-configs --- @@ -37,8 +37,6 @@ sources: - - ```yaml @@ -57,27 +55,31 @@ sources: - - ## Configuring sources - - -Sources can be configured via a `config:` block within their `.yml` definitions, or from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](/docs/build/packages). You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. +Sources can be configured via a `config:` block within their `.yml` definitions, or from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](/docs/build/packages). - +You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. - +- **Note**: To disable a source table nested in a YAML file in a subfolder, you will need to supply the subfolder(s) within the path to that YAML file, as well as the source name and the table name in the `dbt_project.yml` file.

      + The following example shows how to disable a source table nested in a YAML file in a subfolder: -Sources can be configured from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](package-management). You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. + -Unlike other resource types, sources do not yet support a `config` property. It is not possible to (re)define source configs hierarchically across multiple YAML files. + ```yaml + sources: + your_project_name: + subdirectory_name: + source_name: + source_table_name: + +enabled: false + ``` + -
      ### Examples #### Disable all sources imported from a package @@ -97,8 +99,6 @@ sources: - - #### Conditionally enable a single source When defining a source, you can disable the entire source, or specific source tables, using the inline `config` property: @@ -138,8 +138,6 @@ sources: - - #### Disable a single source from a package To disable a specific source from another package, qualify the resource path for your configuration with both a package name and a source name. In this case, we're disabling the `clickstream` source from the `events` package. diff --git a/website/docs/reference/source-properties.md b/website/docs/reference/source-properties.md index d20ef5f2877..d107881967e 100644 --- a/website/docs/reference/source-properties.md +++ b/website/docs/reference/source-properties.md @@ -1,5 +1,5 @@ --- -title: "About source properties" +title: "Source properties" description: "Learn how to use source properties in dbt." --- @@ -8,9 +8,13 @@ description: "Learn how to use source properties in dbt." - [Declaring resource properties](/reference/configs-and-properties) ## Overview -Source properties can be declared in `.yml` files in your `models/` directory (as defined by the [`model-paths` config](/reference/project-configs/model-paths)). -You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `models/` directory. +import PropsCallout from '/snippets/_config-prop-callout.md'; + +Source properties can be declared in any `properties.yml` file in your `models/` directory (as defined by the [`model-paths` config](/reference/project-configs/model-paths)).
      + + +You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `models/` directory: diff --git a/website/docs/sql-reference/aggregate-functions/sql-array-agg.md b/website/docs/sql-reference/aggregate-functions/sql-array-agg.md index 430be4b4316..a6f508a7bef 100644 --- a/website/docs/sql-reference/aggregate-functions/sql-array-agg.md +++ b/website/docs/sql-reference/aggregate-functions/sql-array-agg.md @@ -59,4 +59,4 @@ Looking at the query results—this makes sense! We’d expect newer orders to l There are definitely too many use cases to list out for using the ARRAY_AGG function in your dbt models, but it’s very likely that ARRAY_AGG is used pretty downstream in your since you likely don’t want your data so bundled up earlier in your DAG to improve modularity and dryness. A few downstream use cases for ARRAY_AGG: - In [`export_` models](https://www.getdbt.com/open-source-data-culture/reverse-etl-playbook) that are used to send data to platforms using a tool to pair down multiple rows into a single row. Some downstream platforms, for example, require certain values that we’d usually keep as separate rows to be one singular row per customer or user. ARRAY_AGG is handy to bring multiple column values together by a singular id, such as creating an array of all items a user has ever purchased and sending that array downstream to an email platform to create a custom email campaign. -- Similar to export models, you may see ARRAY_AGG used in [mart tables](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) to create final aggregate arrays per a singular dimension; performance concerns of ARRAY_AGG in these likely larger tables can potentially be bypassed with use of [incremental models in dbt](https://docs.getdbt.com/docs/build/incremental-models). +- Similar to export models, you may see ARRAY_AGG used in [mart tables](/best-practices/how-we-structure/4-marts) to create final aggregate arrays per a singular dimension; performance concerns of ARRAY_AGG in these likely larger tables can potentially be bypassed with use of [incremental models in dbt](/docs/build/incremental-models). diff --git a/website/docs/sql-reference/aggregate-functions/sql-avg.md b/website/docs/sql-reference/aggregate-functions/sql-avg.md index d7d2fccc3c4..d1dba119292 100644 --- a/website/docs/sql-reference/aggregate-functions/sql-avg.md +++ b/website/docs/sql-reference/aggregate-functions/sql-avg.md @@ -48,7 +48,7 @@ Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the abil ## AVG function use cases We most commonly see the AVG function used in data work to calculate: -- The average of key metrics (ex. Average CSAT, average lead time, average order amount) in downstream [fact or dim models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) +- The average of key metrics (ex. Average CSAT, average lead time, average order amount) in downstream [fact or dim models](/best-practices/how-we-structure/4-marts) - Rolling or moving averages (ex. 
7-day, 30-day averages for key metrics) using window functions - Averages in [dbt metrics](https://docs.getdbt.com/docs/build/metrics) diff --git a/website/docs/sql-reference/aggregate-functions/sql-round.md b/website/docs/sql-reference/aggregate-functions/sql-round.md index 053a2ebdd8e..bc9669e22cb 100644 --- a/website/docs/sql-reference/aggregate-functions/sql-round.md +++ b/website/docs/sql-reference/aggregate-functions/sql-round.md @@ -57,7 +57,7 @@ Google BigQuery, Amazon Redshift, Snowflake, and Databricks all support the abil ## ROUND function use cases -If you find yourself rounding numeric data, either in data models or ad-hoc analyses, you’re probably rounding to improve the readability and usability of your data using downstream [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) or [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts). Specifically, you’ll likely use the ROUND function to: +If you find yourself rounding numeric data, either in data models or ad-hoc analyses, you’re probably rounding to improve the readability and usability of your data using downstream [intermediate](/best-practices/how-we-structure/3-intermediate) or [mart models](/best-practices/how-we-structure/4-marts). Specifically, you’ll likely use the ROUND function to: - Make numeric calculations using division or averages a little cleaner and easier to understand - Create concrete buckets of data for a cleaner distribution of values during ad-hoc analysis diff --git a/website/docs/sql-reference/clauses/sql-limit.md b/website/docs/sql-reference/clauses/sql-limit.md index 74cc2e12123..a02b851e37d 100644 --- a/website/docs/sql-reference/clauses/sql-limit.md +++ b/website/docs/sql-reference/clauses/sql-limit.md @@ -51,7 +51,7 @@ This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle After ensuring that this is the result you want from this query, you can omit the LIMIT in your final data model. :::tip Save money and time by limiting data in development -You could limit your data used for development by manually adding a LIMIT statement, a WHERE clause to your query, or by using a [dbt macro to automatically limit data based](https://docs.getdbt.com/guides/legacy/best-practices#limit-the-data-processed-when-in-development) on your development environment to help reduce your warehouse usage during dev periods. +You could limit your data used for development by manually adding a LIMIT statement, a WHERE clause to your query, or by using a [dbt macro to automatically limit data based](/best-practices/best-practice-workflows#limit-the-data-processed-when-in-development) on your development environment to help reduce your warehouse usage during dev periods. 
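+
+As a minimal sketch of that macro-style approach (the `orders` model name and the row count below are just illustrative), you can wrap the LIMIT in a Jinja conditional so it only applies when you run against your development target:
+
+```sql
+select * from {{ ref('orders') }}
+
+-- only limit rows when running against the dev target
+{% if target.name == 'dev' %}
+limit 500
+{% endif %}
+```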
::: ## LIMIT syntax in Snowflake, Databricks, BigQuery, and Redshift diff --git a/website/docs/sql-reference/clauses/sql-order-by.md b/website/docs/sql-reference/clauses/sql-order-by.md index 660794adc14..d18946d0d16 100644 --- a/website/docs/sql-reference/clauses/sql-order-by.md +++ b/website/docs/sql-reference/clauses/sql-order-by.md @@ -57,7 +57,7 @@ Since the ORDER BY clause is a SQL fundamental, data warehouses, including Snowf ## ORDER BY use cases We most commonly see the ORDER BY clause used in data work to: -- Analyze data for both initial exploration of raw data sources and ad hoc querying of [mart datasets](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) +- Analyze data for both initial exploration of raw data sources and ad hoc querying of [mart datasets](/best-practices/how-we-structure/4-marts) - Identify the top 5/10/50/100 of a dataset when used in pair with a [LIMIT](/sql-reference/limit) - (For Snowflake) Optimize the performance of large incremental models that use both a `cluster_by` [configuration](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#using-cluster_by) and ORDER BY statement - Control the ordering of window function partitions (ex. `row_number() over (partition by user_id order by updated_at)`) diff --git a/website/docs/sql-reference/joins/sql-inner-join.md b/website/docs/sql-reference/joins/sql-inner-join.md index 0cf8a3894bd..951e3675bc7 100644 --- a/website/docs/sql-reference/joins/sql-inner-join.md +++ b/website/docs/sql-reference/joins/sql-inner-join.md @@ -66,5 +66,5 @@ Because there’s no `user_id` = 4 in Table A and no `user_id` = 2 in Table B, r ## SQL inner join use cases -There are probably countless scenarios where you’d want to inner join multiple tables together—perhaps you have some really nicely structured tables with the exact same primary keys that should really just be one larger, wider table or you’re joining two tables together don’t want any null or missing column values if you used a left or right join—it’s all pretty dependent on your source data and end use cases. Where you will not (and should not) see inner joins is in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) that are used to clean and prep raw source data for analytics uses. Any joins in your dbt projects should happen further downstream in [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) to improve modularity and DAG cleanliness. +There are probably countless scenarios where you’d want to inner join multiple tables together—perhaps you have some really nicely structured tables with the exact same primary keys that should really just be one larger, wider table or you’re joining two tables together don’t want any null or missing column values if you used a left or right join—it’s all pretty dependent on your source data and end use cases. Where you will not (and should not) see inner joins is in [staging models](/best-practices/how-we-structure/2-staging) that are used to clean and prep raw source data for analytics uses. Any joins in your dbt projects should happen further downstream in [intermediate](/best-practices/how-we-structure/3-intermediate) and [mart models](/best-practices/how-we-structure/4-marts) to improve modularity and DAG cleanliness. 
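+
+As a rough sketch of that pattern (the model and column names here are hypothetical), an inner join would typically live in an intermediate model that joins two staging models on a shared key:
+
+```sql
+-- int_customer_orders.sql: joins happen downstream of staging models
+select
+    customers.customer_id,
+    orders.order_id
+from {{ ref('stg_customers') }} as customers
+inner join {{ ref('stg_orders') }} as orders
+    on customers.customer_id = orders.customer_id
+```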
diff --git a/website/docs/sql-reference/joins/sql-left-join.md b/website/docs/sql-reference/joins/sql-left-join.md index 841edc41cdd..914f83bb7e3 100644 --- a/website/docs/sql-reference/joins/sql-left-join.md +++ b/website/docs/sql-reference/joins/sql-left-join.md @@ -73,4 +73,4 @@ Left joins are a fundamental in data modeling and analytics engineering work—t Something to note if you use left joins: if there are multiple records for an individual key in the left join database object, be aware that duplicates can potentially be introduced in the final query result. This is where dbt tests, such as testing for uniqueness and [equal row count](https://github.com/dbt-labs/dbt-utils#equal_rowcount-source) across upstream source tables and downstream child models, can help you identify faulty data modeling logic and improve data quality. ::: -Where you will not (and should not) see left joins is in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) that are used to clean and prep raw source data for analytics uses. Any joins in your dbt projects should happen further downstream in [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) to improve modularity and cleanliness. \ No newline at end of file +Where you will not (and should not) see left joins is in [staging models](/best-practices/how-we-structure/2-staging) that are used to clean and prep raw source data for analytics uses. Any joins in your dbt projects should happen further downstream in [intermediate](/best-practices/how-we-structure/3-intermediate) and [mart models](/best-practices/how-we-structure/4-marts) to improve modularity and cleanliness. diff --git a/website/docs/sql-reference/joins/sql-self-join.md b/website/docs/sql-reference/joins/sql-self-join.md index 0eef0fcab7c..6d9a7d3261e 100644 --- a/website/docs/sql-reference/joins/sql-self-join.md +++ b/website/docs/sql-reference/joins/sql-self-join.md @@ -66,6 +66,6 @@ This query utilizing a self join adds the `parent_name` of skus that have non-nu ## SQL self join use cases -Again, self joins are probably rare in your dbt project and will most often be utilized in tables that contain a hierarchical structure, such as consisting of a column which is a foreign key to the primary key of the same table. If you do have use cases for self joins, such as in the example above, you’ll typically want to perform that self join early upstream in your , such as in a [staging](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) or [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) model; if your raw, unjoined table is going to need to be accessed further downstream sans self join, that self join should happen in a modular intermediate model. +Again, self joins are probably rare in your dbt project and will most often be utilized in tables that contain a hierarchical structure, such as consisting of a column which is a foreign key to the primary key of the same table. 
If you do have use cases for self joins, such as in the example above, you’ll typically want to perform that self join early upstream in your , such as in a [staging](/best-practices/how-we-structure/2-staging) or [intermediate](/best-practices/how-we-structure/3-intermediate) model; if your raw, unjoined table is going to need to be accessed further downstream sans self join, that self join should happen in a modular intermediate model. -You can also use self joins to create a cartesian product (aka a cross join) of a table against itself. Again, slim use cases, but still there for you if you need it 😉 \ No newline at end of file +You can also use self joins to create a cartesian product (aka a cross join) of a table against itself. Again, slim use cases, but still there for you if you need it 😉 diff --git a/website/docs/sql-reference/operators/sql-not.md b/website/docs/sql-reference/operators/sql-not.md index e9156cb9720..fcfa7627c0b 100644 --- a/website/docs/sql-reference/operators/sql-not.md +++ b/website/docs/sql-reference/operators/sql-not.md @@ -55,4 +55,4 @@ This simple query using the sample dataset [Jaffle Shop’s](https://github.com/ ## NOT operator example use cases -There are probably many scenarios where you’d want to use the NOT operators in your WHERE clauses or case statements, but we commonly see NOT operators used to remove nulls or boolean-identifed deleted rows in source data in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). This removal of unnecessary rows can potentially help the performance of downstream [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts). \ No newline at end of file +There are probably many scenarios where you’d want to use the NOT operators in your WHERE clauses or case statements, but we commonly see NOT operators used to remove nulls or boolean-identifed deleted rows in source data in [staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging). This removal of unnecessary rows can potentially help the performance of downstream [intermediate](https://docs.getdbt.com/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/best-practices/how-we-structure/4-marts). diff --git a/website/docs/sql-reference/other/sql-cast.md b/website/docs/sql-reference/other/sql-cast.md index cf24a12706e..9d41400e825 100644 --- a/website/docs/sql-reference/other/sql-cast.md +++ b/website/docs/sql-reference/other/sql-cast.md @@ -50,7 +50,7 @@ After running this query, the `orders` table will look a little something like t Let’s be clear: the resulting data from this query looks exactly the same as the upstream `orders` model. However, the `order_id` and `customer_id` fields are now strings, meaning you could easily concat different string variables to them. -> Casting columns to their appropriate types typically happens in our dbt project’s [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). A few reasons for that: data cleanup and standardization, such as aliasing, casting, and lower or upper casing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. +> Casting columns to their appropriate types typically happens in our dbt project’s [staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging). 
A few reasons for that: data cleanup and standardization, such as aliasing, casting, and lower or upper casing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. ## SQL CAST function syntax in Snowflake, Databricks, BigQuery, and Redshift @@ -66,4 +66,4 @@ You know at one point you’re going to need to cast a column to a different dat - tools [defaulting to certain data types](https://airbytehq.github.io/integrations/sources/google-sheets/) - BI tools require certain fields to be specific data types -A key thing to remember when you’re casting data is the user experience in your end BI tool: are business users expecting `customer_id` to be filtered on 1 or '1'? What is more intuitive for them? If one `id` field is an integer, all `id` fields should be integers. Just like all data modeling, consistency and standardization is key when determining when and what to cast. \ No newline at end of file +A key thing to remember when you’re casting data is the user experience in your end BI tool: are business users expecting `customer_id` to be filtered on 1 or '1'? What is more intuitive for them? If one `id` field is an integer, all `id` fields should be integers. Just like all data modeling, consistency and standardization is key when determining when and what to cast. diff --git a/website/docs/sql-reference/other/sql-comments.md b/website/docs/sql-reference/other/sql-comments.md index 811f2b4339e..7fe5e970a85 100644 --- a/website/docs/sql-reference/other/sql-comments.md +++ b/website/docs/sql-reference/other/sql-comments.md @@ -53,7 +53,7 @@ We recommend leveraging inline comments in the following situations: - Explain complex code logic that if you had to scratch your head at, someone else will have to scratch their head at - Explain niche, unique-to-your-business logic -- Separate out field types (ex. Ids, booleans, strings, dates, numerics, and timestamps) in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) to create more readable, organized, and formulaic models +- Separate out field types (ex. Ids, booleans, strings, dates, numerics, and timestamps) in [staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging) to create more readable, organized, and formulaic models - Clearly label tech debt (`-- [TODO]: TECH DEBT`) in queries or models diff --git a/website/docs/sql-reference/statements/sql-select.md b/website/docs/sql-reference/statements/sql-select.md index 49132524096..0b914d9c1da 100644 --- a/website/docs/sql-reference/statements/sql-select.md +++ b/website/docs/sql-reference/statements/sql-select.md @@ -42,8 +42,8 @@ You may also commonly see queries that `select * from table_name`. The asterisk Leverage [dbt utils’ star macro](/blog/star-sql-love-letter) to be able to both easily select many and specifically exclude certain columns. ::: -In a dbt project, analytics engineers will typically write models that contain multiple CTEs that build to one greater query. For folks that are newer to analytics engineering or dbt, we recommend they check out the [“How we structure our dbt projects” guide](/guides/best-practices/how-we-structure/1-guide-overview) to better understand why analytics folks like modular data modeling and CTEs. +In a dbt project, analytics engineers will typically write models that contain multiple CTEs that build to one greater query. 
For folks that are newer to analytics engineering or dbt, we recommend they check out the [“How we structure our dbt projects” guide](/best-practices/how-we-structure/1-guide-overview) to better understand why analytics folks like modular data modeling and CTEs. ## SELECT statement syntax in Snowflake, Databricks, BigQuery, and Redshift -While we know the data warehouse players like to have their own slightly different flavors and syntax for SQL, they have conferred together that the SELECT statement is sacred and unchangeable. As a result, writing the actual `select…from` statement across Snowflake, Databricks, Google BigQuery, and Amazon Redshift would look the same. However, the actual SQL manipulation of data within the SELECT statement (ex. adding dates, casting columns) might look slightly different between each data warehouse. \ No newline at end of file +While we know the data warehouse players like to have their own slightly different flavors and syntax for SQL, they have conferred together that the SELECT statement is sacred and unchangeable. As a result, writing the actual `select…from` statement across Snowflake, Databricks, Google BigQuery, and Amazon Redshift would look the same. However, the actual SQL manipulation of data within the SELECT statement (ex. adding dates, casting columns) might look slightly different between each data warehouse. diff --git a/website/docs/sql-reference/string-functions/sql-lower.md b/website/docs/sql-reference/string-functions/sql-lower.md index 8c8622bb77a..7b1a5a2c2b3 100644 --- a/website/docs/sql-reference/string-functions/sql-lower.md +++ b/website/docs/sql-reference/string-functions/sql-lower.md @@ -54,7 +54,7 @@ After running this query, the `customers` table will look a little something lik Now, all characters in the `first_name` and `last_name` columns are lowercase. -> Changing all string columns to lowercase to create uniformity across data sources typically happens in our [dbt project’s staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lowercasing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. +> Changing all string columns to lowercase to create uniformity across data sources typically happens in our [dbt project’s staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lowercasing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. ## SQL LOWER function syntax in Snowflake, Databricks, BigQuery, and Redshift diff --git a/website/docs/sql-reference/string-functions/sql-trim.md b/website/docs/sql-reference/string-functions/sql-trim.md index ad54a015437..b9555feb630 100644 --- a/website/docs/sql-reference/string-functions/sql-trim.md +++ b/website/docs/sql-reference/string-functions/sql-trim.md @@ -50,4 +50,4 @@ In this query, you’re adding superfluous asterisks to a string using the [CONC ## TRIM function use cases -If string values in your raw data have extra white spaces or miscellaneous characters, you’ll leverage the TRIM (and subset RTRIM AND LTRIM) functions to help you quickly remove them. 
You’ll likely do this cleanup in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging), where you’re probably standardizing casing and doing other minor formatting changes to string values, so you can use a clean and consistent format across your downstream models. +If string values in your raw data have extra white spaces or miscellaneous characters, you’ll leverage the TRIM (and subset RTRIM AND LTRIM) functions to help you quickly remove them. You’ll likely do this cleanup in [staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging), where you’re probably standardizing casing and doing other minor formatting changes to string values, so you can use a clean and consistent format across your downstream models. diff --git a/website/docs/sql-reference/string-functions/sql-upper.md b/website/docs/sql-reference/string-functions/sql-upper.md index cf7694f8e46..a505537ac5d 100644 --- a/website/docs/sql-reference/string-functions/sql-upper.md +++ b/website/docs/sql-reference/string-functions/sql-upper.md @@ -46,7 +46,7 @@ After running this query, the `customers` table will look a little something lik Now, all characters in the `first_name` are uppercase (and `last_name` are unchanged). -> Changing string columns to uppercase to create uniformity across data sources typically happens in our [dbt project’s staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lower or upper casing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. +> Changing string columns to uppercase to create uniformity across data sources typically happens in our [dbt project’s staging models](https://docs.getdbt.com/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lower or upper casing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. ## SQL UPPER function syntax in Snowflake, Databricks, BigQuery, and Redshift diff --git a/website/docs/terms/cte.md b/website/docs/terms/cte.md index d4a4bb15915..f67480325b4 100644 --- a/website/docs/terms/cte.md +++ b/website/docs/terms/cte.md @@ -66,7 +66,7 @@ When people talk about how CTEs can simplify your queries, they specifically mea #### Establish Structure -In leveraging CTEs, you can break complex code into smaller segments, ultimately helping provide structure to your code. At dbt Labs, we often like to use the [import, logical, and final structure](/guides/migration/tools/refactoring-legacy-sql#implement-cte-groupings) for CTEs which creates a predictable and organized structure to your dbt models. +In leveraging CTEs, you can break complex code into smaller segments, ultimately helping provide structure to your code. At dbt Labs, we often like to use the [import, logical, and final structure](/guides/refactoring-legacy-sql?step=5#implement-cte-groupings) for CTEs which creates a predictable and organized structure to your dbt models. #### Easily identify dependencies @@ -181,7 +181,7 @@ CTEs are essentially temporary views that can be used throughout a query. 
They a If you’re interested in reading more about CTE best practices, check out some of our favorite content around model refactoring and style: -- [Refactoring Legacy SQL to dbt](/guides/migration/tools/refactoring-legacy-sql#implement-cte-groupings) +- [Refactoring Legacy SQL to dbt](/guides/refactoring-legacy-sql?step=5#implement-cte-groupings) - [dbt Labs Style Guide](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md#ctes) - [Modular Data Modeling Technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) diff --git a/website/docs/terms/dag.md b/website/docs/terms/dag.md index f4247c785a4..c6b91300bfc 100644 --- a/website/docs/terms/dag.md +++ b/website/docs/terms/dag.md @@ -65,7 +65,7 @@ See the DAG above? It follows a more traditional approach to data modeling where Instead, there are some key elements that can help you create a more streamlined DAG and [modular data models](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/): -- Leveraging [staging, intermediate, and mart layers](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) to create layers of distinction between sources and transformed data +- Leveraging [staging, intermediate, and mart layers](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) to create layers of distinction between sources and transformed data - Abstracting code that’s used across multiple models to its own model - Joining on surrogate keys versus on multiple values @@ -106,6 +106,6 @@ A Directed acyclic graph (DAG) is a visual representation of your data models an Ready to restructure (or create your first) DAG? Check out some of the resources below to better understand data modularity, data lineage, and how dbt helps bring it all together: - [Data modeling techniques for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -- [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) +- [How we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) - [How to audit your DAG](https://www.youtube.com/watch?v=5W6VrnHVkCA) -- [Refactoring legacy SQL to dbt](/guides/migration/tools/refactoring-legacy-sql) +- [Refactoring legacy SQL to dbt](/guides/refactoring-legacy-sql) diff --git a/website/docs/terms/data-catalog.md b/website/docs/terms/data-catalog.md index feb529e82e6..64c6ea6448e 100644 --- a/website/docs/terms/data-catalog.md +++ b/website/docs/terms/data-catalog.md @@ -79,7 +79,7 @@ Data teams may choose to use third-party tools with data cataloging capabilities ## Conclusion -Data catalogs are a valuable asset to any data team and business as a whole. They allow people within an organization to find the data that they need when they need it and understand its quality or sensitivity. This makes communication across teams more seamless, preventing problems that impact the business in the long run. Weigh your options in terms of whether to go with open source and enterprise, trusting that the decision you land on will be best for your organization. +Data catalogs are a valuable asset to any data team and business as a whole. They allow people within an organization to find the data that they need when they need it and understand its quality or sensitivity. This makes communication across teams more seamless, preventing problems that impact the business in the long run. 
Weigh your options in terms of whether to go with open source or enterprise, trusting that the decision you land on will be best for your organization. ## Additional reading diff --git a/website/docs/terms/data-lineage.md b/website/docs/terms/data-lineage.md index bb3751df3da..d0162c35616 100644 --- a/website/docs/terms/data-lineage.md +++ b/website/docs/terms/data-lineage.md @@ -63,7 +63,7 @@ In the greater data world, you may often hear of data lineage systems based on t If you use a transformation tool such as dbt that automatically infers relationships between data sources and models, a DAG automatically populates to show you the lineage that exists for your [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). - + Your DAG is used to visually show upstream dependencies, the nodes that must come before a current model, and downstream relationships, the work that is impacted by the current model. DAGs are also directional—they show a defined flow of movement and do not form cycles. @@ -89,7 +89,7 @@ The biggest challenges around data lineage become more apparent as your data, sy As dbt projects scale with data and organization growth, the number of sources, models, macros, seeds, and [exposures](https://docs.getdbt.com/docs/build/exposures) invariably grows. And with an increasing number of nodes in your DAG, it can become harder to audit your DAG for WET code or inefficiencies. -Working with dbt projects with thousands of models and nodes can feel overwhelming, but remember: your DAG and data lineage are meant to help you, not be your enemy. Tackle DAG audits in chunks, document all models, and [leverage strong structure conventions](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). 
Compare that to data warehouses, whose primary goal is to be a place for data teams to store both raw and transformed, usable data. +A data lake is a system where you store, process, and query unstructured, semi-structured, and structured data at almost any scale. The main difference between a data warehouse and a data lake is the type and way data is stored. Data warehouses contain structured data that is meant to organize data for analytics use. Data lakes can contain pretty much any kind of data—structured or unstructured—and data is usually left in its raw format until it's ready to use. Compare that to data warehouses, whose primary goal is to be a place for data teams to store both raw and transformed, usable data. ## Conclusion diff --git a/website/docs/terms/data-wrangling.md b/website/docs/terms/data-wrangling.md index a5b4e99f312..58034fe8e91 100644 --- a/website/docs/terms/data-wrangling.md +++ b/website/docs/terms/data-wrangling.md @@ -12,7 +12,7 @@ hoverSnippet: Data wrangling describes the different processes used to transform Data wrangling describes the different processes used to transform raw data into a consistent and easily usable format. For analytics engineers, you may know this better by the name of data cleaning. In data science or machine learning, "wrangling" often refers to prepping the data for model creation. -The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data in a downstream model without worrying about basic cleaning like renaming, datatype casting, etc. Data wrangling acts as preparation for the development of [intermediate, fct/dim, or mart data models](/guides/best-practices/how-we-structure/1-guide-overview) that form the base layer that other data work can be built off of. Analytics engineers tend to do data wrangling work in the staging layer as a first transformation step after loading the data. This eliminates a foundational step done by an analytics engineer or analyst when building a downstream data model or dashboard. +The ultimate goal of data wrangling is to work in a way that allows you to dive right into analysis on a dataset or build upon that data in a downstream model without worrying about basic cleaning like renaming, datatype casting, etc. Data wrangling acts as preparation for the development of [intermediate, fct/dim, or mart data models](/best-practices/how-we-structure/1-guide-overview) that form the base layer that other data work can be built off of. Analytics engineers tend to do data wrangling work in the staging layer as a first transformation step after loading the data. This eliminates a foundational step done by an analytics engineer or analyst when building a downstream data model or dashboard. ## Data wrangling steps @@ -37,7 +37,6 @@ Structuring your data is a type of transformation that involves reformatting and - Is your data in the format you need to perform analysis on it? Does your data need to be potentially unnested? *Should you nest or objectize columns together?* - Do the column names and values look correct for your use case? -Do the column names and values look correct for your use case? If your data is not in a format that is usable, you can look into different solutions such as pivoting or using different functions to unpack lists and JSON files so that they are in a tabular format. 
Pivoting is helpful because it allows you to change the way your dataset is structured by rearranging the way columns, rows, and their values are displayed. dbt has a [pre-built macro](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/pivot.sql) that makes pivoting less of a headache and more of a breeze. @@ -164,4 +163,4 @@ You could argue that data wrangling is one of the most important parts of an ana - [Our favorite SQL functions](https://www.getdbt.com/sql-foundations/top-sql-functions/) - [Glossary: Data warehouse](/terms/data-warehouse) - [Glossary: Primary key](/terms/primary-key) -- [Glossary: JSON](/terms/json) \ No newline at end of file +- [Glossary: JSON](/terms/json) diff --git a/website/docs/terms/dimensional-modeling.md b/website/docs/terms/dimensional-modeling.md index d0b5e9384a5..de88f7c318d 100644 --- a/website/docs/terms/dimensional-modeling.md +++ b/website/docs/terms/dimensional-modeling.md @@ -28,7 +28,7 @@ If you run a bakery (and we’d be interested in seeing the data person + baker Just as eating raw flour isn’t that appetizing, neither is deriving insights from raw data since it rarely has a nice structure that makes it poised for analytics. There’s some considerable work that’s needed to organize data and make it usable for business users. -This is where dimensional modeling comes into play; it’s a method that can help data folks create meaningful entities (cupcakes and cookies) to live inside their [data mart](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) (your glass display) and eventually use for business intelligence purposes (eating said cookies). +This is where dimensional modeling comes into play; it’s a method that can help data folks create meaningful entities (cupcakes and cookies) to live inside their [data mart](https://docs.getdbt.com/best-practices/how-we-structure/4-marts) (your glass display) and eventually use for business intelligence purposes (eating said cookies). So I guess we take it back—you’re not just trying to build a bakery, you’re also trying to build a top-notch foundation for meaningful analytics. Dimensional modeling can be a method to get you part of the way there. @@ -135,7 +135,7 @@ If your end data consumers are less comfortable with SQL and your BI tool doesn The benefits and drawbacks of dimensional modeling are pretty straightforward. Generally, the main advantages can be boiled down to: -* **More accessibility**: Since the output of good dimensional modeling is a [data mart](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts), the tables created are easier to understand and more accessible to end consumers. +* **More accessibility**: Since the output of good dimensional modeling is a [data mart](https://docs.getdbt.com/best-practices/how-we-structure/4-marts), the tables created are easier to understand and more accessible to end consumers. * **More flexibility**: Easy to slice, dice, filter, and view your data in whatever way suits your purpose. * **Performance**: Fact and dimension models are typically materialized as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models). Since these often form the core understanding of a business, they are queried often. Materializing them as tables allows them to be more performant in downstream BI platforms. 
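To make the materialization point above concrete, here is a minimal sketch of a fact model configured as a table in dbt, so downstream BI queries read precomputed results instead of re-running the joins on every query. The model name `fct_orders`, the staging models it references, and the column names are hypothetical and only illustrate the pattern described above.

```sql
-- Hypothetical fact model, e.g. models/marts/fct_orders.sql
-- Materializing it as a table persists the joined result in the warehouse.
{{ config(materialized='table') }}

with orders as (
    select * from {{ ref('stg_orders') }}
),

customers as (
    select * from {{ ref('stg_customers') }}
)

select
    orders.order_id,
    orders.ordered_at,
    orders.order_total,
    customers.customer_id,
    customers.customer_name
from orders
left join customers
    on orders.customer_id = customers.customer_id
```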
@@ -156,4 +156,4 @@ Dimensional modeling is a tough, complex, and opinionated topic in the data worl * [Modular data modeling techniques](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) * [Stakeholder-friendly model naming conventions](https://docs.getdbt.com/blog/stakeholder-friendly-model-names/) -* [How we structure our dbt projects guide](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) +* [How we structure our dbt projects guide](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) diff --git a/website/docs/terms/dry.md b/website/docs/terms/dry.md index be3d03ed4f0..b1649278cd2 100644 --- a/website/docs/terms/dry.md +++ b/website/docs/terms/dry.md @@ -89,7 +89,7 @@ DRY code is a principle that you should always be striving for. It saves you tim ## Further reading * [Data modeling technique for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) -* [Why we use so many CTEs](https://docs.getdbt.com/docs/guides/best-practices) +* [Why we use so many CTEs](https://docs.getdbt.com/docs/best-practices) * [Glossary: CTE](https://docs.getdbt.com/terms/cte) * [Glossary: Materialization](https://docs.getdbt.com/terms/materialization) * [Glossary: View](https://docs.getdbt.com/terms/view) diff --git a/website/docs/terms/idempotent.md b/website/docs/terms/idempotent.md index 8772ba58b62..ea3ef0a099b 100644 --- a/website/docs/terms/idempotent.md +++ b/website/docs/terms/idempotent.md @@ -20,4 +20,4 @@ A non-idempotent version of the "_Save_" button might do something like "Append If word processors only gave us non-idempotent "Append paragraph" / "Update paragraph" / "Delete paragraph" operations, then saving our document changes would be a lot more difficult! We'd have to keep track of which paragraphs we previously saved, and either make sure to not save them again or have a process in place to regularly clean up duplicate paragraphs. The implementation of the "_Save_" button in word processors takes the collection of low-level non-idempotent filesystem operations (read/append/overwrite/delete), and systematically runs them in a certain order so that the _user_ doesn't have to deal with the non-idempotency. The user can just focus on writing -- choosing words, editing for clarity, ensuring paragraphs aren't too long, etc. -- and the word processor deals with making sure the words get persisted properly to disk. -This word processing analogy is very similar to what dbt does for [data transformation](https://www.getdbt.com/analytics-engineering/transformation/): it takes the collection of low-level non-idempotent database operations (`SELECT`/`INSERT`/`UPDATE`/`DELETE` -- collectively known as DML statements), and systematically runs them in a certain order so that analytics engineers don't have to deal with non-idempotency. We can just focus on the data -- [choosing good model and column names](https://docs.getdbt.com/blog/on-the-importance-of-naming), [documenting them](/community/resources/viewpoint#documentation), [ensuring data consumers can understand them](https://docs.getdbt.com/docs/guides/best-practices#consider-the-information-architecture-of-your-data-warehouse), etc. -- and [`dbt run`](https://docs.getdbt.com/reference/commands/run) will make sure the database ends up in the right state. 
+This word processing analogy is very similar to what dbt does for [data transformation](https://www.getdbt.com/analytics-engineering/transformation/): it takes the collection of low-level non-idempotent database operations (`SELECT`/`INSERT`/`UPDATE`/`DELETE` -- collectively known as DML statements), and systematically runs them in a certain order so that analytics engineers don't have to deal with non-idempotency. We can just focus on the data -- [choosing good model and column names](https://docs.getdbt.com/blog/on-the-importance-of-naming), [documenting them](/community/resources/viewpoint#documentation), [ensuring data consumers can understand them](https://docs.getdbt.com/docs/best-practices#consider-the-information-architecture-of-your-data-warehouse), etc. -- and [`dbt run`](https://docs.getdbt.com/reference/commands/run) will make sure the database ends up in the right state. diff --git a/website/docs/terms/materialization.md b/website/docs/terms/materialization.md index fdeaaebfcc8..328076f1483 100644 --- a/website/docs/terms/materialization.md +++ b/website/docs/terms/materialization.md @@ -11,7 +11,7 @@ hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when cr :::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! +This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! ::: The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a data warehouse. It's the manner in which the data is represented, and each of those options is defined either canonically (tables, views, incremental), or bespoke. diff --git a/website/docs/terms/model.md b/website/docs/terms/model.md new file mode 100644 index 00000000000..c589cc196a7 --- /dev/null +++ b/website/docs/terms/model.md @@ -0,0 +1,9 @@ +--- +id: model +title: Model +description: A model is an essential building block of the DAG +displayText: model +hoverSnippet: A model is an essential building block of the DAG +--- + +A model is an essential building block of the DAG that lives in a single file and contains logic that transforms data. This logic can be expressed as a SQL `select` statement or a Python dataframe operation. Models can be materialized in the warehouse in different ways — most of these materializations require models to be built in the warehouse. \ No newline at end of file diff --git a/website/docs/terms/monotonically-increasing.md b/website/docs/terms/monotonically-increasing.md index 397e333942a..b4e3987995d 100644 --- a/website/docs/terms/monotonically-increasing.md +++ b/website/docs/terms/monotonically-increasing.md @@ -1,11 +1,11 @@ --- id: monotonically-increasing title: Monotonically increasing -description: Monotonicity means unchanging (think monotone). A monotonically-increasing value is a value which increases at a constant rate, for example the values 1, 2, 3, 4. +description: A monotonically increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. 
displayText: monotonically increasing -hoverSnippet: Monotonicity means unchanging (think monotone). A monotonically-increasing value is a value which increases at a constant rate, for example the values 1, 2, 3, 4. +hoverSnippet: A monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. --- -Monotonicity means unchanging (think monotone). A monotonically-increasing value is a value which increases at a constant rate, for example the values `[1, 2, 3, 4]`. +Monotonicity means unchanging (think monotone); a monotonic sequence is a sequence whose values either never decrease or never increase. In other words, a monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences `[1, 6, 7, 11, 131]` or `[2, 5, 5, 5, 6, 10]`. -Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an idempotent data model. Instead you should create a surrogate key which is derived from the unique component(s) of a row. \ No newline at end of file +Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an idempotent data model. Instead you should create a surrogate key which is derived from the unique component(s) of a row. diff --git a/website/docs/terms/predicate-pushdown.md b/website/docs/terms/predicate-pushdown.md new file mode 100644 index 00000000000..8e9bad85b6b --- /dev/null +++ b/website/docs/terms/predicate-pushdown.md @@ -0,0 +1,10 @@ +--- +id: predicate-pushdown +title: predicate pushdown +description: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query +displayText: Predicate pushdown +hoverSnippet: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query +--- + +A predicate pushdown is an expression used to determine what rows in a database apply to a particular query. For example, if you filter in a `WHERE` clause based on a specific dimension value, the database searches to determine what values in the database apply to the query. The optimization known as "predicate pushdown" involves applying this filtering process to the database, leading to enhanced and faster query performance. + diff --git a/website/docs/terms/primary-key.md b/website/docs/terms/primary-key.md index 5921d3ca655..4acd1e8c46d 100644 --- a/website/docs/terms/primary-key.md +++ b/website/docs/terms/primary-key.md @@ -73,7 +73,7 @@ The table below gives an overview of primary key support and enforcement in some
diff --git a/website/docs/terms/table.md b/website/docs/terms/table.md index 69fc2b3e6b6..cbe36ec1315 100644 --- a/website/docs/terms/table.md +++ b/website/docs/terms/table.md @@ -6,7 +6,7 @@ displayText: table hoverSnippet: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. --- :::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! +This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! ::: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. diff --git a/website/docs/terms/view.md b/website/docs/terms/view.md index 5d9238256e0..53c122ca9e6 100644 --- a/website/docs/terms/view.md +++ b/website/docs/terms/view.md @@ -6,7 +6,7 @@ displayText: view hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). --- :::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! +This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! ::: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). A view doesn’t store data, like a table does, but it defines the logic that you need to fetch the underlying data. 
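As a quick illustration of the passthrough behavior described above, here is a minimal sketch of a view definition; the schema, view, table, and column names are hypothetical. Because the view stores no rows itself, every query against it re-runs the defining `select` on the underlying table.

```sql
-- Hypothetical view: no data is stored, so selecting from it
-- executes the defining query against analytics.customers each time.
create view analytics.active_customers as
select
    customer_id,
    first_name,
    last_name
from analytics.customers
where is_active = true;
```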
@@ -33,4 +33,4 @@ You shouldn’t expect a view in itself to be your final destination in terms of ## Further reading -- [Best practices guide on choosing table vs view materializations](/guides/best-practices) +- [Best practices guide on choosing table vs view materializations](/best-practices) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index af285961145..c753b854e53 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -1,6 +1,7 @@ const path = require("path"); const math = require("remark-math"); const katex = require("rehype-katex"); + const { versions, versionedPages, versionedCategories } = require("./dbt-versions"); require("dotenv").config(); @@ -51,6 +52,7 @@ var siteSettings = { docs:{ sidebar: { hideable: true, + autoCollapseCategories: true, }, }, image: "/img/avatar.png", @@ -70,16 +72,16 @@ var siteSettings = { announcementBar: { id: "biweekly-demos", content: - "Join our weekly demos and see dbt Cloud in action!", + "Join our weekly demos and see dbt Cloud in action!", backgroundColor: "#047377", textColor: "#fff", isCloseable: true, }, announcementBarActive: true, - announcementBarLink: "https://www.getdbt.com/resources/dbt-cloud-demos-with-experts/?utm_medium=event&utm_source=docs&utm_campaign=q1-2024_cloud-demos-with-experts_awareness", + announcementBarLink: "https://www.getdbt.com/resources/dbt-cloud-demos-with-experts?utm_source=docs&utm_medium=event&utm_campaign=q1-2024_cloud-demos-with-experts_awareness", // Set community spotlight member on homepage // This is the ID for a specific file under docs/community/spotlight - communitySpotlightMember: "david-effiong", + communitySpotlightMember: "faith-lierheimer", prism: { theme: (() => { var theme = require("prism-react-renderer/themes/nightOwl"); @@ -129,12 +131,12 @@ var siteSettings = { href: 'https://courses.getdbt.com', }, { - label: 'Guides', - to: '/guides/best-practices', + label: 'Best Practices', + to: '/best-practices', }, { - label: "Quickstarts", - to: "/quickstarts", + label: "Guides", + to: "/guides", }, { label: "Developer Blog", @@ -192,7 +194,7 @@ var siteSettings = { @@ -257,6 +259,8 @@ var siteSettings = { src: "https://cdn.jsdelivr.net/npm/featherlight@1.7.14/release/featherlight.min.js", defer: true, }, + "https://cdn.jsdelivr.net/npm/clipboard@2.0.11/dist/clipboard.min.js", + "/js/headerLinkCopy.js", "/js/gtm.js", "/js/onetrust.js", "https://kit.fontawesome.com/7110474d41.js", diff --git a/website/functions/image-cache-wrapper.js b/website/functions/image-cache-wrapper.js new file mode 100644 index 00000000000..aad2ffff200 --- /dev/null +++ b/website/functions/image-cache-wrapper.js @@ -0,0 +1,12 @@ +// This function is used to break the cache on images +// preventing stale or broken images from being served + +const CACHE_VERSION = '2' + +export default function imageCacheWrapper(src) { + const cacheParam = `?v=${CACHE_VERSION}` + +  return ( +    src + cacheParam +  ) +} diff --git a/website/package-lock.json b/website/package-lock.json index b15a903e97f..282056e5922 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -36,6 +36,7 @@ "react-dom": "^17.0.1", "react-full-screen": "^1.1.1", "react-is": "^18.1.0", + "react-select": "^5.7.5", "react-tooltip": "^4.2.21", "redoc": "^2.0.0-rc.57", "rehype-katex": "^5.0.0", @@ -3098,6 +3099,59 @@ "node": ">=12" } }, + "node_modules/@emotion/babel-plugin": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/babel-plugin/-/babel-plugin-11.11.0.tgz", + "integrity": 
"sha512-m4HEDZleaaCH+XgDDsPF15Ht6wTLsgDTeR3WYj9Q/k76JtWhrJjcP4+/XlG8LGT/Rol9qUfOIztXeA84ATpqPQ==", + "dependencies": { + "@babel/helper-module-imports": "^7.16.7", + "@babel/runtime": "^7.18.3", + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/serialize": "^1.1.2", + "babel-plugin-macros": "^3.1.0", + "convert-source-map": "^1.5.0", + "escape-string-regexp": "^4.0.0", + "find-root": "^1.1.0", + "source-map": "^0.5.7", + "stylis": "4.2.0" + } + }, + "node_modules/@emotion/babel-plugin/node_modules/@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "node_modules/@emotion/babel-plugin/node_modules/source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha512-LbrmJOMUSdEVxIKvdcJzQC+nQhe8FUZQTXQy6+I75skNgn3OoQ0DZA8YnFa7gp8tqtL3KPf1kmo0R5DoApeSGQ==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/@emotion/cache": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/cache/-/cache-11.11.0.tgz", + "integrity": "sha512-P34z9ssTCBi3e9EI1ZsWpNHcfY1r09ZO0rZbRO2ob3ZQMnFI35jB536qoXbkdesr5EUhYi22anuEJuyxifaqAQ==", + "dependencies": { + "@emotion/memoize": "^0.8.1", + "@emotion/sheet": "^1.2.2", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "stylis": "4.2.0" + } + }, + "node_modules/@emotion/cache/node_modules/@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "node_modules/@emotion/hash": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.9.1.tgz", + "integrity": "sha512-gJB6HLm5rYwSLI6PQa+X1t5CFGrv1J1TWG+sOyMCeKz2ojaj6Fnl/rZEspogG+cvqbt4AE/2eIyD2QfLKTBNlQ==" + }, "node_modules/@emotion/is-prop-valid": { "version": "0.8.8", "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-0.8.8.tgz", @@ -3111,6 +3165,56 @@ "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.7.4.tgz", "integrity": "sha512-Ja/Vfqe3HpuzRsG1oBtWTHk2PGZ7GR+2Vz5iYGelAw8dx32K0y7PjVuxK6z1nMpZOqAFsRUPCkK1YjJ56qJlgw==" }, + "node_modules/@emotion/react": { + "version": "11.11.1", + "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.11.1.tgz", + "integrity": "sha512-5mlW1DquU5HaxjLkfkGN1GA/fvVGdyHURRiX/0FHl2cfIfRxSOfmxEH5YS43edp0OldZrZ+dkBKbngxcNCdZvA==", + "dependencies": { + "@babel/runtime": "^7.18.3", + "@emotion/babel-plugin": "^11.11.0", + "@emotion/cache": "^11.11.0", + "@emotion/serialize": "^1.1.2", + "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "hoist-non-react-statics": "^3.3.1" + }, + "peerDependencies": { + "react": ">=16.8.0" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + } + } + }, + "node_modules/@emotion/serialize": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@emotion/serialize/-/serialize-1.1.2.tgz", + "integrity": "sha512-zR6a/fkFP4EAcCMQtLOhIgpprZOwNmCldtpaISpvz348+DP4Mz8ZoKaGGCQpbzepNIUWbq4w6hNZkwDyKoS+HA==", + "dependencies": { + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/unitless": "^0.8.1", + "@emotion/utils": "^1.2.1", + "csstype": 
"^3.0.2" + } + }, + "node_modules/@emotion/serialize/node_modules/@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "node_modules/@emotion/serialize/node_modules/@emotion/unitless": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz", + "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==" + }, + "node_modules/@emotion/sheet": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@emotion/sheet/-/sheet-1.2.2.tgz", + "integrity": "sha512-0QBtGvaqtWi+nx6doRwDdBIzhNdZrXUppvTM4dtZZWEGTXL/XE/yJxLMGlDT1Gt+UHH5IX1n+jkXyytE/av7OA==" + }, "node_modules/@emotion/stylis": { "version": "0.8.5", "resolved": "https://registry.npmjs.org/@emotion/stylis/-/stylis-0.8.5.tgz", @@ -3121,6 +3225,24 @@ "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.7.5.tgz", "integrity": "sha512-OWORNpfjMsSSUBVrRBVGECkhWcULOAJz9ZW8uK9qgxD+87M7jHRcvh/A96XXNhXTLmKcoYSQtBEX7lHMO7YRwg==" }, + "node_modules/@emotion/use-insertion-effect-with-fallbacks": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@emotion/use-insertion-effect-with-fallbacks/-/use-insertion-effect-with-fallbacks-1.0.1.tgz", + "integrity": "sha512-jT/qyKZ9rzLErtrjGgdkMBn2OP8wl0G3sQlBb3YPryvKHsjvINUhVaPFfP+fpBcOkmrVOVEEHQFJ7nbj2TH2gw==", + "peerDependencies": { + "react": ">=16.8.0" + } + }, + "node_modules/@emotion/utils": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emotion/utils/-/utils-1.2.1.tgz", + "integrity": "sha512-Y2tGf3I+XVnajdItskUCn6LX+VUDmP6lTL4fcqsXAv43dnlbZiuW4MWQW38rW/BVWSE7Q/7+XQocmpnRYILUmg==" + }, + "node_modules/@emotion/weak-memoize": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/@emotion/weak-memoize/-/weak-memoize-0.3.1.tgz", + "integrity": "sha512-EsBwpc7hBUJWAsNPBmJy4hxWx12v6bshQsldrVmjxJoc3isbxhOrF2IcCpaXxfvq03NwkI7sbsOLXbYuqF/8Ww==" + }, "node_modules/@endiliey/react-ideal-image": { "version": "0.0.11", "resolved": "https://registry.npmjs.org/@endiliey/react-ideal-image/-/react-ideal-image-0.0.11.tgz", @@ -3204,6 +3326,28 @@ "resolved": "https://registry.npmjs.org/@faker-js/faker/-/faker-5.5.3.tgz", "integrity": "sha512-R11tGE6yIFwqpaIqcfkcg7AICXzFg14+5h5v0TfF/9+RMDL6jhzCy/pxHVOfbALGdtVYdt6JdR21tuxEgl34dw==" }, + "node_modules/@floating-ui/core": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.5.0.tgz", + "integrity": "sha512-kK1h4m36DQ0UHGj5Ah4db7R0rHemTqqO0QLvUqi1/mUUp3LuAWbWxdxSIf/XsnH9VS6rRVPLJCncjRzUvyCLXg==", + "dependencies": { + "@floating-ui/utils": "^0.1.3" + } + }, + "node_modules/@floating-ui/dom": { + "version": "1.5.3", + "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.5.3.tgz", + "integrity": "sha512-ClAbQnEqJAKCJOEbbLo5IUlZHkNszqhuxS4fHAVxRPXPya6Ysf2G8KypnYcOTpx6I8xcgF9bbHb6g/2KpbV8qA==", + "dependencies": { + "@floating-ui/core": "^1.4.2", + "@floating-ui/utils": "^0.1.3" + } + }, + "node_modules/@floating-ui/utils": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.1.4.tgz", + "integrity": "sha512-qprfWkn82Iw821mcKofJ5Pk9wgioHicxcQMxx+5zt5GSKoqdWvgG5AxVmpmUUjzTLPVSH5auBrhI93Deayn/DA==" + }, "node_modules/@fortawesome/fontawesome-common-types": { "version": "6.4.0", "resolved": 
"https://registry.npmjs.org/@fortawesome/fontawesome-common-types/-/fontawesome-common-types-6.4.0.tgz", @@ -6820,6 +6964,14 @@ "@types/react-router": "*" } }, + "node_modules/@types/react-transition-group": { + "version": "4.4.7", + "resolved": "https://registry.npmjs.org/@types/react-transition-group/-/react-transition-group-4.4.7.tgz", + "integrity": "sha512-ICCyBl5mvyqYp8Qeq9B5G/fyBSRC0zx3XM3sCC6KkcMsNeAHqXBKkmat4GqdJET5jtYUpZXrxI5flve5qhi2Eg==", + "dependencies": { + "@types/react": "*" + } + }, "node_modules/@types/resize-observer-browser": { "version": "0.1.7", "resolved": "https://registry.npmjs.org/@types/resize-observer-browser/-/resize-observer-browser-0.1.7.tgz", @@ -7981,6 +8133,20 @@ "node": "^10.13.0 || ^12.13.0 || ^14.15.0 || >=15.0.0" } }, + "node_modules/babel-plugin-macros": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/babel-plugin-macros/-/babel-plugin-macros-3.1.0.tgz", + "integrity": "sha512-Cg7TFGpIr01vOQNODXOOaGz2NpCU5gl8x1qJFbb6hbZxR7XrcE2vtbAsTAbJ7/xwJtUuJEw8K8Zr/AE0LHlesg==", + "dependencies": { + "@babel/runtime": "^7.12.5", + "cosmiconfig": "^7.0.0", + "resolve": "^1.19.0" + }, + "engines": { + "node": ">=10", + "npm": ">=6" + } + }, "node_modules/babel-plugin-polyfill-corejs2": { "version": "0.3.3", "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.3.3.tgz", @@ -12111,6 +12277,11 @@ "url": "https://github.com/avajs/find-cache-dir?sponsor=1" } }, + "node_modules/find-root": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", + "integrity": "sha512-NKfW6bec6GfKc0SGx1e07QZY9PE99u0Bft/0rzSD5k3sO/vwkVUpDUKVm5Gpp5Ue3YfShPFTX2070tDs5kB9Ng==" + }, "node_modules/find-up": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-6.3.0.tgz", @@ -14081,9 +14252,9 @@ } }, "node_modules/is-core-module": { - "version": "2.11.0", - "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.11.0.tgz", - "integrity": "sha512-RRjxlvLDkD1YJwDbroBHMb+cukurkDWNyHx7D3oNB5x9rb5ogcksMC5wHCadcXoo67gVr/+3GFySh3134zi6rw==", + "version": "2.13.0", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.0.tgz", + "integrity": "sha512-Z7dk6Qo8pOCp3l4tsX2C5ZVas4V+UxwQodwZhLopL91TX8UyyHEXafPcyoeeWuLrwzHcr3igO78wNLwHJHsMCQ==", "dependencies": { "has": "^1.0.3" }, @@ -17456,6 +17627,11 @@ "node": ">= 4.0.0" } }, + "node_modules/memoize-one": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/memoize-one/-/memoize-one-6.0.0.tgz", + "integrity": "sha512-rkpe71W0N0c0Xz6QD0eJETuWAJGnJ9afsl1srmwPrI+yBCkge5EycXXbYRyvL29zZVUWQCY7InPRCv3GDXuZNw==" + }, "node_modules/merge-descriptors": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", @@ -17898,9 +18074,15 @@ } }, "node_modules/nanoid": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz", - "integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==", + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz", + "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], "bin": { "nanoid": "bin/nanoid.cjs" }, @@ -19034,9 +19216,9 @@ } }, "node_modules/postcss": { - "version": "8.4.21", - "resolved": 
"https://registry.npmjs.org/postcss/-/postcss-8.4.21.tgz", - "integrity": "sha512-tP7u/Sn/dVxK2NnruI4H9BG+x+Wxz6oeZ1cJ8P6G/PZY0IKk4k/63TDsQf2kQq3+qoJeLm2kIBUNlZe3zgb4Zg==", + "version": "8.4.31", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", "funding": [ { "type": "opencollective", @@ -19045,10 +19227,14 @@ { "type": "tidelift", "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" } ], "dependencies": { - "nanoid": "^3.3.4", + "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" }, @@ -20644,6 +20830,26 @@ "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz", "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==" }, + "node_modules/react-select": { + "version": "5.7.5", + "resolved": "https://registry.npmjs.org/react-select/-/react-select-5.7.5.tgz", + "integrity": "sha512-jgYZa2xgKP0DVn5GZk7tZwbRx7kaVz1VqU41S8z1KWmshRDhlrpKS0w80aS1RaK5bVIXpttgSou7XCjWw1ncKA==", + "dependencies": { + "@babel/runtime": "^7.12.0", + "@emotion/cache": "^11.4.0", + "@emotion/react": "^11.8.1", + "@floating-ui/dom": "^1.0.1", + "@types/react-transition-group": "^4.4.0", + "memoize-one": "^6.0.0", + "prop-types": "^15.6.0", + "react-transition-group": "^4.3.0", + "use-isomorphic-layout-effect": "^1.1.2" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0" + } + }, "node_modules/react-tabs": { "version": "3.2.3", "resolved": "https://registry.npmjs.org/react-tabs/-/react-tabs-3.2.3.tgz", @@ -20696,6 +20902,30 @@ "uuid": "dist/bin/uuid" } }, + "node_modules/react-transition-group": { + "version": "4.4.5", + "resolved": "https://registry.npmjs.org/react-transition-group/-/react-transition-group-4.4.5.tgz", + "integrity": "sha512-pZcd1MCJoiKiBR2NRxeCRg13uCXbydPnmB4EOeRrY7480qNWO8IIgQG6zlDkm6uRMsURXPuKq0GWtiM59a5Q6g==", + "dependencies": { + "@babel/runtime": "^7.5.5", + "dom-helpers": "^5.0.1", + "loose-envify": "^1.4.0", + "prop-types": "^15.6.2" + }, + "peerDependencies": { + "react": ">=16.6.0", + "react-dom": ">=16.6.0" + } + }, + "node_modules/react-transition-group/node_modules/dom-helpers": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/dom-helpers/-/dom-helpers-5.2.1.tgz", + "integrity": "sha512-nRCa7CK3VTrM2NmGkIy4cbK7IZlgBE/PYMn55rrXefr5xXDP0LdtfPnblFDoVdcAfslJ7or6iqAUnx0CCGIWQA==", + "dependencies": { + "@babel/runtime": "^7.8.7", + "csstype": "^3.0.2" + } + }, "node_modules/react-universal-interface": { "version": "0.6.2", "resolved": "https://registry.npmjs.org/react-universal-interface/-/react-universal-interface-0.6.2.tgz", @@ -21378,11 +21608,11 @@ "integrity": "sha512-LwZrotdHOo12nQuZlHEmtuXdqGoOD0OhaxopaNFxWzInpEgaLWoVuAMbTzixuosCx2nEG58ngzW3vxdWoxIgdg==" }, "node_modules/resolve": { - "version": "1.22.1", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.1.tgz", - "integrity": "sha512-nBpuuYuY5jFsli/JIs1oldw6fOQCBioohqWZg/2hiaOybXOft4lonv85uDOKXdf8rhyK159cxU5cDcK/NKk8zw==", + "version": "1.22.6", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.6.tgz", + "integrity": "sha512-njhxM7mV12JfufShqGy3Rz8j11RPdLy4xi15UurGJeoHLfJpVXKdh3ueuOqbYUcDZnffr6X739JBo5LzyahEsw==", "dependencies": { - "is-core-module": "^2.9.0", + "is-core-module": "^2.13.0", "path-parse": "^1.0.7", 
"supports-preserve-symlinks-flag": "^1.0.0" }, @@ -27423,6 +27653,60 @@ "tslib": "^2.4.0" } }, + "@emotion/babel-plugin": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/babel-plugin/-/babel-plugin-11.11.0.tgz", + "integrity": "sha512-m4HEDZleaaCH+XgDDsPF15Ht6wTLsgDTeR3WYj9Q/k76JtWhrJjcP4+/XlG8LGT/Rol9qUfOIztXeA84ATpqPQ==", + "requires": { + "@babel/helper-module-imports": "^7.16.7", + "@babel/runtime": "^7.18.3", + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/serialize": "^1.1.2", + "babel-plugin-macros": "^3.1.0", + "convert-source-map": "^1.5.0", + "escape-string-regexp": "^4.0.0", + "find-root": "^1.1.0", + "source-map": "^0.5.7", + "stylis": "4.2.0" + }, + "dependencies": { + "@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha512-LbrmJOMUSdEVxIKvdcJzQC+nQhe8FUZQTXQy6+I75skNgn3OoQ0DZA8YnFa7gp8tqtL3KPf1kmo0R5DoApeSGQ==" + } + } + }, + "@emotion/cache": { + "version": "11.11.0", + "resolved": "https://registry.npmjs.org/@emotion/cache/-/cache-11.11.0.tgz", + "integrity": "sha512-P34z9ssTCBi3e9EI1ZsWpNHcfY1r09ZO0rZbRO2ob3ZQMnFI35jB536qoXbkdesr5EUhYi22anuEJuyxifaqAQ==", + "requires": { + "@emotion/memoize": "^0.8.1", + "@emotion/sheet": "^1.2.2", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "stylis": "4.2.0" + }, + "dependencies": { + "@emotion/memoize": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + } + } + }, + "@emotion/hash": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/@emotion/hash/-/hash-0.9.1.tgz", + "integrity": "sha512-gJB6HLm5rYwSLI6PQa+X1t5CFGrv1J1TWG+sOyMCeKz2ojaj6Fnl/rZEspogG+cvqbt4AE/2eIyD2QfLKTBNlQ==" + }, "@emotion/is-prop-valid": { "version": "0.8.8", "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-0.8.8.tgz", @@ -27436,6 +27720,50 @@ "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.7.4.tgz", "integrity": "sha512-Ja/Vfqe3HpuzRsG1oBtWTHk2PGZ7GR+2Vz5iYGelAw8dx32K0y7PjVuxK6z1nMpZOqAFsRUPCkK1YjJ56qJlgw==" }, + "@emotion/react": { + "version": "11.11.1", + "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.11.1.tgz", + "integrity": "sha512-5mlW1DquU5HaxjLkfkGN1GA/fvVGdyHURRiX/0FHl2cfIfRxSOfmxEH5YS43edp0OldZrZ+dkBKbngxcNCdZvA==", + "requires": { + "@babel/runtime": "^7.18.3", + "@emotion/babel-plugin": "^11.11.0", + "@emotion/cache": "^11.11.0", + "@emotion/serialize": "^1.1.2", + "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", + "@emotion/utils": "^1.2.1", + "@emotion/weak-memoize": "^0.3.1", + "hoist-non-react-statics": "^3.3.1" + } + }, + "@emotion/serialize": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@emotion/serialize/-/serialize-1.1.2.tgz", + "integrity": "sha512-zR6a/fkFP4EAcCMQtLOhIgpprZOwNmCldtpaISpvz348+DP4Mz8ZoKaGGCQpbzepNIUWbq4w6hNZkwDyKoS+HA==", + "requires": { + "@emotion/hash": "^0.9.1", + "@emotion/memoize": "^0.8.1", + "@emotion/unitless": "^0.8.1", + "@emotion/utils": "^1.2.1", + "csstype": "^3.0.2" + }, + "dependencies": { + "@emotion/memoize": { + "version": "0.8.1", + 
"resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz", + "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" + }, + "@emotion/unitless": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz", + "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==" + } + } + }, + "@emotion/sheet": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/@emotion/sheet/-/sheet-1.2.2.tgz", + "integrity": "sha512-0QBtGvaqtWi+nx6doRwDdBIzhNdZrXUppvTM4dtZZWEGTXL/XE/yJxLMGlDT1Gt+UHH5IX1n+jkXyytE/av7OA==" + }, "@emotion/stylis": { "version": "0.8.5", "resolved": "https://registry.npmjs.org/@emotion/stylis/-/stylis-0.8.5.tgz", @@ -27446,6 +27774,22 @@ "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.7.5.tgz", "integrity": "sha512-OWORNpfjMsSSUBVrRBVGECkhWcULOAJz9ZW8uK9qgxD+87M7jHRcvh/A96XXNhXTLmKcoYSQtBEX7lHMO7YRwg==" }, + "@emotion/use-insertion-effect-with-fallbacks": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@emotion/use-insertion-effect-with-fallbacks/-/use-insertion-effect-with-fallbacks-1.0.1.tgz", + "integrity": "sha512-jT/qyKZ9rzLErtrjGgdkMBn2OP8wl0G3sQlBb3YPryvKHsjvINUhVaPFfP+fpBcOkmrVOVEEHQFJ7nbj2TH2gw==", + "requires": {} + }, + "@emotion/utils": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emotion/utils/-/utils-1.2.1.tgz", + "integrity": "sha512-Y2tGf3I+XVnajdItskUCn6LX+VUDmP6lTL4fcqsXAv43dnlbZiuW4MWQW38rW/BVWSE7Q/7+XQocmpnRYILUmg==" + }, + "@emotion/weak-memoize": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/@emotion/weak-memoize/-/weak-memoize-0.3.1.tgz", + "integrity": "sha512-EsBwpc7hBUJWAsNPBmJy4hxWx12v6bshQsldrVmjxJoc3isbxhOrF2IcCpaXxfvq03NwkI7sbsOLXbYuqF/8Ww==" + }, "@endiliey/react-ideal-image": { "version": "0.0.11", "resolved": "https://registry.npmjs.org/@endiliey/react-ideal-image/-/react-ideal-image-0.0.11.tgz", @@ -27502,6 +27846,28 @@ "resolved": "https://registry.npmjs.org/@faker-js/faker/-/faker-5.5.3.tgz", "integrity": "sha512-R11tGE6yIFwqpaIqcfkcg7AICXzFg14+5h5v0TfF/9+RMDL6jhzCy/pxHVOfbALGdtVYdt6JdR21tuxEgl34dw==" }, + "@floating-ui/core": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.5.0.tgz", + "integrity": "sha512-kK1h4m36DQ0UHGj5Ah4db7R0rHemTqqO0QLvUqi1/mUUp3LuAWbWxdxSIf/XsnH9VS6rRVPLJCncjRzUvyCLXg==", + "requires": { + "@floating-ui/utils": "^0.1.3" + } + }, + "@floating-ui/dom": { + "version": "1.5.3", + "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.5.3.tgz", + "integrity": "sha512-ClAbQnEqJAKCJOEbbLo5IUlZHkNszqhuxS4fHAVxRPXPya6Ysf2G8KypnYcOTpx6I8xcgF9bbHb6g/2KpbV8qA==", + "requires": { + "@floating-ui/core": "^1.4.2", + "@floating-ui/utils": "^0.1.3" + } + }, + "@floating-ui/utils": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.1.4.tgz", + "integrity": "sha512-qprfWkn82Iw821mcKofJ5Pk9wgioHicxcQMxx+5zt5GSKoqdWvgG5AxVmpmUUjzTLPVSH5auBrhI93Deayn/DA==" + }, "@fortawesome/fontawesome-common-types": { "version": "6.4.0", "resolved": "https://registry.npmjs.org/@fortawesome/fontawesome-common-types/-/fontawesome-common-types-6.4.0.tgz", @@ -30317,6 +30683,14 @@ "@types/react-router": "*" } }, + "@types/react-transition-group": { + "version": "4.4.7", + "resolved": "https://registry.npmjs.org/@types/react-transition-group/-/react-transition-group-4.4.7.tgz", + "integrity": 
"sha512-ICCyBl5mvyqYp8Qeq9B5G/fyBSRC0zx3XM3sCC6KkcMsNeAHqXBKkmat4GqdJET5jtYUpZXrxI5flve5qhi2Eg==", + "requires": { + "@types/react": "*" + } + }, "@types/resize-observer-browser": { "version": "0.1.7", "resolved": "https://registry.npmjs.org/@types/resize-observer-browser/-/resize-observer-browser-0.1.7.tgz", @@ -31224,6 +31598,16 @@ "@types/babel__traverse": "^7.0.6" } }, + "babel-plugin-macros": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/babel-plugin-macros/-/babel-plugin-macros-3.1.0.tgz", + "integrity": "sha512-Cg7TFGpIr01vOQNODXOOaGz2NpCU5gl8x1qJFbb6hbZxR7XrcE2vtbAsTAbJ7/xwJtUuJEw8K8Zr/AE0LHlesg==", + "requires": { + "@babel/runtime": "^7.12.5", + "cosmiconfig": "^7.0.0", + "resolve": "^1.19.0" + } + }, "babel-plugin-polyfill-corejs2": { "version": "0.3.3", "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.3.3.tgz", @@ -34367,6 +34751,11 @@ "pkg-dir": "^4.1.0" } }, + "find-root": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/find-root/-/find-root-1.1.0.tgz", + "integrity": "sha512-NKfW6bec6GfKc0SGx1e07QZY9PE99u0Bft/0rzSD5k3sO/vwkVUpDUKVm5Gpp5Ue3YfShPFTX2070tDs5kB9Ng==" + }, "find-up": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/find-up/-/find-up-6.3.0.tgz", @@ -35795,9 +36184,9 @@ } }, "is-core-module": { - "version": "2.11.0", - "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.11.0.tgz", - "integrity": "sha512-RRjxlvLDkD1YJwDbroBHMb+cukurkDWNyHx7D3oNB5x9rb5ogcksMC5wHCadcXoo67gVr/+3GFySh3134zi6rw==", + "version": "2.13.0", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.0.tgz", + "integrity": "sha512-Z7dk6Qo8pOCp3l4tsX2C5ZVas4V+UxwQodwZhLopL91TX8UyyHEXafPcyoeeWuLrwzHcr3igO78wNLwHJHsMCQ==", "requires": { "has": "^1.0.3" } @@ -38307,6 +38696,11 @@ "fs-monkey": "^1.0.3" } }, + "memoize-one": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/memoize-one/-/memoize-one-6.0.0.tgz", + "integrity": "sha512-rkpe71W0N0c0Xz6QD0eJETuWAJGnJ9afsl1srmwPrI+yBCkge5EycXXbYRyvL29zZVUWQCY7InPRCv3GDXuZNw==" + }, "merge-descriptors": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-1.0.1.tgz", @@ -38605,9 +38999,9 @@ } }, "nanoid": { - "version": "3.3.4", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.4.tgz", - "integrity": "sha512-MqBkQh/OHTS2egovRtLk45wEyNXwF+cokD+1YPf9u5VfJiRdAiRwB2froX5Co9Rh20xs4siNPm8naNotSD6RBw==" + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.6.tgz", + "integrity": "sha512-BGcqMMJuToF7i1rt+2PWSNVnWIkGCU78jBG3RxO/bZlnZPK2Cmi2QaffxGO/2RvWi9sL+FAiRiXMgsyxQ1DIDA==" }, "napi-build-utils": { "version": "1.0.2", @@ -39455,11 +39849,11 @@ } }, "postcss": { - "version": "8.4.21", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.21.tgz", - "integrity": "sha512-tP7u/Sn/dVxK2NnruI4H9BG+x+Wxz6oeZ1cJ8P6G/PZY0IKk4k/63TDsQf2kQq3+qoJeLm2kIBUNlZe3zgb4Zg==", + "version": "8.4.31", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.31.tgz", + "integrity": "sha512-PS08Iboia9mts/2ygV3eLpY5ghnUcfLV/EXTOW1E2qYxJKGGBUtNjN76FYHnMs36RmARn41bC0AZmn+rR0OVpQ==", "requires": { - "nanoid": "^3.3.4", + "nanoid": "^3.3.6", "picocolors": "^1.0.0", "source-map-js": "^1.0.2" } @@ -40579,6 +40973,22 @@ "prop-types": "^15.7.2" } }, + "react-select": { + "version": "5.7.5", + "resolved": "https://registry.npmjs.org/react-select/-/react-select-5.7.5.tgz", + "integrity": 
"sha512-jgYZa2xgKP0DVn5GZk7tZwbRx7kaVz1VqU41S8z1KWmshRDhlrpKS0w80aS1RaK5bVIXpttgSou7XCjWw1ncKA==", + "requires": { + "@babel/runtime": "^7.12.0", + "@emotion/cache": "^11.4.0", + "@emotion/react": "^11.8.1", + "@floating-ui/dom": "^1.0.1", + "@types/react-transition-group": "^4.4.0", + "memoize-one": "^6.0.0", + "prop-types": "^15.6.0", + "react-transition-group": "^4.3.0", + "use-isomorphic-layout-effect": "^1.1.2" + } + }, "react-tabs": { "version": "3.2.3", "resolved": "https://registry.npmjs.org/react-tabs/-/react-tabs-3.2.3.tgz", @@ -40614,6 +41024,28 @@ } } }, + "react-transition-group": { + "version": "4.4.5", + "resolved": "https://registry.npmjs.org/react-transition-group/-/react-transition-group-4.4.5.tgz", + "integrity": "sha512-pZcd1MCJoiKiBR2NRxeCRg13uCXbydPnmB4EOeRrY7480qNWO8IIgQG6zlDkm6uRMsURXPuKq0GWtiM59a5Q6g==", + "requires": { + "@babel/runtime": "^7.5.5", + "dom-helpers": "^5.0.1", + "loose-envify": "^1.4.0", + "prop-types": "^15.6.2" + }, + "dependencies": { + "dom-helpers": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/dom-helpers/-/dom-helpers-5.2.1.tgz", + "integrity": "sha512-nRCa7CK3VTrM2NmGkIy4cbK7IZlgBE/PYMn55rrXefr5xXDP0LdtfPnblFDoVdcAfslJ7or6iqAUnx0CCGIWQA==", + "requires": { + "@babel/runtime": "^7.8.7", + "csstype": "^3.0.2" + } + } + } + }, "react-universal-interface": { "version": "0.6.2", "resolved": "https://registry.npmjs.org/react-universal-interface/-/react-universal-interface-0.6.2.tgz", @@ -41126,11 +41558,11 @@ "integrity": "sha512-LwZrotdHOo12nQuZlHEmtuXdqGoOD0OhaxopaNFxWzInpEgaLWoVuAMbTzixuosCx2nEG58ngzW3vxdWoxIgdg==" }, "resolve": { - "version": "1.22.1", - "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.1.tgz", - "integrity": "sha512-nBpuuYuY5jFsli/JIs1oldw6fOQCBioohqWZg/2hiaOybXOft4lonv85uDOKXdf8rhyK159cxU5cDcK/NKk8zw==", + "version": "1.22.6", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.6.tgz", + "integrity": "sha512-njhxM7mV12JfufShqGy3Rz8j11RPdLy4xi15UurGJeoHLfJpVXKdh3ueuOqbYUcDZnffr6X739JBo5LzyahEsw==", "requires": { - "is-core-module": "^2.9.0", + "is-core-module": "^2.13.0", "path-parse": "^1.0.7", "supports-preserve-symlinks-flag": "^1.0.0" } diff --git a/website/package.json b/website/package.json index afb7a9b1cd4..b0105102359 100644 --- a/website/package.json +++ b/website/package.json @@ -39,6 +39,7 @@ "react-dom": "^17.0.1", "react-full-screen": "^1.1.1", "react-is": "^18.1.0", + "react-select": "^5.7.5", "react-tooltip": "^4.2.21", "redoc": "^2.0.0-rc.57", "rehype-katex": "^5.0.0", diff --git a/website/plugins/buildQuickstartIndexPage/index.js b/website/plugins/buildQuickstartIndexPage/index.js index 4724478883a..368a717a6a5 100644 --- a/website/plugins/buildQuickstartIndexPage/index.js +++ b/website/plugins/buildQuickstartIndexPage/index.js @@ -6,10 +6,13 @@ module.exports = function buildQuickstartIndexPage() { name: 'docusaurus-build-quickstart-index-page-plugin', async loadContent() { // Quickstart files directory - const quickstartDirectory = 'docs/quickstarts' + const quickstartDirectory = 'docs/guides' // Get all Quickstart files and content - const quickstartFiles = fs.readdirSync(quickstartDirectory) + const quickstartFiles = fs.readdirSync(quickstartDirectory, { withFileTypes: true }) + .filter(dirent => dirent.isFile()) + .map(dirent => dirent.name) + const quickstartData = quickstartFiles.reduce((arr, quickstartFile) => { const fileData = fs.readFileSync( @@ -19,8 +22,12 @@ module.exports = function buildQuickstartIndexPage() { if(!fileData) return null - // 
convert frontmatter to json + // Convert frontmatter to json const fileJson = matter(fileData) + + // Add the original directory to build links + fileJson.data.original_directory = quickstartDirectory.replace('docs/', '') + if(!fileJson) return null @@ -35,7 +42,7 @@ module.exports = function buildQuickstartIndexPage() { async contentLoaded({content, actions}) { const {createData, addRoute} = actions; - // Sort quickstarts by platform if available + // Sort guides by platform if available const contentSorted = content.sort((a, b) => { if(!a?.data?.platform || !b?.data?.platform) return @@ -53,7 +60,7 @@ module.exports = function buildQuickstartIndexPage() { // Build the quickstart index page addRoute({ - path: `/quickstarts`, + path: `/guides`, component: '@site/src/components/quickstartGuideList/index.js', modules: { // propName -> JSON file path diff --git a/website/sidebars.js b/website/sidebars.js index c09e7b784c4..66ba731fb1b 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -7,7 +7,10 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/supported-data-platforms" }, items: [ + "docs/supported-data-platforms", "docs/connect-adapters", + "docs/verified-adapters", + "docs/trusted-adapters", "docs/community-adapters", "docs/contribute-core-adapters", ], @@ -15,24 +18,25 @@ const sidebarSettings = { { type: "category", label: "About dbt Cloud", + link: { type: "doc", id: "docs/cloud/about-cloud/dbt-cloud-features" }, items: [ "docs/cloud/about-cloud/dbt-cloud-features", "docs/cloud/about-cloud/architecture", "docs/cloud/about-cloud/tenancy", "docs/cloud/about-cloud/regions-ip-addresses", - "docs/cloud/about-cloud/about-cloud-ide", "docs/cloud/about-cloud/browsers", ], }, // About dbt Cloud directory { type: "link", - label: "Quickstarts", - href: `/quickstarts`, + label: "Guides", + href: `/guides`, }, { type: "category", label: "Set up dbt", collapsed: true, + link: { type: "doc", id: "docs/about-setup" }, items: [ "docs/about-setup", "docs/environments-in-dbt", @@ -40,12 +44,14 @@ const sidebarSettings = { type: "category", label: "dbt Cloud", collapsed: true, + link: { type: "doc", id: "docs/cloud/about-cloud-setup" }, items: [ "docs/cloud/about-cloud-setup", "docs/dbt-cloud-environments", { type: "category", label: "Connect data platform", + link: { type: "doc", id: "docs/cloud/connect-data-platform/about-connections" }, items: [ "docs/cloud/connect-data-platform/about-connections", "docs/cloud/connect-data-platform/connect-starburst-trino", @@ -59,13 +65,15 @@ const sidebarSettings = { { type: "category", label: "Manage access", + link: { type: "doc", id: "docs/cloud/manage-access/about-user-access" }, items: [ "docs/cloud/manage-access/about-user-access", - "docs/cloud/manage-access/seats-and-users", { type: "category", - label: "Permissions", + label: "User permissions and licenses", + link: { type: "doc", id: "docs/cloud/manage-access/seats-and-users" }, items: [ + "docs/cloud/manage-access/seats-and-users", "docs/cloud/manage-access/self-service-permissions", "docs/cloud/manage-access/enterprise-permissions", ], @@ -73,7 +81,8 @@ const sidebarSettings = { { type: "category", - label: "Single sign-on", + label: "Single sign-on and Oauth", + link: { type: "doc", id: "docs/cloud/manage-access/sso-overview" }, items: [ "docs/cloud/manage-access/sso-overview", "docs/cloud/manage-access/auth0-migration", @@ -81,16 +90,11 @@ const sidebarSettings = { "docs/cloud/manage-access/set-up-sso-okta", "docs/cloud/manage-access/set-up-sso-google-workspace", 
"docs/cloud/manage-access/set-up-sso-azure-active-directory", - ], - }, // SSO - { - type: "category", - label: "OAuth with data platforms", - items: [ "docs/cloud/manage-access/set-up-snowflake-oauth", + "docs/cloud/manage-access/set-up-databricks-oauth", "docs/cloud/manage-access/set-up-bigquery-oauth", ], - }, // oauth + }, // SSO "docs/cloud/manage-access/audit-log", ], }, // Manage access @@ -98,45 +102,69 @@ const sidebarSettings = { { type: "category", label: "Configure Git", + link: { type: "doc", id: "docs/cloud/git/git-configuration-in-dbt-cloud" }, items: [ + "docs/cloud/git/git-configuration-in-dbt-cloud", + "docs/cloud/git/import-a-project-by-git-url", "docs/cloud/git/connect-github", "docs/cloud/git/connect-gitlab", { type: "category", label: "Azure DevOps", + link: { type: "doc", id: "docs/cloud/git/connect-azure-devops" }, items: [ "docs/cloud/git/connect-azure-devops", "docs/cloud/git/setup-azure", "docs/cloud/git/authenticate-azure", ], }, - "docs/cloud/git/import-a-project-by-git-url", ], }, // Supported Git providers { type: "category", - label: "Develop in the IDE", - link: { - type: "doc", - id: "docs/cloud/dbt-cloud-ide/develop-in-the-cloud", - }, + label: "Develop in dbt Cloud", + link: { type: "doc", id: "docs/cloud/about-cloud-develop" }, items: [ - "docs/cloud/dbt-cloud-ide/ide-user-interface", - "docs/cloud/dbt-cloud-ide/lint-format", - "docs/cloud/dbt-cloud-ide/dbt-cloud-tips", + "docs/cloud/about-cloud-develop", + "docs/cloud/about-cloud-develop-defer", + { + type: "category", + label: "dbt Cloud CLI", + link: { type: "doc", id: "docs/cloud/cloud-cli-installation" }, + items: [ + "docs/cloud/cloud-cli-installation", + "docs/cloud/configure-cloud-cli", + ], + }, + { + type: "category", + label: "dbt Cloud IDE", + link: { type: "doc", id: "docs/cloud/dbt-cloud-ide/develop-in-the-cloud" }, + items: [ + "docs/cloud/dbt-cloud-ide/develop-in-the-cloud", + "docs/cloud/dbt-cloud-ide/ide-user-interface", + "docs/cloud/dbt-cloud-ide/lint-format", + "docs/cloud/dbt-cloud-ide/dbt-cloud-tips", + ], + }, ], - }, // dbt Cloud IDE directory + }, // dbt Cloud develop directory { type: "category", label: "Secure your tenant", + link: { type: "doc", id: "docs/cloud/secure/secure-your-tenant" }, items: [ + "docs/cloud/secure/secure-your-tenant", + "docs/cloud/secure/ip-restrictions", "docs/cloud/secure/about-privatelink", "docs/cloud/secure/snowflake-privatelink", - "docs/cloud/secure/redshift-privatelink", "docs/cloud/secure/databricks-privatelink", + "docs/cloud/secure/redshift-privatelink", + "docs/cloud/secure/postgres-privatelink", "docs/cloud/secure/ip-restrictions", ], }, // PrivateLink + "docs/cloud/billing", ], }, { @@ -145,13 +173,15 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/core/about-core-setup" }, items: [ - "docs/core/about-the-cli", + "docs/core/about-core-setup", + "docs/core/about-dbt-core", "docs/core/dbt-core-environments", { type: "category", label: "Install dbt", link: { type: "doc", id: "docs/core/installation" }, items: [ + "docs/core/installation", "docs/core/homebrew-install", "docs/core/pip-install", "docs/core/docker-install", @@ -166,6 +196,7 @@ const sidebarSettings = { id: "docs/core/connect-data-platform/about-core-connections", }, items: [ + "docs/core/connect-data-platform/about-core-connections", "docs/core/connect-data-platform/profiles.yml", "docs/core/connect-data-platform/connection-profiles", "docs/core/connect-data-platform/bigquery-setup", @@ -207,6 +238,8 @@ const sidebarSettings = { 
"docs/core/connect-data-platform/databend-setup", "docs/core/connect-data-platform/fal-setup", "docs/core/connect-data-platform/decodable-setup", + "docs/core/connect-data-platform/upsolver-setup", + "docs/core/connect-data-platform/starrocks-setup", ], }, ], @@ -219,68 +252,79 @@ const sidebarSettings = { type: "category", label: "Build dbt projects", collapsed: true, + link: { type: "doc", id: "docs/build/projects" }, items: [ "docs/build/projects", { type: "category", label: "Build your DAG", collapsed: true, + link: { type: "doc", id: "docs/build/models" }, items: [ - "docs/build/sources", { type: "category", label: "Models", + link: { type: "doc", id: "docs/build/models" }, items: [ "docs/build/models", "docs/build/sql-models", "docs/build/python-models", ], }, - "docs/build/seeds", "docs/build/snapshots", + "docs/build/seeds", + "docs/build/tests", + "docs/build/jinja-macros", + "docs/build/sources", "docs/build/exposures", "docs/build/metrics", "docs/build/groups", + "docs/build/analyses", ], }, { type: "category", label: "Build your metrics", - link: { type: "doc", id: "docs/build/build-metrics-intro"}, + link: { type: "doc", id: "docs/build/build-metrics-intro" }, collapsed: true, items: [ + "docs/build/build-metrics-intro", + "docs/build/sl-getting-started", { type: "category", label: "About MetricFlow", link: { type: "doc", id: "docs/build/about-metricflow" }, items: [ + "docs/build/about-metricflow", "docs/build/join-logic", "docs/build/validation", + "docs/build/saved-queries", "docs/build/metricflow-time-spine", - "docs/build/metricflow-cli", - ] + "docs/build/metricflow-commands", + ], }, - "docs/build/sl-getting-started", { type: "category", label: "Semantic models", link: { type: "doc", id: "docs/build/semantic-models" }, items: [ + "docs/build/semantic-models", "docs/build/dimensions", "docs/build/entities", - "docs/build/measures" - ] + "docs/build/measures", + ], }, { type: "category", label: "Metrics", - link: { type: "doc", id: "docs/build/metrics-overview"}, + link: { type: "doc", id: "docs/build/metrics-overview" }, items: [ + "docs/build/metrics-overview", "docs/build/cumulative", "docs/build/derived", "docs/build/ratio", "docs/build/simple", - ] + ], }, ], }, @@ -288,8 +332,9 @@ const sidebarSettings = { type: "category", label: "Enhance your models", collapsed: true, + link: { type: "doc", id: "docs/build/enhance-your-models" }, items: [ - "docs/build/tests", + "docs/build/enhance-your-models", "docs/build/materializations", "docs/build/incremental-models", ], @@ -298,12 +343,12 @@ const sidebarSettings = { type: "category", label: "Enhance your code", collapsed: true, + link: { type: "doc", id: "docs/build/enhance-your-code" }, items: [ - "docs/build/jinja-macros", + "docs/build/enhance-your-code", "docs/build/project-variables", "docs/build/environment-variables", "docs/build/packages", - "docs/build/analyses", "docs/build/hooks-operations", ], }, @@ -311,7 +356,9 @@ const sidebarSettings = { type: "category", label: "Organize your outputs", collapsed: true, + link: { type: "doc", id: "docs/build/organize-your-outputs" }, items: [ + "docs/build/organize-your-outputs", "docs/build/custom-schemas", "docs/build/custom-databases", "docs/build/custom-aliases", @@ -328,24 +375,19 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/deploy/deployments" }, items: [ + "docs/deploy/deployments", "docs/deploy/job-scheduler", "docs/deploy/deploy-environments", + "docs/deploy/continuous-integration", { type: "category", - label: "dbt Cloud jobs", - 
link: { type: "doc", id: "docs/deploy/dbt-cloud-job" }, + label: "Jobs", + link: { type: "doc", id: "docs/deploy/jobs" }, items: [ - "docs/deploy/job-settings", + "docs/deploy/jobs", + "docs/deploy/deploy-jobs", + "docs/deploy/ci-jobs", "docs/deploy/job-commands", - "docs/deploy/job-triggers", - ], - }, - { - type: "category", - label: "Continuous integration", - link: { type: "doc", id: "docs/deploy/continuous-integration" }, - items: [ - "docs/deploy/slim-ci-jobs", ], }, { @@ -353,7 +395,9 @@ const sidebarSettings = { label: "Monitor jobs and alerts", link: { type: "doc", id: "docs/deploy/monitor-jobs" }, items: [ + "docs/deploy/monitor-jobs", "docs/deploy/run-visibility", + "docs/deploy/retry-jobs", "docs/deploy/job-notifications", "docs/deploy/webhooks", "docs/deploy/artifacts", @@ -367,10 +411,14 @@ const sidebarSettings = { { type: "category", label: "Collaborate with others", + link: { type: "doc", id: "docs/collaborate/collaborate-with-others" }, items: [ + "docs/collaborate/collaborate-with-others", + "docs/collaborate/explore-projects", { type: "category", label: "Git version control", + link: { type: "doc", id: "docs/collaborate/git-version-control" }, items: [ "docs/collaborate/git-version-control", "docs/collaborate/git/version-control-basics", @@ -382,6 +430,7 @@ const sidebarSettings = { { type: "category", label: "Document your dbt projects", + link: { type: "doc", id: "docs/collaborate/documentation" }, items: [ "docs/collaborate/documentation", "docs/collaborate/build-and-view-your-docs", @@ -396,6 +445,7 @@ const sidebarSettings = { id: "docs/collaborate/govern/about-model-governance", }, items: [ + "docs/collaborate/govern/about-model-governance", "docs/collaborate/govern/model-access", "docs/collaborate/govern/model-contracts", "docs/collaborate/govern/model-versions", @@ -408,23 +458,37 @@ const sidebarSettings = { type: "category", label: "Use the dbt Semantic Layer", collapsed: true, + link: { type: "doc", id: "docs/use-dbt-semantic-layer/dbt-sl" }, items: [ - "docs/use-dbt-semantic-layer/quickstart-semantic-layer", - "docs/use-dbt-semantic-layer/dbt-semantic-layer", - "docs/use-dbt-semantic-layer/setup-dbt-semantic-layer", - "docs/use-dbt-semantic-layer/avail-sl-integrations", + "docs/use-dbt-semantic-layer/dbt-sl", + "docs/use-dbt-semantic-layer/quickstart-sl", + "docs/use-dbt-semantic-layer/setup-sl", + "docs/use-dbt-semantic-layer/sl-architecture", + { + type: "category", + label: "Integrations", + link: { type: "doc", id: "docs/use-dbt-semantic-layer/avail-sl-integrations" }, + items: [ + "docs/use-dbt-semantic-layer/avail-sl-integrations", + "docs/use-dbt-semantic-layer/gsheets", + "docs/use-dbt-semantic-layer/tableau", + ], + }, ], }, { type: "category", label: "dbt Cloud APIs", collapsed: true, + link: { type: "doc", id: "docs/dbt-cloud-apis/overview" }, items: [ "docs/dbt-cloud-apis/overview", { type: "category", label: "Authentication", + link: { type: "doc", id: "docs/dbt-cloud-apis/authentication" }, items: [ + "docs/dbt-cloud-apis/authentication", "docs/dbt-cloud-apis/user-tokens", "docs/dbt-cloud-apis/service-tokens", ], @@ -434,6 +498,7 @@ const sidebarSettings = { label: "Administrative API", link: { type: "doc", id: "docs/dbt-cloud-apis/admin-cloud-api" }, items: [ + "docs/dbt-cloud-apis/admin-cloud-api", { type: "link", label: "API v2 (legacy docs)", @@ -456,45 +521,105 @@ const sidebarSettings = { label: "Discovery API", link: { type: "doc", id: "docs/dbt-cloud-apis/discovery-api" }, items: [ + "docs/dbt-cloud-apis/discovery-api", 
"docs/dbt-cloud-apis/discovery-use-cases-and-examples", "docs/dbt-cloud-apis/project-state", "docs/dbt-cloud-apis/discovery-querying", { type: "category", label: "Schema", + link: { type: "doc", id: "docs/dbt-cloud-apis/discovery-schema-environment" }, items: [ "docs/dbt-cloud-apis/discovery-schema-environment", - "docs/dbt-cloud-apis/discovery-schema-model", - "docs/dbt-cloud-apis/discovery-schema-models", - "docs/dbt-cloud-apis/discovery-schema-modelByEnv", - "docs/dbt-cloud-apis/discovery-schema-metric", - "docs/dbt-cloud-apis/discovery-schema-metrics", - "docs/dbt-cloud-apis/discovery-schema-source", - "docs/dbt-cloud-apis/discovery-schema-sources", - "docs/dbt-cloud-apis/discovery-schema-seed", - "docs/dbt-cloud-apis/discovery-schema-seeds", - "docs/dbt-cloud-apis/discovery-schema-snapshots", - "docs/dbt-cloud-apis/discovery-schema-test", - "docs/dbt-cloud-apis/discovery-schema-tests", - "docs/dbt-cloud-apis/discovery-schema-exposure", - "docs/dbt-cloud-apis/discovery-schema-exposures", + { + type: "category", + label: "Job", + link: { + type: "doc", + id: "docs/dbt-cloud-apis/discovery-schema-job", + }, + items: [ + "docs/dbt-cloud-apis/discovery-schema-job", + "docs/dbt-cloud-apis/discovery-schema-job-model", + "docs/dbt-cloud-apis/discovery-schema-job-models", + "docs/dbt-cloud-apis/discovery-schema-job-metric", + "docs/dbt-cloud-apis/discovery-schema-job-metrics", + "docs/dbt-cloud-apis/discovery-schema-job-source", + "docs/dbt-cloud-apis/discovery-schema-job-sources", + "docs/dbt-cloud-apis/discovery-schema-job-seed", + "docs/dbt-cloud-apis/discovery-schema-job-seeds", + // "docs/dbt-cloud-apis/discovery-schema-job-snapshot", + "docs/dbt-cloud-apis/discovery-schema-job-snapshots", + "docs/dbt-cloud-apis/discovery-schema-job-test", + "docs/dbt-cloud-apis/discovery-schema-job-tests", + "docs/dbt-cloud-apis/discovery-schema-job-exposure", + "docs/dbt-cloud-apis/discovery-schema-job-exposures", + // "docs/dbt-cloud-apis/discovery-schema-job-macro", + // "docs/dbt-cloud-apis/discovery-schema-job-macros", + ], + }, + { + type: "category", + label: "Applied", + items: [ + "docs/dbt-cloud-apis/discovery-schema-environment-applied-modelHistoricalRuns", + ], + }, + // Uncomment to add Definition subpage, but need to make items non-empty + // { + // type: "category", + // label: "Definition", + // items: [ + // // insert pages here + // ], + // }, ], }, ], }, + { + type: "category", + label: "Semantic Layer APIs", + link: { type: "doc", id: "docs/dbt-cloud-apis/sl-api-overview" }, + items: [ + "docs/dbt-cloud-apis/sl-api-overview", + "docs/dbt-cloud-apis/sl-jdbc", + "docs/dbt-cloud-apis/sl-graphql", + "docs/dbt-cloud-apis/sl-manifest", + ], + }, ], }, { type: "category", label: "Available dbt versions", + link: { type: "doc", id: "docs/dbt-versions/core" }, items: [ "docs/dbt-versions/core", "docs/dbt-versions/upgrade-core-in-cloud", "docs/dbt-versions/product-lifecycles", "docs/dbt-versions/experimental-features", + { + type: "category", + label: "dbt Core upgrade guides", + link: { + type: "generated-index", + title: "Version upgrade guides", + description: + "Learn what's new in the latest version of dbt Core.", + slug: "/docs/dbt-versions/core-upgrade", + }, + items: [ + { + type: "autogenerated", + dirName: "docs/dbt-versions/core-upgrade", + }, + ], + }, { type: "category", label: "dbt Cloud Release Notes", + link: { type: "doc", id: "docs/dbt-versions/dbt-cloud-release-notes" }, items: [ "docs/dbt-versions/dbt-cloud-release-notes", { @@ -582,6 +707,8 @@ const sidebarSettings = { 
"reference/resource-configs/doris-configs", "reference/resource-configs/fal-configs", "reference/resource-configs/oracle-configs", + "reference/resource-configs/upsolver-configs", + "reference/resource-configs/starrocks-configs", ], }, { @@ -589,11 +716,11 @@ const sidebarSettings = { label: "Resource configs and properties", items: [ "reference/configs-and-properties", + "reference/resource-configs/resource-path", { type: "category", label: "General properties", items: [ - "reference/resource-properties/access", "reference/resource-properties/columns", "reference/resource-properties/config", "reference/resource-properties/constraints", @@ -610,6 +737,7 @@ const sidebarSettings = { type: "category", label: "General configs", items: [ + "reference/resource-configs/access", "reference/resource-configs/alias", "reference/resource-configs/database", "reference/resource-configs/enabled", @@ -644,6 +772,7 @@ const sidebarSettings = { "reference/seed-properties", "reference/seed-configs", "reference/resource-configs/column_types", + "reference/resource-configs/delimiter", "reference/resource-configs/quote_columns", ], }, @@ -671,6 +800,7 @@ const sidebarSettings = { "reference/resource-configs/limit", "reference/resource-configs/severity", "reference/resource-configs/store_failures", + "reference/resource-configs/store_failures_as", "reference/resource-configs/where", ], }, @@ -824,10 +954,16 @@ const sidebarSettings = { { type: "category", label: "Database Permissions", - items: ["reference/snowflake-permissions"], + items: [ + "reference/database-permissions/about-database-permissions", + "reference/database-permissions/databricks-permissions", + "reference/database-permissions/postgres-permissions", + "reference/database-permissions/redshift-permissions", + "reference/database-permissions/snowflake-permissions", + ], }, ], - guides: [ + bestpractices: [ { type: "category", label: "Best practices", @@ -836,7 +972,7 @@ const sidebarSettings = { title: "Best practice guides", description: "Learn how dbt Labs approaches building projects through our current viewpoints on structure, style, and setup.", - slug: "/guides/best-practices", + slug: "best-practices", }, items: [ { @@ -844,13 +980,14 @@ const sidebarSettings = { label: "How we structure our dbt projects", link: { type: "doc", - id: "guides/best-practices/how-we-structure/1-guide-overview", + id: "best-practices/how-we-structure/1-guide-overview", }, items: [ - "guides/best-practices/how-we-structure/2-staging", - "guides/best-practices/how-we-structure/3-intermediate", - "guides/best-practices/how-we-structure/4-marts", - "guides/best-practices/how-we-structure/5-the-rest-of-the-project", + "best-practices/how-we-structure/2-staging", + "best-practices/how-we-structure/3-intermediate", + "best-practices/how-we-structure/4-marts", + "best-practices/how-we-structure/5-semantic-layer-marts", + "best-practices/how-we-structure/6-the-rest-of-the-project", ], }, { @@ -858,242 +995,63 @@ const sidebarSettings = { label: "How we style our dbt projects", link: { type: "doc", - id: "guides/best-practices/how-we-style/0-how-we-style-our-dbt-projects", + id: "best-practices/how-we-style/0-how-we-style-our-dbt-projects", }, items: [ - "guides/best-practices/how-we-style/1-how-we-style-our-dbt-models", - "guides/best-practices/how-we-style/2-how-we-style-our-sql", - "guides/best-practices/how-we-style/3-how-we-style-our-python", - "guides/best-practices/how-we-style/4-how-we-style-our-jinja", - 
"guides/best-practices/how-we-style/5-how-we-style-our-yaml", - "guides/best-practices/how-we-style/6-how-we-style-conclusion", + "best-practices/how-we-style/1-how-we-style-our-dbt-models", + "best-practices/how-we-style/2-how-we-style-our-sql", + "best-practices/how-we-style/3-how-we-style-our-python", + "best-practices/how-we-style/4-how-we-style-our-jinja", + "best-practices/how-we-style/5-how-we-style-our-yaml", + "best-practices/how-we-style/6-how-we-style-conclusion", ], }, { type: "category", - label: "Materializations best practices", + label: "How we build our metrics", link: { type: "doc", - id: "guides/best-practices/materializations/materializations-guide-1-guide-overview", + id: "best-practices/how-we-build-our-metrics/semantic-layer-1-intro", }, items: [ - "guides/best-practices/materializations/materializations-guide-2-available-materializations", - "guides/best-practices/materializations/materializations-guide-3-configuring-materializations", - "guides/best-practices/materializations/materializations-guide-4-incremental-models", - "guides/best-practices/materializations/materializations-guide-5-best-practices", - "guides/best-practices/materializations/materializations-guide-6-examining-builds", - "guides/best-practices/materializations/materializations-guide-7-conclusion", + "best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models", + "best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics", + "best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart", + "best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics", + "best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion", ], }, { type: "category", - label: "dbt Cloud Environment best practices", + label: "How we build our dbt Mesh projects", link: { type: "doc", - id: "guides/best-practices/environment-setup/1-env-guide-overview", + id: "best-practices/how-we-mesh/mesh-1-intro", }, items: [ - "guides/best-practices/environment-setup/2-one-deployment-environment", - "guides/best-practices/environment-setup/3-many-deployment-environments", + "best-practices/how-we-mesh/mesh-2-structures", + "best-practices/how-we-mesh/mesh-3-implementation", ], }, - "guides/best-practices/debugging-errors", - "guides/best-practices/writing-custom-generic-tests", - ], - }, - { - type: "category", - label: "Orchestration", - link: { - type: "generated-index", - title: "Orchestration guides", - description: - "Learn how to orchestrate your data transformations in dbt, using dbt Cloud, a variety of popular tools, or both working together.", - slug: "/guides/orchestration", - }, - items: [ { type: "category", - label: "Airflow and dbt Cloud", + label: "Materialization best practices", link: { type: "doc", - id: "guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud", + id: "best-practices/materializations/materializations-guide-1-guide-overview", }, items: [ - "guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud", - "guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud", - "guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs", + "best-practices/materializations/materializations-guide-2-available-materializations", + "best-practices/materializations/materializations-guide-3-configuring-materializations", + "best-practices/materializations/materializations-guide-4-incremental-models", + "best-practices/materializations/materializations-guide-5-best-practices", + 
"best-practices/materializations/materializations-guide-6-examining-builds", + "best-practices/materializations/materializations-guide-7-conclusion", ], }, - { - type: "category", - label: "Customizing CI/CD", - link: { - type: "doc", - id: "guides/orchestration/custom-cicd-pipelines/1-cicd-background", - }, - items: [ - "guides/orchestration/custom-cicd-pipelines/2-lint-on-push", - "guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge", - "guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr", - "guides/orchestration/custom-cicd-pipelines/5-something-to-consider", - ], - }, - { - type: "category", - label: "Webhooks with dbt Cloud and SaaS apps", - link: { - type: "generated-index", - title: "Use dbt Cloud's webhooks with other SaaS apps", - description: - "Learn how to use webhooks to trigger actions in other tools by using Zapier or a serverless platform.", - slug: "/guides/orchestration/webhooks", - }, - items: [ - { - type: "autogenerated", - dirName: "guides/orchestration/webhooks", - }, - ], - }, - "guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs", - ], - }, - { - type: "category", - label: "Migration", - items: [ - { - type: "category", - label: "Versions", - link: { - type: "generated-index", - title: "Version migration guides", - description: - "Learn how to upgrade to the latest version of dbt Core.", - slug: "/guides/migration/versions", - }, - items: [ - { - type: "autogenerated", - dirName: "guides/migration/versions", - }, - ], - }, - { - type: "category", - label: "Tools", - link: { - type: "generated-index", - title: "Tool migration guides", - description: - "Learn how to migrate to dbt from other tools and platforms.", - slug: "/guides/migration/tools", - }, - items: [ - { - type: "category", - label: "Migrating from stored procedures", - link: { - type: "doc", - id: "guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures", - }, - items: [ - "guides/migration/tools/migrating-from-stored-procedures/2-inserts", - "guides/migration/tools/migrating-from-stored-procedures/3-updates", - "guides/migration/tools/migrating-from-stored-procedures/4-deletes", - "guides/migration/tools/migrating-from-stored-procedures/5-merges", - "guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion", - ], - }, - "guides/migration/tools/migrating-from-spark-to-databricks", - "guides/migration/tools/refactoring-legacy-sql", - ], - }, - ], - }, - { - type: "category", - label: "dbt Ecosystem", - link: { - type: "generated-index", - title: "dbt Ecosystem guides", - description: "Learn about the dbt ecosystem and how to build with dbt.", - slug: "/guides/dbt-ecosystem/", - }, - items: [ - { - type: "category", - label: "Adapter development", - link: { - type: "doc", - id: "guides/dbt-ecosystem/adapter-development/1-what-are-adapters", - }, - items: [ - "guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter", - "guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", - "guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter", - "guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", - "guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter", - "guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter", - ], - }, - { - type: "category", - label: "dbt Python Snowpark", - link: { - type: "doc", - id: "guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark", - }, - items: [ - 
"guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration", - "guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source", - "guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt", - "guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name", - "guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure", - "guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure", - "guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging", - "guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations", - "guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations", - "guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep", - "guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction", - "guides/dbt-ecosystem/dbt-python-snowpark/13-testing", - "guides/dbt-ecosystem/dbt-python-snowpark/14-documentation", - "guides/dbt-ecosystem/dbt-python-snowpark/15-deployment", - ], - }, - { - type: "category", - label: "Databricks and dbt", - link: { - type: "doc", - id: "guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project", - }, - items: [ - "guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices", - "guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks", - "guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project", - ], - }, - "guides/dbt-ecosystem/sl-partner-integration-guide", - ], - }, - { - type: "category", - label: "Advanced", - items: [ - "guides/advanced/creating-new-materializations", - "guides/advanced/using-jinja", - ], - }, - { - type: "category", - label: "Legacy", - items: [ - "guides/legacy/debugging-schema-names", - "guides/legacy/best-practices", - "guides/legacy/building-packages", - "guides/legacy/videos", + "best-practices/writing-custom-generic-tests", + "best-practices/best-practice-workflows", + "best-practices/dbt-unity-catalog-best-practices", ], }, ], @@ -1145,6 +1103,7 @@ const sidebarSettings = { "community/resources/oss-expectations", "community/resources/oss-projects", "community/resources/contributor-license-agreements", + "community/resources/jobs-terms-and-conditions", "community/resources/speaking-at-a-meetup", ], }, diff --git a/website/snippets/_adapters-trusted.md b/website/snippets/_adapters-trusted.md new file mode 100644 index 00000000000..7747ce16dec --- /dev/null +++ b/website/snippets/_adapters-trusted.md @@ -0,0 +1,20 @@ +
      + + + + + + + +
      diff --git a/website/snippets/_adapters-verified.md b/website/snippets/_adapters-verified.md new file mode 100644 index 00000000000..ebb91cb4544 --- /dev/null +++ b/website/snippets/_adapters-verified.md @@ -0,0 +1,61 @@ +
      + + + + + + + + + + + + + + + + + + + + + + +
      + diff --git a/website/snippets/_available-tiers-privatelink.md b/website/snippets/_available-tiers-privatelink.md new file mode 100644 index 00000000000..4a3a147d8c6 --- /dev/null +++ b/website/snippets/_available-tiers-privatelink.md @@ -0,0 +1,9 @@ +:::info Limited to certain Enterprise tiers + +The PrivateLink feature is available on the following dbt Cloud Enterprise tiers: + * Business Critical + * Virtual Private + +To learn more about these tiers, contact us at . + +::: \ No newline at end of file diff --git a/website/snippets/_cloud-cli-flag.md b/website/snippets/_cloud-cli-flag.md new file mode 100644 index 00000000000..523591a438c --- /dev/null +++ b/website/snippets/_cloud-cli-flag.md @@ -0,0 +1,5 @@ +:::info Public preview functionality + +The dbt Cloud CLI is currently in [public preview](/docs/dbt-versions/product-lifecycles#dbt-cloud). Share feedback or request features you'd like to see on the [dbt community Slack](https://getdbt.slack.com/archives/C05M77P54FL). + +::: diff --git a/website/snippets/_cloud-environments-info.md b/website/snippets/_cloud-environments-info.md index d8ea7e3d799..2488e1d6c17 100644 --- a/website/snippets/_cloud-environments-info.md +++ b/website/snippets/_cloud-environments-info.md @@ -3,17 +3,17 @@ In dbt Cloud, there are two types of environments: - Deployment environment — Determines the settings used when jobs created within that environment are executed. -- Development environment — Determines the settings used in the dbt Cloud IDE for that particular dbt Cloud project. +- Development environment — Determines the settings used in the dbt Cloud IDE or dbt Cloud CLI, for that particular project. Each dbt Cloud project can only have a single development environment but can have any number of deployment environments. | | Development Environments | Deployment Environments | | --- | --- | --- | -| Determines settings for | dbt Cloud IDE | dbt Cloud Job runs | +| Determines settings for | dbt Cloud IDE or dbt Cloud CLI | dbt Cloud Job runs | | How many can I have in my project? | 1 | Any number | :::note -For users familiar with development on the CLI, each environment is roughly analogous to an entry in your `profiles.yml` file, with some additional information about your repository to ensure the proper version of code is executed. More info on dbt core environments [here](/docs/core/dbt-core-environments). +For users familiar with development on dbt Core, each environment is roughly analogous to an entry in your `profiles.yml` file, with some additional information about your repository to ensure the proper version of code is executed. More info on dbt core environments [here](/docs/core/dbt-core-environments). ::: ## Common environment settings @@ -38,7 +38,43 @@ Both development and deployment environments have a section called **General Set By default, all environments will use the default branch in your repository (usually the `main` branch) when accessing your dbt code. This is overridable within each dbt Cloud Environment using the **Default to a custom branch** option. This setting have will have slightly different behavior depending on the environment type: -- **Development**: determines which branch in the dbt Cloud IDE developers create branches from and open PRs against +- **Development**: determines which branch in the dbt Cloud IDE or dbt Cloud CLI developers create branches from and open PRs against. - **Deployment:** determines the branch is cloned during job executions for each environment. 
For more info, check out this [FAQ page on this topic](/faqs/Environments/custom-branch-settings)! + + +### Extended attributes (Beta) + +:::important This feature is currently in beta +Extended Attributes is currently in [beta](/docs/dbt-versions/product-lifecycles?) for select users and is subject to change. +::: + +:::note +Extended attributes are retrieved and applied only at runtime when `profiles.yml` is requested for a specific Cloud run. Extended attributes are currently _not_ taken into consideration for Cloud-specific features such as PrivateLink or SSH Tunneling that do not rely on `profiles.yml` values. +::: + +Extended Attributes is a feature that allows users to set a flexible [profiles.yml](/docs/core/connect-data-platform/profiles.yml) snippet in their dbt Cloud Environment settings. It provides users with more control over environments (both deployment and development) and extends how dbt Cloud connects to the data platform within a given environment. + +Extended Attributes is a text box extension at the environment level that overrides connection or environment credentials, including any custom environment variables. You can set any YAML attributes that a dbt adapter accepts in its `profiles.yml`. + +Note that Extended Attributes doesn't mask secret values. We recommend not setting secret values here so they aren't visible in the text box and logs. + +
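For illustration, the following is a hedged sketch with placeholder values (assuming a hypothetical BigQuery connection) of the kind of override described in the next paragraphs, where an entire top-level connection key is supplied with its sub-fields nested beneath it:

```yaml
# Sketch only: placeholder values, assuming a BigQuery connection.
# Sub-keys can't be overridden individually, so the whole top-level
# keyfile_json key is supplied with its sub-fields nested under it.
keyfile_json:
  project_id: my-gcp-project                                        # hypothetical
  client_email: dbt-runner@my-gcp-project.iam.gserviceaccount.com   # hypothetical
  # ...the remaining service account fields would be included here as well
```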
+ +If you're developing in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), or [orchestrating job runs](/docs/deploy/deployments), Extended Attributes parses the provided YAML and extracts the `profiles.yml` attributes. For each individual attribute: + +- If the attribute exists in another source (such as your project settings or an environment-level value), the value set in Extended Attributes replaces it in the profile. It also overrides any custom environment variables. + +- If the attribute doesn't exist, it will add the attribute-value pair to the profile. + +Only the **top-level keys** are accepted in extended attributes. This means that if you want to change a specific sub-key value, you must provide the entire top-level key as a JSON block in your resulting YAML. For example, if you want to customize a particular field within a [service account JSON](/docs/core/connect-data-platform/bigquery-setup#service-account-json) for your BigQuery connection (like 'project_id' or 'client_email'), you need to provide an override for the entire top-level `keyfile_json` key using extended attributes, including the sub-fields as a nested JSON block. + +The following code is an example of the types of attributes you can add in the **Extended Attributes** text box: + +```yaml +dbname: jaffle_shop +schema: dbt_alice +threads: 4 +``` + diff --git a/website/snippets/_config-prop-callout.md b/website/snippets/_config-prop-callout.md new file mode 100644 index 00000000000..f21c335734a --- /dev/null +++ b/website/snippets/_config-prop-callout.md @@ -0,0 +1 @@ +{props.title} are "special properties" in that you can't configure them in the dbt_project.yml file or using config() blocks. Refer to Configs and properties for more info. diff --git a/website/snippets/_discovery_api_job_deprecation_notice.md b/website/snippets/_discovery_api_job_deprecation_notice.md new file mode 100644 index 00000000000..71e80a958b4 --- /dev/null +++ b/website/snippets/_discovery_api_job_deprecation_notice.md @@ -0,0 +1,7 @@ +:::caution +dbt Labs is making changes to the Discovery API. These changes will take effect on September 7, 2023. + +The data type `Int` for `id` is being deprecated and will be replaced with `BigInt`. Currently, both data types are supported. + +To perform job-based queries, you must query within the `job` schema object and move the `jobId` and `runId` arguments to `job(...)`. This is now supported so you can update your API calls accordingly. For details, refer to [Job object schema](/docs/dbt-cloud-apis/discovery-schema-job). +::: diff --git a/website/snippets/_enterprise-permissions-table.md b/website/snippets/_enterprise-permissions-table.md index 75ced50dc2b..3eb313e0f5b 100644 --- a/website/snippets/_enterprise-permissions-table.md +++ b/website/snippets/_enterprise-permissions-table.md @@ -7,7 +7,7 @@ Key: Permissions: * Account-level permissions — Permissions related to management of the dbt Cloud account. For example, billing and account settings. -* Project-level permissions — Permissions related to the projects in dbt Cloud. For example, repos and access to the IDE. +* Project-level permissions — Permissions related to the projects in dbt Cloud. For example, repos and access to the IDE or dbt Cloud CLI. ### Account roles Account roles enable you to manage the dbt Cloud account and manage the account settings (for example, generating service tokens, inviting users, configuring SSO).
They also provide project-level permissions. The **Account Admin** role is the highest level of access you can assign. @@ -20,8 +20,10 @@ Account roles enable you to manage the dbt Cloud account and manage the account | Audit logs | R | | | R | | | Auth provider | W | | | W | R | | Billing | W | W | | | R | +| Groups | W | | R | W | R | | Invitations | W | | W | W | R | | IP restrictions | W | | | W | R | +| Licenses | W | | W | W | R | | Members | W | | W | W | R | | Project (create) | W | | W | | | | Public models | R | R | R | R | R | @@ -34,57 +36,55 @@ Account roles enable you to manage the dbt Cloud account and manage the account |:-------------------------|:-------------:|:-------------:|:---------------:|:--------------:|:------:| | Connections | W | | W | | R | | Credentials | W | | W | | R | -| Custom env. variables | W | | W | | R | +| Custom env. variables | W | | W | | R | | dbt adapters | W | | W | | R | -| Develop (IDE) | W | | W | | | +| Develop (IDE or dbt Cloud CLI) | W | | W | | | | Environments | W | | W | | R | -| Groups | W | | R | W | R | | Jobs | W | | W | | R | -| Licenses | W | | W | W | R | | Metadata | R | | R | | R | | Permissions | W | | W | W | R | | Profile | W | | W | | R | | Projects | W | | W | R | R | | Repositories | W | | W | | R | | Runs | W | | W | | R | -| Semantic Layer Config | W | | W | | R | +| Semantic Layer Config | W | | W | | R | ### Project role permissions -The project roles enable you to work within the projects in various capacities. They primarily provide access to project-level permissions such as repos and the IDE, but may also provide some account-level permissions. +The project roles enable you to work within the projects in various capacities. They primarily provide access to project-level permissions such as repos and the IDE or dbt Cloud CLI, but may also provide some account-level permissions. 
#### Account permissions for project roles -| Account-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job viewer | Metadata | Semantic Layer | Stakeholder | Team admin | Webook | +| Account-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job viewer | Metadata | Semantic Layer | Stakeholder | Team admin | Webhook | |--------------------------|:-----:|:-------:|:--------------:|:---------:|:---------:|:---------:|:-----------:|:--------:|:--------------:|:-----------:|:----------:|:------:| | Account settings | R | | R | | R | | | | | | R | | | Auth provider | | | | | | | | | | | | | | Billing | | | | | | | | | | | | | -| Invitations | W | R | R | R | R | R | R | | | R | R | | -| Members | W | | R | R | R | | | | | R | R | | +| Groups | R | | R | R | R | | | | | R | R | | +| Invitations | W | R | R | R | R | R | R | | | R | R | | +| Licenses | W | R | R | R | R | R | R | | | | R | | +| Members | W | | R | R | R | | | | | R | R | | | Project (create) | | | | | | | | | | | | | -| Public models | R | R | R | R | R | R | R | R | R | R | R | R | +| Public models | R | R | R | R | R | R | R | R | R | R | R | R | | Service tokens | | | | | | | | | | | | | -| Webhooks | W | | | W | | | | | | | | W | +| Webhooks | W | | | W | | | | | | | | W | #### Project permissions for project roles -|Project-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job viewer | Metadata | Semantic Layer | Stakeholder | Team admin | Webook | +|Project-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job viewer | Metadata | Semantic Layer | Stakeholder | Team admin | Webhook | |--------------------------|:-----:|:-------:|:--------------:|:---------:|:---------:|:---------:|:-----------:|:--------:|:--------------:|:-----------:|:----------:|:------:| | Connections | W | R | W | R | R | R | | | | R | R | | | Credentials | W | W | W | W | R | W | | | | R | R | | -| Custom env. variables | W | W | W | W | W | W | R | | | R | W | | +| Custom env. variables | W | W | W | W | W | W | R | | | R | W | | | dbt adapters | W | W | W | W | R | W | | | | R | R | | -| Develop (IDE) | W | W | | W | | | | | | | | | +| Develop (IDE or dbt Cloud CLI) | W | W | | W | | | | | | | | | | Environments | W | R | R | R | R | W | R | | | R | R | | -| Groups | R | | R | R | R | | | | | R | R | | | Jobs | W | R | R | W | R | W | R | | | R | R | | -| Licenses | W | R | R | R | R | R | R | | | | R | | | Metadata | R | R | R | R | R | R | R | R | | R | R | | | Permissions | W | | R | R | R | | | | | | W | | | Profile | W | R | W | R | R | R | | | | R | R | | | Projects | W | W | W | W | W | R | R | | | R | W | | | Repositories | W | | R | R | W | | | | | R | R | | | Runs | W | R | R | W | R | W | R | | | R | R | | -| Semantic Layer Config | W | R | W | R | R | R | | | W | R | R | | \ No newline at end of file +| Semantic Layer Config | W | R | W | R | R | R | | | W | R | R | | diff --git a/website/snippets/_explorer-beta-banner.md b/website/snippets/_explorer-beta-banner.md deleted file mode 100644 index ab501c7bd0f..00000000000 --- a/website/snippets/_explorer-beta-banner.md +++ /dev/null @@ -1,3 +0,0 @@ -:::info Beta -This feature is related to dbt Explorer and cross-project references [beta](/docs/dbt-versions/product-lifecycles#dbt-cloud) projects and subject to change. If you are interested in getting access to the beta, please [contact us](mailto:support@getdbt.com). 
-::: diff --git a/website/snippets/_explorer-beta-note.md b/website/snippets/_explorer-beta-note.md index 3bdcd5bcf7a..b15bd310a58 100644 --- a/website/snippets/_explorer-beta-note.md +++ b/website/snippets/_explorer-beta-note.md @@ -1 +1 @@ -**Note:** Make sure to set the environment to "Production" so you can take advantage of features like dbt Explorer and cross-project references. Refer to [Set product environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) for details. +**Note:** Make sure to set the environment to "Production" so you can take advantage of features like dbt Explorer and cross-project references. Refer to [Set product environment](/docs/deploy/deploy-environments#set-as-production-environment) for details. diff --git a/website/snippets/_legacy-sl-callout.md b/website/snippets/_legacy-sl-callout.md new file mode 100644 index 00000000000..b0575b60729 --- /dev/null +++ b/website/snippets/_legacy-sl-callout.md @@ -0,0 +1,9 @@ +:::important Upgrade to access the latest dbt Semantic Layer + +The dbt Semantic Layer has undergone a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), introducing new features such as dbt Semantic Layer APIs. The APIs integrate with data applications, such as Tableau and Google Sheets, to query metrics and unlock insights. + +For users of the dbt Semantic Layer on version 1.5 or lower — Support for dbt Metrics and the legacy dbt Semantic Layer ends on December 15th, 2023. To access the latest features, migrate to the updated version using the [dbt Semantic Layer migration guide](/guides/sl-migration). + +After December 15th, dbt Labs will no longer support these deprecated features, they will be removed from the dbt Cloud user interface, and their documentation removed from the docs site. + +::: diff --git a/website/snippets/_manifest-versions.md b/website/snippets/_manifest-versions.md new file mode 100644 index 00000000000..c9b3e7af6ec --- /dev/null +++ b/website/snippets/_manifest-versions.md @@ -0,0 +1,11 @@ + +| dbt Core version | Manifest version | +|------------------|---------------------------------------------------------------| +| v1.7 | [v11](https://schemas.getdbt.com/dbt/manifest/v11/index.html) | +| v1.6 | [v10](https://schemas.getdbt.com/dbt/manifest/v10/index.html) | +| v1.5 | [v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html) | +| v1.4 | [v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html) | +| v1.3 | [v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html) | +| v1.2 | [v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html) | +| v1.1 | [v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html) | +| v1.0 | [v4](https://schemas.getdbt.com/dbt/manifest/v4/index.html) | \ No newline at end of file diff --git a/website/snippets/_microsoft-adapters-soon.md b/website/snippets/_microsoft-adapters-soon.md new file mode 100644 index 00000000000..927c9d2e5ca --- /dev/null +++ b/website/snippets/_microsoft-adapters-soon.md @@ -0,0 +1,3 @@ +:::tip Coming soon +dbt Cloud support for the Azure Synapse Analytics adapter is coming soon! 
+::: \ No newline at end of file diff --git a/website/snippets/_new-sl-changes.md b/website/snippets/_new-sl-changes.md new file mode 100644 index 00000000000..6eca327001a --- /dev/null +++ b/website/snippets/_new-sl-changes.md @@ -0,0 +1,8 @@ + +:::tip Introducing the new dbt Semantic Layer 🎉 + +The dbt Semantic Layer has been re-released with [significant improvements](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. + +The new version is available in [public beta](/docs/dbt-versions/release-notes/Aug-2023/sl-revamp-beta#public-beta) and introduces [MetricFlow](/docs/build/about-metricflow), an essential component. It also includes new semantic elements, better governance, improved efficiency, easier data access, and new dbt Semantic Layer APIs. + +::: diff --git a/website/snippets/_new-sl-setup.md b/website/snippets/_new-sl-setup.md new file mode 100644 index 00000000000..3cb6e09eb4c --- /dev/null +++ b/website/snippets/_new-sl-setup.md @@ -0,0 +1,39 @@ +You can set up the dbt Semantic Layer in dbt Cloud at the environment and project level. Before you begin: + +- You must have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment. Single-tenant coming soon. +- You must be part of the Owner group, and have the correct [license](/docs/cloud/manage-access/seats-and-users) and [permissions](/docs/cloud/manage-access/self-service-permissions) to configure the Semantic Layer: + * Enterprise plan — Developer license with Account Admin permissions. Or Owner with a Developer license, assigned Project Creator, Database Admin, or Admin permissions. + * Team plan — Owner with a Developer license. +- You must have a successful run in your new environment. + +:::tip +If you're using the legacy Semantic Layer, dbt Labs strongly recommends that you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt version 1.6 or newer to use the latest dbt Semantic Layer. Refer to the dedicated [migration guide](/guides/sl-migration) for details. +::: + +1. In dbt Cloud, create a new [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) or use an existing environment on dbt 1.6 or higher. + * Note — Deployment environment is currently supported (_development experience coming soon_) + +2. Navigate to **Account Settings** and select the specific project you want to enable the Semantic Layer for. + +3. In the **Project Details** page, navigate to the **Semantic Layer** section, and select **Configure Semantic Layer**. + + + +4. In the **Set Up Semantic Layer Configuration** page, enter the credentials you want the Semantic Layer to use specific to your data platform. We recommend credentials have the least privileges required because your Semantic Layer users will be querying it in downstream applications. At a minimum, the Semantic Layer needs to have read access to the schema(s) that contains the dbt models that you used to build your semantic models. + + + +5. Select the deployment environment you want for the Semantic Layer and click **Save**. + +6. After saving it, you'll be provided with the connection information that allows you to connect to downstream tools. If your tool supports JDBC, save the JDBC URL or individual components (like environment id and host). If it uses the GraphQL API, save the GraphQL API host information instead. + + + +7. 
Save and copy your environment ID, service token, and host, which you'll need to use downstream tools. For more info on how to integrate with partner integrations, refer to [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations). + +8. Return to the **Project Details** page, then select **Generate Service Token**. You will need Semantic Layer Only and Metadata Only [service token](/docs/dbt-cloud-apis/service-tokens) permissions. + + + +Great job, you've configured the Semantic Layer 🎉! + diff --git a/website/snippets/_onrunstart-onrunend-commands.md b/website/snippets/_onrunstart-onrunend-commands.md new file mode 100644 index 00000000000..68d693ce426 --- /dev/null +++ b/website/snippets/_onrunstart-onrunend-commands.md @@ -0,0 +1 @@ +dbt build, dbt compile, dbt docs generate, dbt run, dbt seed, dbt snapshot, or dbt test. diff --git a/website/snippets/_sl-configure-metricflow.md b/website/snippets/_sl-configure-metricflow.md new file mode 100644 index 00000000000..10f92161783 --- /dev/null +++ b/website/snippets/_sl-configure-metricflow.md @@ -0,0 +1 @@ +MetricFlow requires a time spine for certain metric types and join resolution patterns, like cumulative metrics. You will have to create this model in your dbt project. [This article](/docs/build/metricflow-time-spine) explains how to add the `metricflow_time_spine` model to your project. diff --git a/website/snippets/_sl-connect-and-query-api.md b/website/snippets/_sl-connect-and-query-api.md new file mode 100644 index 00000000000..429f41c3bf6 --- /dev/null +++ b/website/snippets/_sl-connect-and-query-api.md @@ -0,0 +1,10 @@ +You can query your metrics in a JDBC-enabled tool or use existing first-class integrations with the dbt Semantic Layer. + +You must have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment. Single-tenant coming soon. + +- To learn how to use the JDBC or GraphQL API and what tools you can query it with, refer to [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview). + + * To authenticate, you need to [generate a service token](/docs/dbt-cloud-apis/service-tokens) with Semantic Layer Only and Metadata Only permissions. + * Refer to the [SQL query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. + +- To learn more about the sophisticated integrations that connect to the dbt Semantic Layer, refer to [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) for more info. diff --git a/website/snippets/_sl-create-semanticmodel.md b/website/snippets/_sl-create-semanticmodel.md new file mode 100644 index 00000000000..6e0376ab10b --- /dev/null +++ b/website/snippets/_sl-create-semanticmodel.md @@ -0,0 +1,150 @@ +The following steps describe how to set up semantic models. Semantic models consist of [entities](/docs/build/entities), [dimensions](/docs/build/dimensions), and [measures](/docs/build/measures). + +We highly recommend you read the overview of what a [semantic model](/docs/build/semantic-models) is before getting started. If you're working in the [Jaffle shop example](https://github.com/dbt-labs/jaffle-sl-template), delete the `orders.yml` config or delete the .yml extension so it's ignored during parsing. **We'll be rebuilding it step by step in this example.** + +If you're following the guide in your own project, pick a model that you want to build a semantic manifest from and fill in the config values accordingly. + +1. 
Create a new yml config file for the orders model, such as `orders.yml`. + +It's best practice to create semantic models in the `/models/semantic_models` directory in your project. Semantic models are nested under the `semantic_models` key. First, fill in the name and appropriate metadata, map it to a model in your dbt project, and specify model defaults. For now, `default_agg_time_dimension` is the only supported default. + +```yaml +semantic_models: + #The name of the semantic model. + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + #The name of the dbt model and schema + model: ref('orders') + ``` + +2. Define your entities. These are the keys in your table that MetricFlow will use to join other semantic models. These are usually columns like `customer_id`, `order_id`, and so on. + +```yaml + #Entities. These usually correspond to keys in the table. + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + ``` + +3. Define your dimensions and measures. Dimensions are properties of the records in your table that are non-aggregatable. They provide categorical or time-based context to enrich metrics. Measures are the building block for creating metrics. They are numerical columns that MetricFlow aggregates to create metrics. + +```yaml + #Measures. These are the aggregations on the columns in the table. + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum + - name: customers_with_orders + description: Distinct count of customers placing orders + agg: count_distinct + expr: customer_id + - name: locations_with_orders + description: Distinct count of locations with order + expr: location_id + agg: count_distinct + - name: order_cost + description: The cost for each order item. Cost is calculated as a sum of the supply cost for each order item. + agg: sum + #Dimensions. Either categorical or time. These add additional context to metrics. The typical querying pattern is Metric by Dimension. + dimensions: + - name: ordered_at + type: time + type_params: + time_granularity: day + - name: order_total_dim + type: categorical + expr: order_total + - name: is_food_order + type: categorical + - name: is_drink_order + type: categorical +``` + +Putting it all together, a complete semantic model configurations based on the order model would look like the following example: + +```yaml +semantic_models: + #The name of the semantic model. + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + #The name of the dbt model and schema + model: ref('orders') + #Entities. These usually corespond to keys in the table. + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + #Measures. These are the aggregations on the columns in the table. + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. 
+ agg: sum + - name: customers_with_orders + description: Distinct count of customers placing orders + agg: count_distinct + expr: customer_id + - name: locations_with_orders + description: Distinct count of locations with order + expr: location_id + agg: count_distinct + - name: order_cost + description: The cost for each order item. Cost is calculated as a sum of the supply cost for each order item. + agg: sum + #Dimensions. Either categorical or time. These add additional context to metrics. The typical querying pattern is Metric by Dimension. + dimensions: + - name: ordered_at + type: time + type_params: + time_granularity: day + - name: order_total_dim + type: categorical + expr: order_total + - name: is_food_order + type: categorical + - name: is_drink_order + type: categorical +``` + +:::tip +If you're familiar with writing SQL, you can think of dimensions as the columns you would group by and measures as the columns you would aggregate. + +```sql +select + metric_time_day, -- time + country, -- categorical dimension + sum(revenue_usd) -- measure +from + snowflake.fact_transactions -- sql table +group by metric_time_day, country -- dimensions + ``` + +::: diff --git a/website/snippets/_sl-define-metrics.md b/website/snippets/_sl-define-metrics.md new file mode 100644 index 00000000000..af3ee9f297f --- /dev/null +++ b/website/snippets/_sl-define-metrics.md @@ -0,0 +1,21 @@ +Now that you've created your first semantic model, it's time to define your first metric! You can define metrics with the dbt Cloud IDE or command line. + +MetricFlow supports different metric types like [simple](/docs/build/simple), [ratio](/docs/build/ratio), [cumulative](/docs/build/cumulative), and [derived](/docs/build/derived). It's recommended that you read the [metrics overview docs](/docs/build/metrics-overview) before getting started. + +1. You can define metrics in the same YAML files as your semantic models or create a new file. If you want to create your metrics in a new file, create another directory called `/models/metrics`. The file structure for metrics can become more complex from here if you need to further organize your metrics, for example, by data source or business line. + +2. The example metric we'll create is a simple metric that refers directly to the `order_total` measure, which will be implemented as a `sum()` function in SQL. Again, if you're working in the Jaffle shop sandbox, we recommend deleting the original `orders.yml` file, or removing the .yml extension so it's ignored during parsing. We'll be rebuilding the `order_total` metric from scratch. If you're working in your own project, create a simple metric like the one below using one of the measures you created in the previous step. + +```yaml +metrics: + - name: order_total + description: Sum of total order amount. Includes tax + revenue. + type: simple + label: Order Total + type_params: + measure: order_total +``` + +3. Save your code, and in the next section, you'll validate your configs before committing them to your repository. + +To continue building out your metrics based on your organization's needs, refer to the [Build your metrics](/docs/build/build-metrics-intro) for detailed info on how to define different metric types and semantic models. 
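To show how one of the other metric types mentioned above builds on these measures, here is a hedged sketch of a ratio metric. The metric names are illustrative and not part of the original guide, and it assumes you also define a simple `order_count` metric on the `order_count` measure from the previous step, since a ratio metric's numerator and denominator reference other metrics:

```yaml
metrics:
  - name: order_count                 # illustrative: a simple metric used as the denominator
    description: Count of orders placed.
    type: simple
    label: Order Count
    type_params:
      measure: order_count            # measure defined in the semantic model step
  - name: average_order_value         # illustrative name
    description: Average revenue per order, calculated as order total divided by order count.
    type: ratio
    label: Average Order Value
    type_params:
      numerator: order_total          # the simple metric defined earlier in this guide
      denominator: order_count        # the simple metric defined above
```

As with the `order_total` example, you would save this alongside your semantic models (or in `/models/metrics`) and validate it before committing.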
diff --git a/website/snippets/_sl-deprecation-notice.md b/website/snippets/_sl-deprecation-notice.md
new file mode 100644
index 00000000000..19bf19c2d90
--- /dev/null
+++ b/website/snippets/_sl-deprecation-notice.md
@@ -0,0 +1,7 @@
+:::info Deprecation of dbt Metrics and the legacy dbt Semantic Layer
+If you use the dbt Semantic Layer on dbt version 1.5 or lower, note that support for dbt Metrics and the legacy dbt Semantic Layer ends on December 15th, 2023. To access the latest features, migrate to the updated version using the [dbt Semantic Layer migration guide](/guides/sl-migration).
+
+
+After December 15th, dbt Labs will no longer support these deprecated features; they will be removed from the dbt Cloud user interface, and their documentation will be removed from the docs site.
+
+:::
diff --git a/website/snippets/_sl-partner-links.md b/website/snippets/_sl-partner-links.md
new file mode 100644
index 00000000000..c97c682171b
--- /dev/null
+++ b/website/snippets/_sl-partner-links.md
@@ -0,0 +1,105 @@
+The following tools integrate with the dbt Semantic Layer:
+
+Before you connect to these tools, you'll first need to [set up the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) and [generate a service token](/docs/dbt-cloud-apis/service-tokens) with **Semantic Layer Only** and **Metadata Only** permissions.
diff --git a/website/snippets/_sl-plan-info.md b/website/snippets/_sl-plan-info.md
new file mode 100644
index 00000000000..083ab2209bc
--- /dev/null
+++ b/website/snippets/_sl-plan-info.md
@@ -0,0 +1,2 @@
+To define and query metrics with the {props.product}, you must be on a {props.plan} multi-tenant plan.


+
diff --git a/website/snippets/_sl-run-prod-job.md b/website/snippets/_sl-run-prod-job.md
new file mode 100644
index 00000000000..a637b0b431e
--- /dev/null
+++ b/website/snippets/_sl-run-prod-job.md
@@ -0,0 +1,7 @@
+Once you’ve defined metrics in your dbt project, you can perform a job run in your deployment environment in dbt Cloud to materialize your metrics. Currently, the dbt Semantic Layer supports only the deployment environment.
+
+1. Select **Deploy** from the top navigation bar.
+2. Select **Jobs** to rerun the job with the most recent code in the deployment environment.
+3. Your metric should appear as a red node in the dbt Cloud IDE and dbt directed acyclic graph (DAG).
+
+
diff --git a/website/snippets/_sl-test-and-query-metrics.md b/website/snippets/_sl-test-and-query-metrics.md
new file mode 100644
index 00000000000..43ebd929cb3
--- /dev/null
+++ b/website/snippets/_sl-test-and-query-metrics.md
@@ -0,0 +1,68 @@
+This section explains how you can test and run MetricFlow commands with dbt Cloud or dbt Core (dbt Cloud IDE support coming soon). dbt Cloud IDE users can skip to [Run a production job](#run-a-production-job) to run a model.
+
+:::important Testing and querying metrics in the dbt Cloud IDE is currently not supported
+
+Support for running [MetricFlow commands](/docs/build/metricflow-commands) in the dbt Cloud IDE is not available but is coming soon.
+
+You can use the **Preview** or **Compile** buttons in the IDE to run semantic validations and make sure your metrics are defined. Alternatively, you can run commands with the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) or with SQL client tools like DataGrip, DBeaver, or RazorSQL.
+
+:::
+
+
+
+
+
+
+This section is for people using the dbt Cloud CLI (support for the dbt Cloud IDE is coming soon). With dbt Cloud:
+
+- You can run MetricFlow commands after installing the dbt Cloud CLI. They're integrated with dbt Cloud so you can use them immediately.
+- Your account will automatically manage version control for you.
+
+To get started:
+
+1. Make sure you've installed the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation).
+2. Navigate to your dbt project directory.
+3. Run a dbt command, such as `dbt parse`, `dbt run`, `dbt compile`, or `dbt build`. If you don't, you'll receive an error message that begins with: "ensure that you've ran an artifacts...."
+   - MetricFlow builds a semantic graph and generates a `semantic_manifest.json` file in dbt Cloud, which is stored in the `/target` directory. If using the Jaffle shop example, run `dbt seed && dbt run` to ensure the required data is in your data platform before proceeding.
+
+4. Run `dbt sl --help` to confirm you have MetricFlow installed and that you can view the available commands.
+5. Run `dbt sl query --metrics <metric_name> --group-by <dimension_name>` to query the metrics and dimensions. For example, run `dbt sl query --metrics order_total --group-by metric_time`.
+6. Verify that the metric values are what you expect. To further understand how the metric is being generated, you can view the generated SQL by adding `--compile` to the command.
+7. Commit and merge the code changes that contain the metric definitions.
+
+To streamline your metric querying process, you can connect to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) to access your metrics programmatically. For SQL syntax, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API.
+
+
+
+
+
+
+
+This step is for dbt Core users only. MetricFlow is compatible with Python versions 3.8, 3.9, 3.10, and 3.11. You need to use `pip` to install MetricFlow on Windows or Linux operating systems:
+
+:::note
+The dbt Cloud CLI is strongly recommended for defining and querying metrics for your dbt project in dbt Cloud or dbt Core with MetricFlow. If you're using dbt Core, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow.
+:::
+
+
+1. Install [MetricFlow](/docs/build/metricflow-commands) as an extension of a dbt adapter from PyPI.
+2. Create or activate your virtual environment with `python -m venv venv` or `source your-venv/bin/activate`.
+3. Run `pip install dbt-metricflow`.
+   - You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. To install the adapter, run `pip install "dbt-metricflow[your_adapter_name]"`, replacing `your_adapter_name` with the name of your adapter. For example, for a Snowflake adapter, run `pip install "dbt-metricflow[snowflake]"`.
+   - You'll need to manage versioning between dbt Core, your adapter, and MetricFlow.
+4. Run `dbt parse`. This allows MetricFlow to build a semantic graph and generate a `semantic_manifest.json`.
+   - This creates the file in your `/target` directory. If you're working from the Jaffle shop example, run `dbt seed && dbt run` before proceeding to ensure the data exists in your warehouse.
+5. Run `mf --help` to confirm you have MetricFlow installed and that you can view the available commands.
+6. Run `mf query --metrics <metric_name> --group-by <dimension_name>` to query the metrics and dimensions. For example, run `mf query --metrics order_total --group-by metric_time`.
+7. Verify that the metric values are what you expect. To further understand how the metric is being generated, you can view the generated SQL by adding `--explain` to the command.
+8. Run `mf validate-configs` to run validation on your semantic models and metrics.
+9. Commit and merge the code changes that contain the metric definitions.
+
+To streamline your metric querying process, you can connect to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) to access your metrics programmatically. For SQL syntax, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API.
+
+
+
+
+
diff --git a/website/snippets/_upgrade-move.md b/website/snippets/_upgrade-move.md
new file mode 100644
index 00000000000..7572077fd1b
--- /dev/null
+++ b/website/snippets/_upgrade-move.md
@@ -0,0 +1,5 @@
+:::important Upgrade Guides Are Moving
+
+The location of the dbt Core upgrade guides has changed, and they will soon be removed from `Guides`. The new location is in the `Docs` tab under `Available dbt versions`. You have been redirected to the new URL, so please update any saved links and bookmarks.
+
+:::
\ No newline at end of file
diff --git a/website/snippets/_v2-sl-prerequisites.md b/website/snippets/_v2-sl-prerequisites.md
new file mode 100644
index 00000000000..c80db4d1c8f
--- /dev/null
+++ b/website/snippets/_v2-sl-prerequisites.md
@@ -0,0 +1,39 @@
+
+
+
+- Have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment. Single-tenant coming soon.
+- Have both your production and development environments running dbt version 1.6 or higher. Refer to [upgrade in dbt Cloud](/docs/dbt-versions/upgrade-core-in-cloud) for more info.
+- Use Snowflake, BigQuery, Databricks, or Redshift.
+- Create a successful run in the environment where you configure the Semantic Layer. + - **Note:** Semantic Layer currently supports the Deployment environment for querying. (_development querying experience coming soon_) +- Set up the [Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) in the integrated tool to import metric definitions. + - To access the API and query metrics in downstream tools, you must have a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. dbt Core or Developer accounts can define metrics but won't be able to dynamically query them.
+- Understand the key concepts of [MetricFlow](/docs/build/about-metricflow), which powers the latest dbt Semantic Layer.
+- Note that SSH tunneling for [Postgres and Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) connections, [PrivateLink](/docs/cloud/secure/about-privatelink), and [Single sign-on (SSO)](/docs/cloud/manage-access/sso-overview) isn't supported yet.
+

      + + + + +- Have a multi-tenant dbt Cloud instance, hosted in North America
      +- Have both your production and development environments running dbt version 1.3 or higher
      +- Use Snowflake data platform
      +- Install the dbt metrics package version >=1.3.0, <1.4.0 in your dbt project
      + * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. +- Set up the Discovery API in the integrated tool to import metric definitions + * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      + +
      + + + +- Have a multi-tenant dbt Cloud instance, hosted in North America
      +- Have both your production and development environments running dbt version 1.2
      +- Use Snowflake data platform
      +- Install the dbt metrics package version >=0.3.0, <0.4.0 in your dbt project
      + * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. +- Set up the Discovery API in the integrated tool to import metric definitions + * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      + +
      diff --git a/website/snippets/_version-callout.md b/website/snippets/_version-callout.md new file mode 100644 index 00000000000..45c183c2c0d --- /dev/null +++ b/website/snippets/_version-callout.md @@ -0,0 +1,8 @@ + +:::info Model versions, dbt_project.yml versions, and .yml versions + +Take note that [model versions](/docs/collaborate/govern/model-versions) are different from [dbt_project.yml versions](/reference/project-configs/version#dbt_projectyml-versions) and [.yml property file versions](/reference/project-configs/version#yml-property-file-versions). + +Model versions is a _feature_ that enables better governance and data model management by allowing you to track changes and updates to models over time. dbt_project.yml versions refer to the compatibility of the dbt project with a specific version of dbt. Version numbers within .yml property files inform how dbt parses those YAML files. The latter two are completely optional starting from dbt v1.5. + +::: diff --git a/website/snippets/auth0-uri.md b/website/snippets/auth0-uri.md index c6e0f48d0df..1187902f2e4 100644 --- a/website/snippets/auth0-uri.md +++ b/website/snippets/auth0-uri.md @@ -3,8 +3,9 @@ The URI used for SSO connections on multi-tenant dbt Cloud instances will vary b | Region | dbt Cloud Access URL | Auth0 SSO URI | Auth0 Entity ID * | |--------|-----------------------|-------------------------------|----------------------------------------| -| US | cloud.getdbt.com | https://auth.cloud.getdbt.com/ | us-production-mt | -| EMEA | emea.dbt.com | https://auth.emea.dbt.com/ | emea-production-mt | -| APAC | au.dbt.com | https://auth.au.dbt.com/ | au-production-mt | +| US multi-tenant | cloud.getdbt.com | auth.cloud.getdbt.com | us-production-mt | +| US cell 1 | {account prefix}.us1.dbt.com | auth.cloud.getdbt.com | us-production-mt | +| EMEA | emea.dbt.com | auth.emea.dbt.com | emea-production-mt | +| APAC | au.dbt.com | auth.au.dbt.com | au-production-mt | -*Only applicable to SAML and Okta configurations. \ No newline at end of file +*Only applicable to SAML and Okta configurations. diff --git a/website/snippets/available-enterprise-tier-only.md b/website/snippets/available-enterprise-tier-only.md index 0d75b72287e..184b8da6c34 100644 --- a/website/snippets/available-enterprise-tier-only.md +++ b/website/snippets/available-enterprise-tier-only.md @@ -2,6 +2,6 @@ Connecting an Azure DevOps cloud account is available for organizations using the dbt Cloud Enterprise tier. -Azure DevOps on-premise instances are not supported in dbt Cloud. +dbt Cloud's native Azure DevOps integration does not support Azure DevOps Server (on-premise). Instead, you can [import a project by git URL](/docs/cloud/git/import-a-project-by-git-url) to connect to an Azure DevOps Server. 
::: diff --git a/website/snippets/cloud-feature-parity.md b/website/snippets/cloud-feature-parity.md index bcaa2ef3784..7bc6c91e9ba 100644 --- a/website/snippets/cloud-feature-parity.md +++ b/website/snippets/cloud-feature-parity.md @@ -5,10 +5,10 @@ The following table outlines which dbt Cloud features are supported on the diffe | Scheduler | ✅ | ✅ | ✅ | | Cloud IDE | ✅ | ✅ | ✅ | | Audit logs | ✅ | ✅ | ✅ | -| Discovery API | ✅ | ✅ (select customers) | ❌ | -| Webhooks (Outbound) | ✅ | ❌ | ❌ | -| Continuous Integration, including Slim CI | ✅ | ✅ | ✅ | +| Discovery API | ✅ | ✅ | ❌ | +| Webhooks (Outbound) | ✅ | ✅ | ❌ | +| Continuous Integration, including CI jobs | ✅ | ✅ | ✅ | | Semantic Layer | ✅ (North America Only) | ❌ | ❌ | | IP Restrictions | ✅ | ✅ | ✅ | -| PrivateLink egress | ✅ | ✅ | ✅ | +| PrivateLink egress | ✅ (AWS only)| ✅ | ✅ | | PrivateLink ingress | ❌ | ✅ | ✅ | diff --git a/website/snippets/connect-starburst-trino/roles-starburst-enterprise.md b/website/snippets/connect-starburst-trino/roles-starburst-enterprise.md index ba11508f1b4..f832d52be20 100644 --- a/website/snippets/connect-starburst-trino/roles-starburst-enterprise.md +++ b/website/snippets/connect-starburst-trino/roles-starburst-enterprise.md @@ -1,3 +1,6 @@ -[comment: For context, the section title used for this snippet is "Roles in Starburst Enterprise" ]: # +[comment: For context, the section title used for this snippet is "Roles in Starburst Enterprise" ]: # -If connecting to a Starburst Enterprise cluster with built-in access controls enabled, you can't add the role as a suffix to the username, so the default role for the provided username is used instead. +If connecting to a Starburst Enterprise cluster with built-in access controls +enabled, you must specify a role using the format detailed in [Additional +parameters](#additional-parameters). If a role is not specified, the default +role for the provided username is used. 
\ No newline at end of file diff --git a/website/snippets/core-versions-table.md b/website/snippets/core-versions-table.md index 5832f9f14c3..f1241d8301b 100644 --- a/website/snippets/core-versions-table.md +++ b/website/snippets/core-versions-table.md @@ -2,21 +2,13 @@ | dbt Core | Initial Release | Support Level | Critical Support Until | |------------------------------------------------------------|-----------------|----------------|-------------------------| -| [**v1.6**](/guides/migration/versions/upgrading-to-v1.6) | Jul 31, 2023 | Active | Jul 30, 2024 | -| [**v1.5**](/guides/migration/versions/upgrading-to-v1.5) | Apr 27, 2023 | Critical | Apr 27, 2024 | -| [**v1.4**](/guides/migration/versions/upgrading-to-v1.4) | Jan 25, 2023 | Critical | Jan 25, 2024 | -| [**v1.3**](/guides/migration/versions/upgrading-to-v1.3) | Oct 12, 2022 | Critical | Oct 12, 2023 | -| [**v1.2**](/guides/migration/versions/upgrading-to-v1.2) | Jul 26, 2022 | End of Life* ⚠️ | Jul 26, 2023 | -| [**v1.1**](/guides/migration/versions/upgrading-to-v1.1) ⚠️ | Apr 28, 2022 | End of Life* ⚠️ | Apr 28, 2023 | -| [**v1.0**](/guides/migration/versions/upgrading-to-v1.0) ⚠️ | Dec 3, 2021 | End of Life* ⚠️ | Dec 3, 2022 ⚠️ | +| [**v1.7**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.7) | Nov 2, 2023 | Active | Nov 1, 2024 | +| [**v1.6**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.6) | Jul 31, 2023 | Critical | Jul 30, 2024 | +| [**v1.5**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.5) | Apr 27, 2023 | Critical | Apr 27, 2024 | +| [**v1.4**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.4) | Jan 25, 2023 | Critical | Jan 25, 2024 | +| [**v1.3**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.3) | Oct 12, 2022 | End of Life* ⚠️ | Oct 12, 2023 | +| [**v1.2**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.2) | Jul 26, 2022 | End of Life* ⚠️ | Jul 26, 2023 | +| [**v1.1**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.1) ⚠️ | Apr 28, 2022 | Deprecated ⛔️ | Deprecated ⛔️ | +| [**v1.0**](/docs/dbt-versions/core-upgrade/upgrading-to-v1.0) ⚠️ | Dec 3, 2021 | Deprecated ⛔️ | Deprecated ⛔️ | | **v0.X** ⛔️ | (Various dates) | Deprecated ⛔️ | Deprecated ⛔️ | _*All versions of dbt Core since v1.0 are available in dbt Cloud until further notice. Versions that are EOL do not receive any fixes. For the best support, we recommend upgrading to a version released within the past 12 months._ -### Planned future releases - -_Future release dates are tentative and subject to change._ - -| dbt Core | Planned Release | Critical & dbt Cloud Support Until | -|----------|-----------------|-------------------------------------| -| **v1.7** | _Oct 2023_ | _Oct 2024_ | -| **v1.8** | _Jan 2024_ | _Jan 2025_ | -| **v1.9** | _Apr 2024_ | _Apr 2025_ | diff --git a/website/snippets/dbt-databricks-for-databricks.md b/website/snippets/dbt-databricks-for-databricks.md index 930e7a85a9f..f1c5ec84af1 100644 --- a/website/snippets/dbt-databricks-for-databricks.md +++ b/website/snippets/dbt-databricks-for-databricks.md @@ -1,4 +1,4 @@ :::info If you're using Databricks, use `dbt-databricks` If you're using Databricks, the `dbt-databricks` adapter is recommended over `dbt-spark`. -If you're still using dbt-spark with Databricks consider [migrating from the dbt-spark adapter to the dbt-databricks adapter](/guides/migration/tools/migrating-from-spark-to-databricks#migrate-your-dbt-projects). 
-::: \ No newline at end of file +If you're still using dbt-spark with Databricks consider [migrating from the dbt-spark adapter to the dbt-databricks adapter](/guides/migrate-from-spark-to-databricks). +::: diff --git a/website/snippets/metadata-api-prerequisites.md b/website/snippets/metadata-api-prerequisites.md index 35532e28bdc..6e2d1550223 100644 --- a/website/snippets/metadata-api-prerequisites.md +++ b/website/snippets/metadata-api-prerequisites.md @@ -2,5 +2,5 @@ - dbt Cloud [multi-tenant](/docs/cloud/about-cloud/tenancy#multi-tenant) or [single tenant](/docs/cloud/about-cloud/tenancy#single-tenant) account - You must be on a [Team or Enterprise plan](https://www.getdbt.com/pricing/) -- Your projects must be on dbt version 1.0 or higher. Refer to [Version migration guides](/guides/migration/versions) to upgrade +- Your projects must be on dbt version 1.0 or higher. Refer to [Version migration guides](/docs/dbt-versions/core-upgrade) to upgrade diff --git a/website/snippets/quickstarts/schedule-a-job.md b/website/snippets/quickstarts/schedule-a-job.md index 59d428bdfaa..ab8f4350dbf 100644 --- a/website/snippets/quickstarts/schedule-a-job.md +++ b/website/snippets/quickstarts/schedule-a-job.md @@ -24,15 +24,16 @@ Jobs are a set of dbt commands that you want to run on a schedule. For example, As the `jaffle_shop` business gains more customers, and those customers create more orders, you will see more records added to your source data. Because you materialized the `customers` model as a table, you'll need to periodically rebuild your table to ensure that the data stays up-to-date. This update will happen when you run a job. -1. After creating your deployment environment, you should be directed to the page for new environment. If not, select **Deploy** in the upper left, then click **Jobs**. -2. Click **Create one** and provide a name, for example "Production run", and link to the Environment you just created. -3. Scroll down to "Execution Settings" and select **Generate docs on run**. -4. Under "Commands," add this command as part of your job if you don't see them: - * `dbt build` -5. For this exercise, do _not_ set a schedule for your project to run — while your organization's project should run regularly, there's no need to run this example project on a schedule. Scheduling a job is sometimes referred to as _deploying a project_. -6. Select **Save**, then click **Run now** to run your job. -7. Click the run and watch its progress under "Run history." -8. Once the run is complete, click **View Documentation** to see the docs for your project. +1. After creating your deployment environment, you should be directed to the page for a new environment. If not, select **Deploy** in the upper left, then click **Jobs**. +2. Click **Create one** and provide a name, for example, "Production run", and link to the Environment you just created. +3. Scroll down to the **Execution Settings** section. +4. Under **Commands**, add this command as part of your job if you don't see it: + * `dbt build` +5. Select the **Generate docs on run** checkbox to automatically [generate updated project docs](/docs/collaborate/build-and-view-your-docs) each time your job runs. +6. For this exercise, do _not_ set a schedule for your project to run — while your organization's project should run regularly, there's no need to run this example project on a schedule. Scheduling a job is sometimes referred to as _deploying a project_. +7. Select **Save**, then click **Run now** to run your job. +8. 
Click the run and watch its progress under "Run history." +9. Once the run is complete, click **View Documentation** to see the docs for your project. :::tip Congratulations 🎉! You've just deployed your first dbt project! diff --git a/website/snippets/sl-prerequisites.md b/website/snippets/sl-prerequisites.md index 09ede745431..0c100c299b0 100644 --- a/website/snippets/sl-prerequisites.md +++ b/website/snippets/sl-prerequisites.md @@ -7,7 +7,7 @@ * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. - Set up the Discovery API in the integrated tool to import metric definitions * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
      +- Recommended - Review the dbt metrics page
      @@ -20,7 +20,7 @@ * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. - Set up the Discovery API in the integrated tool to import metric definitions * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
      +- Recommended - Review the dbt metrics page
      @@ -33,6 +33,6 @@ * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. - Set up the Discovery API in the integrated tool to import metric definitions * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
      +- Recommended - Review the dbt metrics page
      diff --git a/website/snippets/slack-notifications-config-steps.md b/website/snippets/slack-notifications-config-steps.md deleted file mode 100644 index e643d4c5644..00000000000 --- a/website/snippets/slack-notifications-config-steps.md +++ /dev/null @@ -1,28 +0,0 @@ -Setting up Slack notifications in dbt Cloud enables you to receive alerts in a chosen Slack channel when a job run succeeds, fails, or is cancelled. - -:::info Note -Currently, Slack notifications can only be configured by one user to one Slack channel. Additionally, you must be an admin of the Slack workspace in order to configure Slack notifications. - -If there have been changes to the user roles and you need to move ownership, please reach out to support@getdbt.com and provide the support team with the necessary information needed to make this change for you. -::: -### Setup the integration - -1. Click the gear in the top right and select **Profile**. -2. Click **Integrations** to the left. - -3. Click **Link your Slack profile** - -4. Allow dbt Labs to access the Slack workspace. If you are a member of multiple, you can select the appropriate workspace from the dropdown menu in the top right corner. - - -### Configure the notifications - -1. Click the gear in the top right and select **Account Settings**. -2. Click **Slack Notifications** to the left and click **Edit** to the right. - -3. You can find the Slack notification settings at the bottom of the page. - -### Disabling the Slack integration - -To disable the integration entirely, navigate back to the Integrations page and click **Disconnect Account** in the Slack pane. Confirm the disconnect, and the option will revert to its original state. - diff --git a/website/snippets/tutorial-document-your-models.md b/website/snippets/tutorial-document-your-models.md index dd9e1592145..9913dbcd1d7 100644 --- a/website/snippets/tutorial-document-your-models.md +++ b/website/snippets/tutorial-document-your-models.md @@ -40,7 +40,12 @@ Adding [documentation](/docs/collaborate/documentation) to your project allows y tests: - accepted_values: values: ['placed', 'shipped', 'completed', 'return_pending', 'returned'] - + - name: customer_id + tests: + - not_null + - relationships: + to: ref('stg_customers') + field: customer_id ``` diff --git a/website/src/components/author/index.js b/website/src/components/author/index.js index a8b7ad7c0ef..6b49295936d 100644 --- a/website/src/components/author/index.js +++ b/website/src/components/author/index.js @@ -4,6 +4,7 @@ import Link from '@docusaurus/Link'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import BlogLayout from '@theme/BlogLayout'; import getAllPosts from '../../utils/get-all-posts'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; function Author(props) { const { authorData } = props @@ -38,7 +39,7 @@ function Author(props) { itemType="http://schema.org/Person">
      - {name} + {name}

      {name}

      diff --git a/website/src/components/communitySpotlightCard/index.js b/website/src/components/communitySpotlightCard/index.js index 5be1179d620..08707a93dd4 100644 --- a/website/src/components/communitySpotlightCard/index.js +++ b/website/src/components/communitySpotlightCard/index.js @@ -1,6 +1,7 @@ import React from 'react' import Link from '@docusaurus/Link'; import styles from './styles.module.css'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; const SpotlightWrapper = ({ isSpotlightMember, frontMatter, children }) => { return isSpotlightMember ? ( @@ -55,13 +56,13 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) {
      {id && isSpotlightMember ? ( {title} ) : ( {title} @@ -100,7 +101,7 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) {
      )} {description && !isSpotlightMember && ( -

      {truncateText(description)}

      +

      )} {socialLinks && isSpotlightMember && socialLinks?.length > 0 && (

      @@ -137,9 +138,27 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) { // Truncate text function truncateText(str) { // Max length of string - const maxLength = 300 + let maxLength = 300 + + // Check if anchor link starts within first 300 characters + let hasLinks = false + if(str.substring(0, maxLength - 3).match(/(?:)/g) + if(linkText?.length && linkText[0]?.length) { + maxLength += linkText[0]?.length + } + } + + const substring = str.substring(0, maxLength - 3) + return str.length > maxLength - ? `${str.substring(0, maxLength - 3)}...` + ? `${substring}...` : str } diff --git a/website/src/components/communitySpotlightCard/styles.module.css b/website/src/components/communitySpotlightCard/styles.module.css index e28c3fe7b41..253a561ebea 100644 --- a/website/src/components/communitySpotlightCard/styles.module.css +++ b/website/src/components/communitySpotlightCard/styles.module.css @@ -77,8 +77,9 @@ div.spotlightMemberCard { .spotlightMemberCard { padding: 3rem 0; } - div.spotlightMemberCard { + :global(#spotlight-members-section) div.spotlightMemberCard { margin-bottom: 0; + padding-left: 0; } .spotlightMemberCard .spotlightMemberImgContainer { flex: 0 0 346px; diff --git a/website/src/components/communitySpotlightList/index.js b/website/src/components/communitySpotlightList/index.js index d91c257122d..6885f5ff2ac 100644 --- a/website/src/components/communitySpotlightList/index.js +++ b/website/src/components/communitySpotlightList/index.js @@ -8,12 +8,29 @@ import CommunitySpotlightCard from '../communitySpotlightCard' const communityTitle = 'Community spotlight' const communityDescription = "The dbt Community is where analytics engineering lives and grows, and you're a part of it! Every quarter we'll be highlighting community members in the dbt Community Spotlight. These are individuals who have gone above and beyond to contribute to the community in a variety of ways. We all see you. We appreciate you. You are awesome." +// This date determines where the 'Previously on the Spotlight" text will show. +// Any spotlight members with a 'dateCreated' field before this date +// will be under the 'Previously..' header. +const currentSpotlightDate = new Date('2023-10-31') + function CommunitySpotlightList({ spotlightData }) { const { siteConfig } = useDocusaurusContext() // Build meta title from communityTitle and docusaurus config site title const metaTitle = `${communityTitle}${siteConfig?.title ? ` | ${siteConfig.title}` : ''}` + // Split spotlight members into current and previous + let currentSpotlightMembers = [] + let previousSpotlightMembers = [] + + spotlightData?.map(member => { + if(currentSpotlightDate > new Date(member?.data?.dateCreated)) { + previousSpotlightMembers.push(member) + } else { + currentSpotlightMembers.push(member) + } + }) + return ( @@ -31,11 +48,19 @@ function CommunitySpotlightList({ spotlightData }) { />
      - {spotlightData && spotlightData.length > 0 ? ( + {currentSpotlightMembers?.length || previousSpotlightMembers?.length ? ( <> - {spotlightData.map((member, i) => ( + {currentSpotlightMembers?.map((member, i) => ( ))} + {previousSpotlightMembers?.length ? ( + <> +

      Previously on the Spotlight

      + {previousSpotlightMembers.map((member, i) => ( + + ))} + + ) : ''} ) :

      No community spotlight members are available at this time. 😕

      diff --git a/website/src/components/discourse/index.js b/website/src/components/discourse/index.js index 18e4d3e7254..97ef08a5272 100644 --- a/website/src/components/discourse/index.js +++ b/website/src/components/discourse/index.js @@ -38,10 +38,8 @@ export const DiscourseFeed = ({ setLoading(true) setIsError(false) - // Build Netlify Function endpoint - const endpoint = window?.location?.hostname?.includes('localhost') - ? 'http://localhost:8888/.netlify/functions/get-discourse-topics' - : '/.netlify/functions/get-discourse-topics' + // Build function endpoint + const endpoint = `/api/get-discourse-topics` // If 'after' prop not passed in, set relative after date let afterDate = after diff --git a/website/src/components/discourseBlogComments/index.js b/website/src/components/discourseBlogComments/index.js index 63279285f2a..7684269f92a 100644 --- a/website/src/components/discourseBlogComments/index.js +++ b/website/src/components/discourseBlogComments/index.js @@ -28,10 +28,8 @@ export const DiscourseBlogComments = ({title,slug}) => { const fetchData = async () => { try { - const endpoint = window?.location?.hostname?.includes('localhost') - ? `http://localhost:8888/.netlify/functions/get-discourse-comments?title=${title}&slug=${slug}` - : `/.netlify/functions/get-discourse-comments?title=${title}&slug=${slug}` - + const endpoint = `/api/get-discourse-comments?title=${title}&slug=${slug}` + const { data } = await axios.get(endpoint) // Set error state if data not available diff --git a/website/src/components/lightbox/index.js b/website/src/components/lightbox/index.js index b4c2da3c905..1c748bbb04f 100644 --- a/website/src/components/lightbox/index.js +++ b/website/src/components/lightbox/index.js @@ -1,5 +1,6 @@ import React from 'react'; import styles from './styles.module.css'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; function Lightbox({ src, @@ -35,7 +36,7 @@ function Lightbox({ data-toggle="lightbox" alt={alt ? alt : title ? title : ''} title={title ? title : ''} - src={src} + src={imageCacheWrapper(src)} /> diff --git a/website/src/components/lineage/index.js b/website/src/components/lineage/index.js index eb59178369d..6c22e2bae99 100644 --- a/website/src/components/lineage/index.js +++ b/website/src/components/lineage/index.js @@ -5,11 +5,11 @@ let Dag = null; try { /** As a private package, not every developer will have access to this repo. */ - const DagImport = require('@dbt-labs/react-dbt-dag'); - require('@dbt-labs/react-dbt-dag/dag.css'); - require('@dbt-labs/react-dbt-dag/dag.standalone.css'); + // const DagImport = require('@dbt-labs/react-dbt-dag'); + // require('@dbt-labs/react-dbt-dag/dag.css'); + // require('@dbt-labs/react-dbt-dag/dag.standalone.css'); - Dag = DagImport.Dag; + // Dag = DagImport.Dag; } catch (err) { /** * react-dbt-dag is a private repo. 
Not all developers of the diff --git a/website/src/components/quickstartGuideCard/index.js b/website/src/components/quickstartGuideCard/index.js index fdc629bd7b0..104bb5cb35b 100644 --- a/website/src/components/quickstartGuideCard/index.js +++ b/website/src/components/quickstartGuideCard/index.js @@ -3,26 +3,67 @@ import Link from "@docusaurus/Link"; import styles from "./styles.module.css"; import getIconType from "../../utils/get-icon-type"; -function QuickstartGuideCard({ frontMatter }) { - const { id, title, time_to_complete, icon } = frontMatter; +export default function QuickstartGuideCard({ frontMatter }) { + const { id, title, time_to_complete, icon, tags, level, recently_updated } = + frontMatter; + return ( - + + {recently_updated && ( + Updated + )} {icon && getIconType(icon, styles.icon)} - +

      {title}

      {time_to_complete && ( {time_to_complete} )} - - Start + + Start + + {(tags || level) && ( +
      + {tags && + tags.map((tag, i) => ( +
      + {tag} +
      + ))} + {level &&
      {level}
      } +
      + )} ); } -export default QuickstartGuideCard; +// Component that handles the information under the title on the quickstart guide page +export function QuickstartGuideTitle({ frontMatter }) { + const { time_to_complete, tags, level, recently_updated } = + frontMatter; + + return ( +
      + {recently_updated && ( + Updated + )} + {time_to_complete && ( + {time_to_complete} + )} + + {(tags || level) && ( +
      + {tags && + tags.map((tag, i) => ( +
      + {tag} +
      + ))} + {level &&
      {level}
      } +
      + )} +
      + ); +} diff --git a/website/src/components/quickstartGuideCard/styles.module.css b/website/src/components/quickstartGuideCard/styles.module.css index 8202f694fcd..5df40c8479e 100644 --- a/website/src/components/quickstartGuideCard/styles.module.css +++ b/website/src/components/quickstartGuideCard/styles.module.css @@ -1,24 +1,28 @@ .quickstartCard { - border: 1px solid #EFF2F3; - border-radius: var(--border-radius); + outline: 1px solid #EFF2F3; + border-radius: 10px; box-shadow: 0px 11px 24px rgba(138, 138, 138, .1); padding: 2.5rem 2.5rem 1.5rem 2.5rem; flex: 0 0 30%; - border-bottom: solid 4px var(--color-light-teal); display: flex; flex-direction: column; text-decoration: none !important; transition: all 0.2s ease-in-out; + position: relative; } .quickstartCard:hover { - border-bottom-color: var(--color-orange); transform: translateY(-7px); + outline: 2px solid var( --color-green-blue); +} + +.quickstartCard:hover > .start { + text-decoration: underline; } .quickstartCard .icon { - max-width: 25px; - font-size: 25px; + max-width: 46px; + font-size: 46px; margin-bottom: .8rem; color: var(--ifm-menu-color); } @@ -45,21 +49,106 @@ color:var(--ifm-menu-color) } +[data-theme='dark'] .quickstartCard .recently_updated { + color: #fff; +} + .quickstartCard .start { font-size: 1.125rem; margin-top: auto; padding-top: 2rem; + font-weight: 600; } [data-theme='dark'] .quickstartCard .start { color: #fff; } -[data-theme='dark'] .quickstartCard:hover .start { - text-decoration: underline; +.quickstartCard .start i { + margin-left: 4px; + font-size: .9rem; +} + +.quickstartCard .recently_updated { + position: absolute; + top: 1.5rem; + right: 1.5rem; +} + +.quickstartCard .tag_container { + display: flex; + flex-wrap: wrap; + gap: 0.375rem; + margin-top: 1rem; +} + +.quickstartCard .tag_container .tag { + background: #E5E7EB; + border-radius: 1.5rem; + color:#262A38; + padding: 0rem 0.75rem; +} + +[data-theme='dark'] .quickstartCard .tag_container .tag { + background: #374151; + color: #fff; +} + +.infoContainer { + display: flex; + margin-bottom: 4rem; +} + +.infoContainer > * { + border-left: solid #e0e3e8 3px; + padding: 0 1rem 0 1rem; +} + +.infoContainer > *:first-child { + border: none; + padding-left: 0; +} + +.infoContainer .tag_container { + display: flex; + flex-wrap: wrap; + gap: 0.375rem; + align-items: center; +} + +.infoContainer .tag_container .tag { + background: #E5E7EB; + border-radius: 1.5rem; + color:#262A38; + padding: 0rem 0.75rem; +} + +[data-theme='dark'] .infoContainer .tag_container .tag { + background: #374151; + color: #fff; +} + + +.infoContainer .time_to_complete { + font-weight: 700; + +} + +.infoContainer .recently_updated { + color: var(--color-green-blue); +} + +[data-theme='dark'] .infoContainer .recently_updated { + color: #fff; } -.quickstartCard .start:after { - content: " →"; - margin-left: 5px; +@media (max-width: 996px) { + .infoContainer { + gap: 1rem; + flex-direction: column; + } + .infoContainer > * { + border: none; + padding: 0; + } } diff --git a/website/src/components/quickstartGuideList/index.js b/website/src/components/quickstartGuideList/index.js index 954d54e6d47..05c8c041a0e 100644 --- a/website/src/components/quickstartGuideList/index.js +++ b/website/src/components/quickstartGuideList/index.js @@ -1,19 +1,68 @@ import React from 'react'; +import { useState, useEffect, useMemo } from 'react'; import Head from '@docusaurus/Head'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import Layout from '@theme/Layout'; import 
Hero from '@site/src/components/hero'; import QuickstartGuideCard from '../quickstartGuideCard' import styles from './styles.module.css'; +import { SelectDropdown } from '../selectDropdown'; +import SearchInput from '../searchInput'; -const quickstartTitle = 'Quickstarts' +const quickstartTitle = 'Guides' const quickstartDescription = 'dbt Core is a powerful open-source tool for data transformations and dbt Cloud is the fastest and most reliable way to deploy your dbt jobs. With the help of a sample project, learn how to quickly start using dbt and one of the most common data platforms.' + function QuickstartList({ quickstartData }) { - const { siteConfig } = useDocusaurusContext() - + const { siteConfig } = useDocusaurusContext(); + const [filteredData, setFilteredData] = useState(() => quickstartData); + const [selectedTags, setSelectedTags] = useState([]); + const [selectedLevel, setSelectedLevel] = useState([]); + const [searchInput, setSearchInput] = useState(''); + // Build meta title from quickstartTitle and docusaurus config site title - const metaTitle = `${quickstartTitle}${siteConfig?.title ? ` | ${siteConfig.title}` : ''}` + const metaTitle = `${quickstartTitle}${siteConfig?.title ? ` | ${siteConfig.title}` : ''}`; + + // UseMemo to prevent re-rendering on every filter change + // Get tag options + // Populated from the tags frontmatter array + const tagOptions = useMemo(() => { + const tags = new Set(); + quickstartData.forEach(guide => + guide?.data?.tags?.forEach(tag => tags.add(tag)) + ); + // Sort alphabetically + return Array.from(tags).sort((a, b) => a.toLowerCase().localeCompare(b.toLowerCase())).map(tag => ({ value: tag, label: tag })); + }, [quickstartData]); + + // Get level options + // Populated by the level frontmatter string + const levelOptions = useMemo(() => { + const levels = new Set(); + quickstartData.forEach(guide => + guide?.data?.level && levels.add(guide.data.level) + ); + return Array.from(levels).map(level => ({ value: level, label: level })); + }, [quickstartData]); + + // Handle all filters + const handleDataFilter = () => { + const filteredGuides = quickstartData.filter((guide) => { + const tagsMatch = selectedTags.length === 0 || (Array.isArray(guide?.data?.tags) && selectedTags.every((tag) => + guide?.data?.tags.includes(tag.value) + )); + const levelMatch = selectedLevel.length === 0 || (guide?.data?.level && selectedLevel.some((level) => + guide?.data?.level === level.value + )); + const titleMatch = searchInput === '' || guide?.data?.title?.toLowerCase().includes(searchInput.toLowerCase()); + return tagsMatch && levelMatch && titleMatch; + }); + setFilteredData(filteredGuides); + }; + + useEffect(() => { + handleDataFilter(); + }, [selectedTags, selectedLevel, searchInput]); return ( @@ -22,23 +71,32 @@ function QuickstartList({ quickstartData }) { -
      -
      - {quickstartData && quickstartData.length > 0 ? ( +
      + {tagOptions && tagOptions.length > 0 && ( + + )} + {levelOptions && levelOptions.length > 0 && ( + + )} + setSearchInput(value)} placeholder='Search Guides' /> +
      +
      + {filteredData && filteredData.length > 0 ? ( <> - {quickstartData.map((guide, i) => ( - + {filteredData.map((guide) => ( + ))} - ) : -

      No quickstarts are available at this time. 😕

      + ) : +

      No quickstarts are available with the selected filters.

      }
      @@ -46,4 +104,4 @@ function QuickstartList({ quickstartData }) { ) } -export default QuickstartList +export default QuickstartList; diff --git a/website/src/components/quickstartGuideList/styles.module.css b/website/src/components/quickstartGuideList/styles.module.css index 8c4e45edc8c..4e1518efd2b 100644 --- a/website/src/components/quickstartGuideList/styles.module.css +++ b/website/src/components/quickstartGuideList/styles.module.css @@ -18,12 +18,23 @@ .quickstartCardContainer { display: grid; grid-template-columns: 1fr 1fr 1fr; - grid-gap: 2rem; - padding: 5rem 1rem; + grid-gap: 1rem; + padding: 2rem 1rem 5rem; +} + +.quickstartFilterContainer { + display: grid; + grid-template-columns: 1fr 1fr 1fr; + grid-gap: 1rem; + padding-top: 4rem; +} + +.quickstartFilterContainer > div:first-child { + padding: 0; } @media (max-width: 996px) { - .quickstartCardContainer { + .quickstartCardContainer, .quickstartFilterContainer { grid-template-columns: 1fr; } } diff --git a/website/src/components/quickstartTOC/index.js b/website/src/components/quickstartTOC/index.js index 49209273964..3ff5e027208 100644 --- a/website/src/components/quickstartTOC/index.js +++ b/website/src/components/quickstartTOC/index.js @@ -26,16 +26,6 @@ function QuickstartTOC() { const steps = quickstartContainer.querySelectorAll("h2"); const snippetContainer = document.querySelectorAll(".snippet"); - // Add snippet container to its parent step - snippetContainer.forEach((snippet) => { - const parent = snippet?.parentNode; - while (snippet?.firstChild && parent.className) { - if (parent) { - parent.insertBefore(snippet.firstChild, snippet); - } - } - }); - // Create an array of objects with the id and title of each step const data = Array.from(steps).map((step, index) => ({ id: step.id, @@ -49,6 +39,16 @@ function QuickstartTOC() { // Wrap all h2 (steps), along with all of their direct siblings, in a div until the next h2 if (mounted) { + // Add snippet container to its parent step + snippetContainer.forEach((snippet) => { + const parent = snippet?.parentNode; + while (snippet?.firstChild && parent.className) { + if (parent) { + parent.insertBefore(snippet.firstChild, snippet); + } + } + }); + steps.forEach((step, index) => { const wrapper = document.createElement("div"); wrapper.classList.add(style.stepWrapper); @@ -81,13 +81,19 @@ function QuickstartTOC() { buttonContainer.classList.add(style.buttonContainer); const prevButton = document.createElement("a"); const nextButton = document.createElement("a"); + const nextButtonIcon = document.createElement("i"); + const prevButtonIcon = document.createElement("i"); + prevButtonIcon.classList.add("fa-regular", "fa-arrow-left"); prevButton.textContent = "Back"; + prevButton.prepend(prevButtonIcon); prevButton.classList.add(clsx(style.button, style.prevButton)); prevButton.disabled = index === 0; prevButton.addEventListener("click", () => handlePrev(index + 1)); + nextButtonIcon.classList.add("fa-regular", "fa-arrow-right"); nextButton.textContent = "Next"; + nextButton.appendChild(nextButtonIcon); nextButton.classList.add(clsx(style.button, style.nextButton)); nextButton.disabled = index === stepWrappers.length - 1; nextButton.addEventListener("click", () => handleNext(index + 1)); @@ -190,7 +196,24 @@ function QuickstartTOC() { updateStep(activeStep, stepNumber); }; + // Handle TOC menu click + const handleTocMenuClick = () => { + const tocList = document.querySelector(`.${style.tocList}`); + const tocMenuBtn = document.querySelector(`.${style.toc_menu_btn}`); + const 
tocListStyles = window.getComputedStyle(tocList); + + if (tocListStyles.display === "none") { + tocList.style.display = "block"; + tocMenuBtn.querySelector("i").style.transform = "rotate(0deg)"; + } else { + tocList.style.display = "none"; + tocMenuBtn.querySelector("i").style.transform = "rotate(-90deg)"; + } + }; + return ( + <> + Menu
        {tocData.map((step) => (
      • ))}
      + ); } diff --git a/website/src/components/quickstartTOC/styles.module.css b/website/src/components/quickstartTOC/styles.module.css index edfd0380098..892e6f73be6 100644 --- a/website/src/components/quickstartTOC/styles.module.css +++ b/website/src/components/quickstartTOC/styles.module.css @@ -1,5 +1,5 @@ .quickstartTitle { - padding: 1rem 0 2rem; + } .tocList { @@ -8,15 +8,16 @@ margin: 0; width: 370px; flex-shrink: 0; - padding-right: 3rem; + padding-right: 4rem; + margin-right: 4rem; + border-right: solid 4px #EFF2F3; } .tocList li { padding: 1rem; display: block; - border: 1px solid #EFF2F3; - box-shadow: 0px 11px 24px rgba(138, 138, 138, 0.1), 0px 0px 0px rgba(138, 138, 138, 0.1); - border-radius: 10px; + box-shadow: 0px 10px 16px 0px rgba(31, 41, 55, 0.10); + border-radius: 8px; margin-bottom: 1rem; display: grid; grid-template-columns: 1fr 5fr; @@ -32,13 +33,13 @@ height: 30px; text-align: center; line-height: 27px; - color: var(--color-light-teal); - border: solid 1px var(--color-light-teal); + color: var(--color-green-blue); + border: solid 1px var(--color-green-blue); margin-bottom: auto; } .tocList .active span { - background: var(--color-light-teal); + background: var( --color-green-blue); color: var(--color-white); } @@ -52,7 +53,7 @@ html[data-theme="dark"] .tocList li span { } html[data-theme="dark"] .tocList .active span { - border-color: var(--color-light-teal); + border-color: var(--color-green-blue); } .tocItem { @@ -73,28 +74,47 @@ html[data-theme="dark"] .tocList .active span { transition-property: color, background, border-color; transition-duration: var(--ifm-button-transition-duration); transition-timing-function: var(--ifm-transition-timing-default); - border: 2px solid var(--color-light-teal); + border: 2px solid var(--color-green-blue); + color: var(--color-green-blue); border-radius: 5px; width: 125px; text-align: center; } -.buttonContainer a:hover { - background: var(--color-light-teal); - color: var(--color-white) +.stepWrapper .buttonContainer a:hover { + background: var(--color-green-blue); + color: var(--color-white); +} + +html[data-theme="dark"] .stepWrapper .buttonContainer a:hover { + color: var(--color-white) !important; } .buttonContainer .prevButton { margin-right: auto; } +.buttonContainer .prevButton i { + font-size: .8rem; + margin-right: .4rem; +} + .buttonContainer .nextButton { margin-left: auto; } -.stepWrapper[data-step="1"] .nextButton { - background: var(--color-light-teal); - color: var(--color-white) +.buttonContainer .nextButton i { + font-size: .8rem; + margin-left: .4rem; +} + +.stepWrapper[data-step="1"] a.nextButton { + background: var(--color-green-blue); + color: var(--color-white); +} + +html[data-theme="dark"] .stepWrapper[data-step="1"] a.nextButton { + color: var(--color-white) !important; } .stepWrapper.hidden { @@ -105,12 +125,26 @@ html[data-theme="dark"] .tocList .active span { display: none; } +.toc_menu_btn { + display: none; +} + +.toc_menu_btn i { + transform: rotate(-90deg); + vertical-align: middle; +} + @media (max-width: 996px) { .tocList { width: 100%; padding-right: 0; margin-bottom: 2rem; - height: 160px; - overflow-y: auto; + display: none; + } + + .toc_menu_btn { + display: inline-block; + margin-bottom: 2rem; + cursor: pointer; } } diff --git a/website/src/components/searchInput/index.js b/website/src/components/searchInput/index.js new file mode 100644 index 00000000000..e0a5faf4a82 --- /dev/null +++ b/website/src/components/searchInput/index.js @@ -0,0 +1,26 @@ +import React from "react"; +import 
styles from "./styles.module.css"; + +const SearchInput = ({ + value, + onChange, + placeholder = "Search...", + ...props +}) => { + return ( + + ); +}; + +export default SearchInput; diff --git a/website/src/components/searchInput/styles.module.css b/website/src/components/searchInput/styles.module.css new file mode 100644 index 00000000000..ae19a3bb81b --- /dev/null +++ b/website/src/components/searchInput/styles.module.css @@ -0,0 +1,30 @@ +.inputContainer { + padding: 0 1rem; + border-radius: 0.3125rem; + border: 2px solid var(--navy-200-c-6-ccd-4, #C6CCD4); + + +} + +.inputContainer:active, .input:focus { + border: 2px solid #4f5d75; + outline: none; +} + +.input::placeholder { + all: unset; + -webkit-text-security: initial; +} + +.inputContainer .input { + border: none; + min-height: 38px; + font-size: .975rem; + color: var(--ifm-font-color-base); + font-family: var(--ifm-font-family-base); +} + +[data-theme='dark'] .input{ + background: #1b1b1d; + color: #e3e3e3; +} diff --git a/website/src/components/selectDropdown/index.js b/website/src/components/selectDropdown/index.js new file mode 100644 index 00000000000..b6378518c25 --- /dev/null +++ b/website/src/components/selectDropdown/index.js @@ -0,0 +1,21 @@ +import React from "react"; +import Select from "react-select"; +import styles from "./styles.module.css"; + +export const SelectDropdown = ({ options, value, onChange, isMulti, placeHolder }) => { + return ( +
      Google BigQuery ❌ +