diff --git a/.github/ISSUE_TEMPLATE/improve-docs.yml b/.github/ISSUE_TEMPLATE/a-improve-docs.yml similarity index 94% rename from .github/ISSUE_TEMPLATE/improve-docs.yml rename to .github/ISSUE_TEMPLATE/a-improve-docs.yml index 57dc64cc312..c9030bc227b 100644 --- a/.github/ISSUE_TEMPLATE/improve-docs.yml +++ b/.github/ISSUE_TEMPLATE/a-improve-docs.yml @@ -5,7 +5,7 @@ body: - type: markdown attributes: value: | - * You can ask questions or submit ideas for the dbt docs in [Discussions](https://github.com/dbt-labs/docs.getdbt.com/discussions) + * You can ask questions or submit ideas for the dbt docs in [Issues](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). @@ -39,4 +39,4 @@ body: label: Additional information description: Add any other context or screenshots about the feature request here. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9349000f66b..f3a3521bdec 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: true contact_links: - - name: Want to see new content? Open a discussion! - url: https://github.com/dbt-labs/docs.getdbt.com/discussions/new - about: You can open a discussion to propose new content for the dbt product documentation. - name: Have questions about dbt? Join the Community! url: https://www.getdbt.com/community/join-the-community about: You can join the dbt Labs Community to ask and answer questions. diff --git a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml index f138b9e4e06..037da98dc6f 100644 --- a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml +++ b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml @@ -1,4 +1,4 @@ -name: Contribute to the dbt Developer Blog +name: Propose a dbt Developer Blog idea description: > For proposing a new post on the dbt Developer Blog. labels: ["content","developer blog"] diff --git a/.github/ISSUE_TEMPLATE/improve-the-site.yml b/.github/ISSUE_TEMPLATE/improve-the-site.yml index e0556d7374f..01ebdea711a 100644 --- a/.github/ISSUE_TEMPLATE/improve-the-site.yml +++ b/.github/ISSUE_TEMPLATE/improve-the-site.yml @@ -1,11 +1,11 @@ -name: Improve the docs.getdbt.com site -description: Make a suggestion or report a problem about the technical implementation of docs.getdbt.com. -labels: ["engineering"] +name: Report a docs.getdbt.com site issue +description: Report a problem about the technical implementation of docs.getdbt.com. +labels: ["engineering","bug"] body: - type: markdown attributes: value: | - * You can ask questions or submit ideas for the dbt docs in [Discussions](https://github.com/dbt-labs/docs.getdbt.com/discussions) + * You can ask questions or submit ideas for the dbt docs in [Issues](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). 
@@ -39,4 +39,4 @@ body: label: Additional information description: Any additional information, configuration, or data that might be necessary to reproduce the issue. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml b/.github/ISSUE_TEMPLATE/new-dbt-feature.yml deleted file mode 100644 index fa46a189fc4..00000000000 --- a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Start docs project for a new feature -description: For dbt PMs to add docs for their new or updated dbt product features. -labels: ["content","upcoming release"] -body: - - type: markdown - attributes: - value: | - * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). - * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). - - - type: checkboxes - id: contributions - attributes: - label: Contributions - description: This applies to new, unreleased content. - options: - - label: I am a PM or subject matter expert at dbt who is responsible for this feature. - - - type: textarea - attributes: - label: Where does this content belong? - description: | - - Give as much detail as you can to help us understand where you expect the content to live. - validations: - required: true - - - type: textarea - attributes: - label: Link to source material - description: | - Use the [source material template](https://docs.google.com/document/d/1lLWGMXJFjkY4p7r8ZKhBX73dOLmIjgXZBYq39LqmAJs/edit) to provide source material for this feature. - validations: - required: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml new file mode 100644 index 00000000000..e19accf6ebb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml @@ -0,0 +1,62 @@ +name: Add adapter to Trusted list +description: For adapter maintainers who wish to have theirs added to the list of Trusted adapters. +title: "Trust dbt-myadapter" +labels: ["adapter maintainers"] +assignees: + - dataders +body: + - type: markdown + attributes: + value: | + We're excited that you'd like to support your adapter formally as "Trusted"! This template will ensure that you are aware of the process and the guidelines. Additionally, that you can vouch that your adapter currently meets the standards of a Trusted adapter. For more information, see [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters) + + - type: input + id: adapter-repo + attributes: + label: Link to adapter repo + description: Please link to the GitHub repo + validations: + required: true + + - type: input + id: contact + attributes: + label: Contact Details + description: How can we get in touch with you? + placeholder: your preferred email and/or dbt Slack handle + validations: + required: true + + - type: dropdown + id: author_type + attributes: + label: Which of these best describes you? 
+ options: + - I am a dbt Community member + - I work for the vendor on top of which the dbt adapter functions + validations: + required: true + + - type: checkboxes + id: read-program-guide + attributes: + label: Please agree to the each of the following + options: + - label: I am a maintainer of the adapter being submited for Trusted status + required: true + - label: I have read both the [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters) and [Building a Trusted Adapter](https://docs.getdbt.com/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter) pages. + required: true + - label: I believe that the adapter currently meets the expectations given above + required: true + - label: I will ensure this adapter stays in compliance with the guidelines + required: true + - label: I understand that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met + required: true + + - type: textarea + id: icon + attributes: + label: What icon should be used? + description: | + Please share an svg image that you'd like to be displayed in for your adapter. Normally, this is the logo for the data platform on top of which your adapter works. If there's a dark mode version, please also share that. + Pasting the image from your clipboard will upload the file to GitHub and create markdown formatting for it to be rendered inline diff --git a/.github/labeler.yml b/.github/labeler.yml index 176f1874009..316098eb51c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,6 +3,7 @@ developer blog: guides: - website/docs/guides/**/* +- website/docs/quickstarts/**/* content: - website/docs/**/* diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 90f4938d2cb..309872dd818 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,9 +10,10 @@ To learn more about the writing conventions used in the dbt Labs docs, see the [ -- [ ] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) and [About versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) so my content adheres to these guidelines. +- [ ] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. +- [ ] For [docs versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#about-versioning), review how to [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content). - [ ] Add a checklist item for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch." 
Adding new pages (delete if not applicable): @@ -22,4 +23,4 @@ Adding new pages (delete if not applicable): Removing or renaming existing pages (delete if not applicable): - [ ] Remove page from `website/sidebars.js` - [ ] Add an entry `website/static/_redirects` -- [ ] [Ran link testing](https://github.com/dbt-labs/docs.getdbt.com#running-the-cypress-tests-locally) to update the links that point to the deleted page +- [ ] Run link testing locally with `npm run build` to update the links that point to the deleted page diff --git a/.github/workflows/asana-connection.yml b/.github/workflows/asana-connection.yml new file mode 100644 index 00000000000..aced477bdac --- /dev/null +++ b/.github/workflows/asana-connection.yml @@ -0,0 +1,17 @@ +name: Show PR Status in Asana +on: + pull_request: + types: [opened, reopened] + +jobs: + create-asana-attachment-job: + runs-on: ubuntu-latest + name: Create pull request attachments on Asana tasks + steps: + - name: Create pull request attachments + uses: Asana/create-app-attachment-github-action@latest + id: postAttachment + with: + asana-secret: ${{ secrets.ASANA_SECRET }} + - name: Log output status + run: echo "Status is ${{ steps.postAttachment.outputs.status }}" diff --git a/.github/workflows/autogenerated_labeler.yml b/.github/workflows/autogenerated_labeler.yml new file mode 100644 index 00000000000..e6aab0492b8 --- /dev/null +++ b/.github/workflows/autogenerated_labeler.yml @@ -0,0 +1,40 @@ +# **what?** +# Labels issues autogenerated in dbt-core + +# **why?** +# To organize autogenerated issues from dbt-core to make it easier to find and track them. + +# **when?** +# When an issue is opened by the FishtownBuildBot + +name: Add Labels to Autogenerated Issues + +on: + issues: + types: [opened] + +jobs: + add_customized_labels: + if: github.event.issue.user.login == 'FishtownBuildBot' + permissions: + issues: write + + runs-on: ubuntu-latest + steps: + - name: "Determine appropriate labels by repo in title" + id: repo + env: + ISSUE_TITLE: ${{ github.event.issue.title }} + run: | + if [[ "$ISSUE_TITLE" == *"dbt-core"* ]]; then + echo "labels='content,improvement,dbt Core'" >> $GITHUB_OUTPUT + else + echo "labels='content,improvement,adapters'" >> $GITHUB_OUTPUT + fi + + - name: "Add Labels to autogenerated Issues" + id: add-labels + run: | + gh issue edit ${{ github.event.issue.number }} --repo ${{ github.repository }} --add-label ${{ steps.repo.outputs.labels }} + env: + GH_TOKEN: ${{ secrets.DOCS_SECRET }} diff --git a/.github/workflows/crawler.yml b/.github/workflows/crawler.yml new file mode 100644 index 00000000000..6bfce5321c5 --- /dev/null +++ b/.github/workflows/crawler.yml @@ -0,0 +1,33 @@ +name: Algolia Crawler +on: + pull_request: + types: + - closed + +jobs: + algolia_recrawl: + # Comment out the if check below if running on every merge to current branch + if: | + contains(github.event.pull_request.labels.*.name, 'trigger-crawl') + && github.event.pull_request.merged == true + name: Trigger Algolia Crawl + runs-on: ubuntu-latest + steps: + # Checkout repo + - name: Checkout Repo + uses: actions/checkout@v3 + + # Wait 8 minutes to allow Vercel build to complete + - run: sleep 480 + + # Once deploy URL is found, trigger Algolia crawl + - name: Run Algolia Crawler + uses: algolia/algoliasearch-crawler-github-actions@v1 + id: crawler_push + with: + crawler-user-id: ${{ secrets.CRAWLER_USER_ID }} + crawler-api-key: ${{ secrets.CRAWLER_API_KEY }} + algolia-app-id: ${{ secrets.ALGOLIA_APP_ID }} + algolia-api-key: ${{ secrets.ALGOLIA_API_KEY }} + 
site-url: 'https://docs.getdbt.com' + crawler-name: ${{ secrets.CRAWLER_NAME }} diff --git a/.github/workflows/label.yml b/.github/workflows/label.yml index 5ebef4f88ca..48615e60b9e 100644 --- a/.github/workflows/label.yml +++ b/.github/workflows/label.yml @@ -2,37 +2,45 @@ name: Add/Remove Labels on: pull_request_target: - types: [ opened, closed ] + types: [opened] jobs: add_new_contributor_label: if: github.event.action == 'opened' - permissions: - contents: read - pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/github-script@v6 - with: - script: | - const creator = context.payload.sender.login + - name: Add new contributor label + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.DOCS_SECRET }} + script: | + const creator = context.payload.sender.login; const opts = github.rest.issues.listForRepo.endpoint.merge({ ...context.issue, creator, - state: 'all' - }) - const issues = await github.paginate(opts) + state: 'all', + }); + + const issues = await github.paginate(opts); + + let isAlreadyContributor = false; + for (const issue of issues) { if (issue.number === context.issue.number) { - continue + continue; } - if (issue.pull_request) { - return // creator is already a contributor + if (issue.pull_request && issue.user.login === creator) { + isAlreadyContributor = true; + break; } } - await github.rest.issues.addLabels({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - labels: ['new contributor'] - }) + + if (!isAlreadyContributor) { + console.log('Adding label: new contributor'); + await github.rest.issues.addLabels({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['new contributor'], + }); + } diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 7e4bb5c268a..cc231cdcde3 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -5,8 +5,8 @@ name: "Pull Request Labeler" on: -- pull_request_target - + pull_request_target: + types: [opened] jobs: triage: permissions: diff --git a/.gitignore b/.gitignore index b2746893814..74d338484aa 100755 --- a/.gitignore +++ b/.gitignore @@ -11,10 +11,14 @@ website/yarn.lock website/node_modules website/i18n/* -# Local vs code +# IDE configs .vscode +.idea + # Local Netlify folder .netlify -.vscode .eslintcache + +# Local Vercel folder +.vercel diff --git a/README.md b/README.md index 4dfd8a8be9e..c749fedf95a 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Creating an inclusive and equitable environment for our documents is more import We welcome contributions from community members to this repo: - **Fixes**: When you notice an error, you can use the `Edit this page` button at the bottom of each page to suggest a change. - **New documentation**: If you contributed code in [dbt-core](https://github.com/dbt-labs/dbt-core), we encourage you to also write the docs here! Please reach out in the dbt community if you need help finding a place for these docs. -- **Major rewrites**: You can [file an issue](https://github.com/dbt-labs/docs.getdbt.com/issues/new?assignees=&labels=content%2Cimprovement&template=improve-docs.yml) or [start a discussion](https://github.com/dbt-labs/docs.getdbt.com/discussions) to propose ideas for a content area that requires attention. +- **Major rewrites**: You can [file an issue](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to propose ideas for a content area that requires attention. 
You can use components documented in the [docusaurus library](https://v2.docusaurus.io/docs/markdown-features/). @@ -42,7 +42,7 @@ You can add code snippets and other content in a tabbed view. To learn more abou # Running the Docs site locally -You can click a link available in a netlify bot PR comment to see and review your changes rendered on a staging server. You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/): +You can click a link available in a Vercel bot PR comment to see and review your changes rendered on a staging server. You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/): ## Prerequisites diff --git a/contributing/adding-page-components.md b/contributing/adding-page-components.md index 751f7c1f6c1..a07d0ff02e4 100644 --- a/contributing/adding-page-components.md +++ b/contributing/adding-page-components.md @@ -1,6 +1,6 @@ ## Using warehouse components -You can use the following components to provide code snippets for each supported warehouse. You can see a real-life example in the docs page [Initialize your project](/quickstarts/databricks?step=6). +You can use the following components to provide code snippets for each supported warehouse. You can see a real-life example in the docs page [Initialize your project](/guides/databricks?step=6). Identify code by labeling with the warehouse names: diff --git a/contributing/content-style-guide.md b/contributing/content-style-guide.md index eaa090a00b6..0d2bf243d45 100644 --- a/contributing/content-style-guide.md +++ b/contributing/content-style-guide.md @@ -229,7 +229,7 @@ When referring to different sections of the IDE, use the name of the section and People make use of titles in many places like table headers, section headings (such as an H2, H3, or H4), page titles, sidebars, and so much more. -When generating titles or updating them, use sentence case. It sets a more conversational tone to the docs—making the content more approachable and creating a friendly feel. +When generating titles or updating them, use sentence case. It sets a more conversational tone to the docs— making the content more approachable and creating a friendly feel. We've defined five content types you can use when contributing to the docs (as in, writing or authoring). Learn more about title guidelines for [each content type](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-types.md). @@ -239,7 +239,7 @@ Placeholder text is something that the user should replace with their own text. Use all capital letters([screaming snake case](https://fission.codes/blog/screaming-snake-case/)) to indicate text that changes in the user interface or that the user needs to supply in a command or code snippet. Avoid surrounding it in brackets or braces, which someone might copy and use, producing an error. -Identify what the user should replace the placeholder text with in the paragraph preceding the code snippet or command. +Identify what the user should replace the placeholder text within the paragraph preceding the code snippet or command. :white_check_mark: The following is an example of configuring a connection to a Redshift database. In your YAML file, you must replace `CLUSTER_ID` with the ID assigned to you during setup: @@ -276,7 +276,7 @@ Guidelines for making lists are: - There are at least two items. 
- All list items follow a consistent, grammatical structure (like each item starts with a verb, each item begins with a capitalized word, each item is a sentence fragment). - Lists items don't end in commas, semicolons, or conjunctions (like "and", "or"). However, you can use periods if they’re complete sentences. -- Introduce the list with a heading or, if it's within text, as a complete sentence or as a sentence fragment followed by a colon. +- Introduce the list with a heading or, if it's within the text, as a complete sentence or as a sentence fragment followed by a colon. If the list starts getting lengthy and dense, consider presenting the same content in a different format such as a table, as separate subsections, or a new guide. @@ -286,7 +286,7 @@ A bulleted list with introductory text: > A dbt project is a directory of `.sql` and .yml` files. The directory must contain at a minimum: > -> - Models: A model is a single `.sql` file. Each model contains a single `select` statement that either transforms raw data into a dataset that is ready for analytics, or, more often, is an intermediate step in such a transformation. +> - Models: A model is a single `.sql` file. Each model contains a single `select` statement that either transforms raw data into a dataset that is ready for analytics or, more often, is an intermediate step in such a transformation. > - A project file: A `dbt_project.yml` file, which configures and defines your dbt project. A bulleted list with sentence fragments: @@ -307,10 +307,10 @@ A numbered list following an H2 heading: ## Tables Tables provide a great way to present complex information and can help the content be more scannable for users, too. -There are many ways to construct a table, like row spanning and cell splitting. Make sure the content is clear, concise, and presents well on the web page (like avoid awkward word wrapping). +There are many ways to construct a table, such as row spanning and cell splitting. The content should be clear, concise, and presented well on the web page (for example, avoid awkward word wrapping). Guidelines for making tables are: -- Introduce the table with a heading or, if it's within text, as a complete sentence or as a sentence fragment followed by a colon. +- Introduce the table with a heading or, if it's within the text, as a complete sentence or as a sentence fragment followed by a colon. - Use a header row - Use sentence case for all content, including the header row - Content can be complete sentences, sentence fragments, or single words (like `Currency`) @@ -338,7 +338,7 @@ A table following an H3 heading: > | Name | Description | Values | > | -----| ----------- | ------ | > | `-help` | Displays information on how to use the command. | Doesn't take any values. | -> | `-readable` | Print output in human readable format. |
+Both dbt Core and the dbt Cloud CLI are invoked with the same `dbt` command. This can create path conflicts if your operating system selects one over the other based on your `$PATH` environment variable (settings). If you already have dbt Core installed (for example, with `pip3 install dbt`), you have a few options:
+1. Install the dbt Cloud CLI with the [pip](/docs/cloud/cloud-cli-installation?install=pip#install-dbt-cloud-cli) command.
+2. Install natively, ensuring you either deactivate the virtual environment containing dbt Core or create an alias for the dbt Cloud CLI.
+3. (Advanced users) Install natively, but modify the $PATH environment variable to correctly point to the dbt Cloud CLI binary to use both dbt Cloud CLI and dbt Core together.
+
+You can always uninstall the dbt Cloud CLI to return to using dbt Core.
+
+To create an alias for the dbt Cloud CLI:
+
+1. Add an alias to your shell profile file, such as `~/.bashrc`, `~/.bash_profile`, `~/.zshrc`, or another file: `alias dbt-cloud="path_to_dbt_cloud_cli_binary"`
+
+   Replace `path_to_dbt_cloud_cli_binary` with the actual path to the dbt Cloud CLI binary, which is `/opt/homebrew/bin/dbt`. With this alias, you can use the command `dbt-cloud` to invoke the dbt Cloud CLI.
+2. Run `source` on the profile file to apply the changes. As an example, in bash you would run: `source ~/.bashrc`
+3. To use the dbt Cloud CLI, run the `dbt-cloud` command: `dbt-cloud command_name`. Replace 'command_name' with the specific dbt command you want to execute. To use dbt Core, run the `dbt` command: `dbt command_name`. Replace 'command_name' with the specific dbt command you want to execute.
+
+This allows you to use the `dbt-cloud` command to invoke the dbt Cloud CLI while having dbt Core installed natively.
+**What should I do if I receive a `Session occupied` error?**
+
+If you receive a `Session occupied` error, you can reattach to your existing session with `dbt reattach` and then press `Control-C` and choose to cancel the invocation.
+**The `--help` flag**
+
+As a tip, most command-line tools have a `--help` flag to show available commands and arguments. Use the `--help` flag with dbt in two ways:
+- `dbt --help`: Lists the commands available for dbt
+- `dbt COMMAND_NAME --help`: Lists the flags available for a specific command
+
+The `sshd` process is responsible for routing traffic to the database. When dbt initiates a job run, an SSH tunnel is created at the start of the run. If this SSH tunnel fails at any point, the job will also fail. Related troubleshooting entries cover `Could not deserialize key data` and `JWT token` errors.
+To limit the rows a query returns, add `limit your_number` at the end of your SQL statement. For example, `SELECT * FROM table limit 100` will return up to 100 rows. Remember that you must write the `limit your_number` explicitly and cannot derive it from a macro. Some data platforms use `SELECT TOP #` instead, which specifies the number of records to return.
file?For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}
-`dbt-databricks` is the recommend adapter for Databricks - -`dbt-databricks` includes features not available in `dbt-spark`: +`dbt-databricks` is the recommended adapter for Databricks. It includes features not available in `dbt-spark`, such as: - Unity Catalog support - No need to install additional drivers or dependencies for use on the CLI - Use of Delta Lake for all models out of the box - SQL macros that are optimized to run with [Photon](https://docs.databricks.com/runtime/photon.html) -### Set up a Databricks Target +## Connecting to Databricks + +To connect to a data platform with dbt Core, create the appropriate _profile_ and _target_ YAML keys/values in the `profiles.yml` configuration file for your Databricks SQL Warehouse/cluster. This dbt YAML file lives in the `.dbt/` directory of your user/home directory. For more info, refer to [Connection profiles](/docs/core/connect-data-platform/connection-profiles) and [profiles.yml](/docs/core/connect-data-platform/profiles.yml). + +`dbt-databricks` can connect to Databricks SQL Warehouses and all-purpose clusters. Databricks SQL Warehouses is the recommended way to get started with Databricks. + +Refer to the [Databricks docs](https://docs.databricks.com/dev-tools/dbt.html#) for more info on how to obtain the credentials for configuring your profile. + +### Examples {#examples} + +You can use either token-based authentication or OAuth client-based authentication to connect to Databricks. Refer to the following examples for more info on how to configure your profile for each type of authentication. + +For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}.
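+As an illustrative sketch only (not part of this changeset), a token-based `profiles.yml` target for `dbt-databricks` generally takes the shape below; the profile name, host, `http_path`, token, and catalog values are placeholder assumptions you would replace with your own workspace details.
+
+```yaml
+my_databricks_project:
+  target: dev
+  outputs:
+    dev:
+      type: databricks
+      catalog: my_catalog        # optional; used with Unity Catalog
+      schema: my_schema          # schema dbt builds models into
+      host: dbc-a1b2345c-d6e7.cloud.databricks.com
+      http_path: /sql/1.0/warehouses/1a2b3c4d5e6f7g8h
+      token: dapiXXXXXXXXXXXXXXXXXXXX   # personal access token
+      threads: 4
+```
+
+Keeping connection details in `profiles.yml`, outside the project repository, is what lets the same project run against different targets without code changes.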
+## Configurations -## Authentication Methods +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `type` | redshift | The type of data warehouse you are connecting to| +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `port` | 5439 | | +| `dbname` | my_db | Database name| +| `schema` | my_schema | Schema name| +| `connect_timeout` | `None` or 30 | Number of seconds before connection times out| +| `sslmode` | prefer | optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. For more information on `sslmode`, see Redshift note below| +| `role` | None | Optional, user identifier of the current session| +| `autocreate` | false | Optional, default false. Creates user if they do not exist | +| `db_groups` | ['ANALYSTS'] | Optional. A list of existing database group names that the DbUser joins for the current session | +| `ra3_node` | true | Optional, default False. Enables cross-database sources| +| `autocommit` | true | Optional, default True. Enables autocommit after each statement| +| `retries` | 1 | Number of retries | -### Password-based authentication + +## Authentication Parameters + +The authentication methods that dbt Core supports are: + +- `database` — Password-based authentication (default, will be used if `method` is not provided) +- `IAM` — IAM + +For dbt Cloud users, log in using the default **Database username** and **password**. This is necessary because dbt Cloud does not support `IAM` authentication. + +Click on one of these authentication methods for further details on how to configure your connection profile. Each tab also includes an example `profiles.yml` configuration file for you to review. + +For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration
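+To make the configuration table above concrete, here is a hedged sketch (not part of this diff) of a password-based Redshift target in `profiles.yml`; every value shown is a placeholder, and optional fields from the table, such as `sslmode` or `ra3_node`, can be added as extra keys.
+
+```yaml
+my_redshift_project:
+  target: dev
+  outputs:
+    dev:
+      type: redshift
+      host: hostname.region.redshift.amazonaws.com
+      port: 5439
+      dbname: my_db
+      schema: my_schema
+      user: my_user                # database username
+      password: MY_PASSWORD        # replace with your own credential
+      connect_timeout: 30
+      threads: 4
+```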
@@ -80,7 +76,6 @@ dbt-spark can connect to Spark clusters by three different methods: - [`thrift`](#thrift) connects directly to the lead node of a cluster, either locally hosted / on premise or in the cloud (e.g. Amazon EMR). - [`http`](#http) is a more generic method for connecting to a managed service that provides an HTTP endpoint. Currently, this includes connections to a Databricks interactive cluster. -{frontMatter.meta.pypi_package}
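+As a rough, hedged example of the `http` method described above (not taken from this changeset), a `profiles.yml` target for `dbt-spark` pointed at a Databricks interactive cluster could look like the sketch below; the host, token, and cluster ID are placeholders.
+
+```yaml
+my_spark_project:
+  target: dev
+  outputs:
+    dev:
+      type: spark
+      method: http                    # generic HTTP endpoint method
+      schema: my_schema
+      host: dbc-a1b2345c-d6e7.cloud.databricks.com
+      port: 443
+      token: dapiXXXXXXXXXXXXXXXXXXXX
+      cluster: 1234-567890-abc12345   # interactive cluster ID
+      connect_retries: 5
+      connect_timeout: 60
+      threads: 4
+```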
pip install {frontMatter.meta.pypi_package}
+
+Installing {frontMatter.meta.pypi_package}
will also install dbt-core
and any other dependencies.
For {frontMatter.meta.platform_name}-specific configuration, please refer to {frontMatter.meta.platform_name} Configuration
+ +For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}
+ + +## Authentication Methods + +### User / Password Authentication + +Starrocks can be configured using basic user/password authentication as shown below. + +Installing {frontMatter.meta.pypi_package}
will also install dbt-core
and any other dependencies.
For {frontMatter.meta.platform_name}-specific configuration, please refer to {frontMatter.meta.platform_name} Configuration
@@ -88,11 +111,15 @@ The plugin also supports the following optional connection parameters: Parameter | Default | Type | Description ----------------------- | ----------- | -------------- | --- `account` | | string | Specifies the database account. Equivalent to the Teradata JDBC Driver `ACCOUNT` connection parameter. +`browser` | | string | Specifies the command to open the browser for Browser Authentication, when logmech is BROWSER. Browser Authentication is supported for Windows and macOS. Equivalent to the Teradata JDBC Driver BROWSER connection parameter. +`browser_tab_timeout` | `"5"` | quoted integer | Specifies the number of seconds to wait before closing the browser tab after Browser Authentication is completed. The default is 5 seconds. The behavior is under the browser's control, and not all browsers support automatic closing of browser tabs. +`browser_timeout` | `"180"` | quoted integer | Specifies the number of seconds that the driver will wait for Browser Authentication to complete. The default is 180 seconds (3 minutes). `column_name` | `"false"` | quoted boolean | Controls the behavior of cursor `.description` sequence `name` items. Equivalent to the Teradata JDBC Driver `COLUMN_NAME` connection parameter. False specifies that a cursor `.description` sequence `name` item provides the AS-clause name if available, or the column name if available, or the column title. True specifies that a cursor `.description` sequence `name` item provides the column name if available, but has no effect when StatementInfo parcel support is unavailable. `connect_failure_ttl` | `"0"` | quoted integer | Specifies the time-to-live in seconds to remember the most recent connection failure for each IP address/port combination. The driver subsequently skips connection attempts to that IP address/port for the duration of the time-to-live. The default value of zero disables this feature. The recommended value is half the database restart time. Equivalent to the Teradata JDBC Driver `CONNECT_FAILURE_TTL` connection parameter. +`connect_timeout` | `"10000"` | quoted integer | Specifies the timeout in milliseconds for establishing a TCP socket connection. Specify 0 for no timeout. The default is 10 seconds (10000 milliseconds). `cop` | `"true"` | quoted boolean | Specifies whether COP Discovery is performed. Equivalent to the Teradata JDBC Driver `COP` connection parameter. `coplast` | `"false"` | quoted boolean | Specifies how COP Discovery determines the last COP hostname. Equivalent to the Teradata JDBC Driver `COPLAST` connection parameter. When `coplast` is `false` or omitted, or COP Discovery is turned off, then no DNS lookup occurs for the coplast hostname. When `coplast` is `true`, and COP Discovery is turned on, then a DNS lookup occurs for a coplast hostname. -`dbs_port` | `"1025"` | quoted integer | Specifies the database port number. Equivalent to the Teradata JDBC Driver `DBS_PORT` connection parameter. +`port` | `"1025"` | quoted integer | Specifies the database port number. Equivalent to the Teradata JDBC Driver `DBS_PORT` connection parameter. `encryptdata` | `"false"` | quoted boolean | Controls encryption of data exchanged between the driver and the database. Equivalent to the Teradata JDBC Driver `ENCRYPTDATA` connection parameter. `fake_result_sets` | `"false"` | quoted boolean | Controls whether a fake result set containing statement metadata precedes each real result set. `field_quote` | `"\""` | string | Specifies a single character string used to quote fields in a CSV file. 
@@ -102,11 +129,18 @@ Parameter | Default | Type | Description `lob_support` | `"true"` | quoted boolean | Controls LOB support. Equivalent to the Teradata JDBC Driver `LOB_SUPPORT` connection parameter. `log` | `"0"` | quoted integer | Controls debug logging. Somewhat equivalent to the Teradata JDBC Driver `LOG` connection parameter. This parameter's behavior is subject to change in the future. This parameter's value is currently defined as an integer in which the 1-bit governs function and method tracing, the 2-bit governs debug logging, the 4-bit governs transmit and receive message hex dumps, and the 8-bit governs timing. Compose the value by adding together 1, 2, 4, and/or 8. `logdata` | | string | Specifies extra data for the chosen logon authentication method. Equivalent to the Teradata JDBC Driver `LOGDATA` connection parameter. +`logon_timeout` | `"0"` | quoted integer | Specifies the logon timeout in seconds. Zero means no timeout. `logmech` | `"TD2"` | string | Specifies the logon authentication method. Equivalent to the Teradata JDBC Driver `LOGMECH` connection parameter. Possible values are `TD2` (the default), `JWT`, `LDAP`, `KRB5` for Kerberos, or `TDNEGO`. `max_message_body` | `"2097000"` | quoted integer | Specifies the maximum Response Message size in bytes. Equivalent to the Teradata JDBC Driver `MAX_MESSAGE_BODY` connection parameter. `partition` | `"DBC/SQL"` | string | Specifies the database partition. Equivalent to the Teradata JDBC Driver `PARTITION` connection parameter. +`request_timeout` | `"0"` | quoted integer | Specifies the timeout for executing each SQL request. Zero means no timeout. +`retries` | `0` | integer | Allows an adapter to automatically try again when the attempt to open a new connection on the database has a transient, infrequent error. This option can be set using the retries configuration. Default value is 0. The default wait period between connection attempts is one second. retry_timeout (seconds) option allows us to adjust this waiting period. +`runstartup` | "false" | quoted boolean | Controls whether the user's STARTUP SQL request is executed after logon. For more information, refer to User STARTUP SQL Request. Equivalent to the Teradata JDBC Driver RUNSTARTUP connection parameter. If retries is set to 3, the adapter will try to establish a new connection three times if an error occurs. +`sessions` | | quoted integer | Specifies the number of data transfer connections for FastLoad or FastExport. The default (recommended) lets the database choose the appropriate number of connections. Equivalent to the Teradata JDBC Driver SESSIONS connection parameter. `sip_support` | `"true"` | quoted boolean | Controls whether StatementInfo parcel is used. Equivalent to the Teradata JDBC Driver `SIP_SUPPORT` connection parameter. +`sp_spl` | `"true"` | quoted boolean | Controls whether stored procedure source code is saved in the database when a SQL stored procedure is created. Equivalent to the Teradata JDBC Driver SP_SPL connection parameter. `sslca` | | string | Specifies the file name of a PEM file that contains Certificate Authority (CA) certificates for use with `sslmode` values `VERIFY-CA` or `VERIFY-FULL`. Equivalent to the Teradata JDBC Driver `SSLCA` connection parameter. +`sslcrc` | `"ALLOW"` | string | Equivalent to the Teradata JDBC Driver SSLCRC connection parameter. Values are case-insensitive.{frontMatter.meta.pypi_package}
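+To tie the parameter tables above together, a minimal `profiles.yml` target for `dbt-teradata` might be sketched as follows; this example is not part of the diff, the host, credentials, and schema are placeholder assumptions, and any of the optional parameters listed above can be added as additional keys.
+
+```yaml
+my_teradata_project:
+  target: dev
+  outputs:
+    dev:
+      type: teradata
+      host: mydb.example.com
+      user: my_user
+      password: MY_PASSWORD
+      schema: my_database          # database dbt builds into
+      tmode: ANSI                  # transaction mode
+      logmech: TD2                 # optional; logon authentication method
+      port: "1025"                 # optional; quoted integer per the table above
+      threads: 4
+```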
pip install {frontMatter.meta.pypi_package}
+
+Installing {frontMatter.meta.pypi_package}
will also install dbt-core
and any other dependencies.
For {frontMatter.meta.platform_name}-specific configuration, please refer to {frontMatter.meta.platform_name} Configuration
+ +For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}
+ +## Authentication Methods + +### User / Token authentication + +Upsolver can be configured using basic user/token authentication as shown below. + +Field | +Type | +Required? | +Description | +
{name} |
+ {getTypeString(type)} |
+ {type.kind === 'NON_NULL' ? `Yes` : `No`} | +{description || `No description provided`} | +
Field | -Type | -Required? | -Description | -|
{name} |
- {type.ofType ?
- {type.ofType.name} | :
- {type.name} |
+ {type.kind === 'NON_NULL' ? `Yes` : `No`} | -{description || `No description provided`} | -
{name}
[{type.ofType.ofType ? type.ofType.ofType.name : type.ofType.name}]
{type.ofType.name}
{type.name}
{getTypeString(type)}
target/semantic_manifest.json
file generate_schema_name
macro and it isn't using dbt_cloud_pr_
as the prefix.dbt_cloud_pr_
(like dbt_cloud_pr_123_456_marketing
). dbt_cloud_pr_
(like marketing
).
+ Error message:
+ Cloning into '/tmp/jobs/123456/target'...
+ Successfully cloned repository.
+ Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...
+ Failed to checkout to specified revision.
+ git checkout e845be54e6dc72342d5a8f814c8b3316ee220312
+ fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312
+
- Error message:
- Cloning into '/tmp/jobs/123456/target'...
- Successfully cloned repository.
- Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...
- Failed to checkout to specified revision.
- git checkout e845be54e6dc72342d5a8f814c8b3316ee220312
- fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312
-
generate_schema_name
macrodbt_cloud_pr_
(like dbt_cloud_pr_123_456_marketing
) dbt_cloud_pr_
(like marketing
). generate_database_name
macroanalytics
, review the guidance below to resolve this:
- analytics
) dev
). type
Are you ready to define your own metrics and bring consistency to data consumers? Review the following documents to understand how to structure, define, and query metrics, and set up the dbt Semantic Layer:
-- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) to understand best practices for designing and structuring metrics in your dbt project
- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, properties, filters, and how to define and query metrics
-- [Understanding the components of the dbt Semantic Layer](https://docs.getdbt.com/blog/understanding-the-components-of-the-dbt-semantic-layer) blog post to see further examples
-- [dbt Server repo](https://github.com/dbt-labs/dbt-server), which is a persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations.
+- [dbt Server repo](https://github.com/dbt-labs/dbt-server), which is a persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations.
+
+
diff --git a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md
deleted file mode 100644
index 9d0c1eee752..00000000000
--- a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md
+++ /dev/null
@@ -1,40 +0,0 @@
----
-title: "Set up the dbt Semantic Layer"
-id: setup-dbt-semantic-layer
-description: "You can set up the dbt Semantic Layer in dbt Cloud."
-sidebar_label: "Set up the dbt Semantic Layer"
----
-
-:::info Coming soon
-The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics.
-
-**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt.
-
-**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics.
-:::
-
-With the dbt Semantic Layer, you'll be able to centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool.
-
-## Prerequisites
-
-Before you set up the dbt Semantic Layer, make sure you meet the following:
-
-
-
-
-
-
-
-## Set up dbt Semantic Layer
-
-
-
-
-
-
-## Related docs
-
-- [Integrated partner tools](https://www.getdbt.com/product/semantic-layer-integrations) for info on the different integration partners and their documentation
-- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) page for more information on plan availability
-- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, properties, filters, and how to define and query metrics
-- [dbt Server repo](https://github.com/dbt-labs/dbt-server), which is a persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations
diff --git a/website/docs/docs/use-dbt-semantic-layer/setup-sl.md b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md
new file mode 100644
index 00000000000..33f1f43f614
--- /dev/null
+++ b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md
@@ -0,0 +1,99 @@
+---
+title: "Set up the dbt Semantic Layer"
+id: setup-sl
+description: "Seamlessly set up the dbt Semantic Layer in dbt Cloud using intuitive navigation."
+sidebar_label: "Set up your Semantic Layer"
+tags: [Semantic Layer]
+---
+
+
+
+
+With the dbt Semantic Layer, you can centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool.
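+
+To make "centrally define business metrics" concrete, the hedged sketch below (not part of this page's source) shows the general shape of a MetricFlow semantic model and a simple metric defined in project YAML; the `orders` model and the entity, dimension, and measure names are illustrative placeholders.
+
+```yaml
+semantic_models:
+  - name: orders
+    model: ref('orders')            # points at an existing dbt model
+    defaults:
+      agg_time_dimension: order_date
+    entities:
+      - name: order_id
+        type: primary
+    dimensions:
+      - name: order_date
+        type: time
+        type_params:
+          time_granularity: day
+    measures:
+      - name: order_total
+        agg: sum
+
+metrics:
+  - name: order_total
+    label: Order total
+    type: simple
+    type_params:
+      measure: order_total
+```
+
+Once definitions like these exist in your project, the setup steps below expose them to integrated tools through the Semantic Layer.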
+
+## Prerequisites
+
+
+import SetUp from '/snippets/_v2-sl-prerequisites.md';
+
+
+
+## Set up dbt Semantic Layer
+
+import SlSetUp from '/snippets/_new-sl-setup.md';
+
+
+
+
+
+
+
+
+
+import DeprecationNotice from '/snippets/_sl-deprecation-notice.md';
+
+
+
+With the dbt Semantic Layer, you can define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool.
+
+## Prerequisites
+
+
+
+
+## Set up dbt Semantic Layer
+
+:::tip
+If you're using the legacy Semantic Layer, dbt Labs strongly recommends that you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the latest dbt Semantic Layer. Refer to the dedicated [migration guide](/guides/sl-migration) for more info.
+
+:::
+
+ * Team and Enterprise accounts can set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated partner tool to import metric definitions.
+ * Developer accounts can query the Proxy Server using SQL but won't be able to browse dbt metrics in external tools, which requires access to the Discovery API.
+
+
+1. Log in to your dbt Cloud account.
+2. Go to **Account Settings**, and then **Service Tokens** to create a new [service account API token](/docs/dbt-cloud-apis/service-tokens). Save your token somewhere safe.
+3. Assign permissions to service account tokens depending on the integration tool you choose. Refer to the [integration partner documentation](https://www.getdbt.com/product/semantic-layer-integrations) to determine the permission sets you need to assign.
+4. Go to **Deploy** > **Environments**, and select your **Deployment** environment.
+5. Click **Settings** on the top right side of the page.
+6. Click **Edit** on the top right side of the page.
+7. Select dbt version 1.2 or higher.
+8. Toggle the Semantic Layer **On**.
+9. Copy the full proxy server URL (like `https://eagle-hqya7.proxy.cloud.getdbt.com`) to connect to your [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations).
+10. Use the URL in the data source configuration of the integrated partner tool.
+11. Use the data platform login credentials that make sense for how the data is consumed.
+
+:::info📌
+
+It is _not_ recommended that you use your dbt Cloud credentials due to elevated permissions. Instead, you can use your specific integration tool permissions.
+
+:::
+
+12. Set up the [Discovery API](/docs/dbt-cloud-apis/discovery-api) (Team and Enterprise accounts only) in the integrated partner tool to import the metric definitions. The [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations) will treat the dbt Server as another data source (like a data platform). This requires:
+
+- The account ID, environment ID, and job ID (which is visible in the job URL)
+- An [API service token](/docs/dbt-cloud-apis/service-tokens) with job admin and metadata permissions
+- Add the items above to the relevant fields in your integration tool
+
+
+
+
+
+
+## Related docs
+
+- [Build your metrics](/docs/build/build-metrics-intro)
+- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations)
+- [Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview)
+- [Migrate your legacy Semantic Layer](/guides/sl-migration)
+- [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl)
diff --git a/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md
new file mode 100644
index 00000000000..75a853fcbe8
--- /dev/null
+++ b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md
@@ -0,0 +1,82 @@
+---
+title: "dbt Semantic Layer architecture"
+id: sl-architecture
+description: "dbt Semantic Layer product architecture and related questions."
+sidebar_label: "Architecture"
+tags: [Semantic Layer]
+pagination_next: null
+---
+
+
+
+
+The dbt Semantic Layer allows you to define metrics and use various interfaces to query them. The Semantic Layer does the heavy lifting to find where the queried data exists in your data platform and generates the SQL to make the request (including performing joins).
+
+
+
+## dbt Semantic Layer components
+
+The dbt Semantic Layer includes the following components:
+
+
+| Components | Information | dbt Core users | Developer plans | Team plans | Enterprise plans | License |
+| --- | --- | :---: | :---: | :---: | :---: | --- |
+| **[MetricFlow](/docs/build/about-metricflow)** | MetricFlow in dbt allows users to centrally define their semantic models and metrics with YAML specifications. | ✅ | ✅ | ✅ | ✅ | BSL package (code is source available) |
+| **MetricFlow Server**| A proprietary server that takes metric requests and generates optimized SQL for the specific data platform. | ❌ | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)|
+| **Semantic Layer Gateway** | A service that passes queries to the MetricFlow server and executes the SQL generated by MetricFlow against the data platform | ❌ | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) |
+| **Semantic Layer APIs** | The interfaces allow users to submit metric queries using GraphQL and JDBC APIs. They also serve as the foundation for building first-class integrations with various tools. | ❌ | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)|
+
+
+## Related questions
+
+
+ How do I migrate from the legacy Semantic Layer to the new one?
+
+ If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
+
+
+
+
+How are you storing my data?
+User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours.
+
+
+ Is the dbt Semantic Layer open source?
+The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, such as dbt-core and MetricFlow.
dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE, dbt Cloud CLI, or dbt Core CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.
Refer to Billing for more information.
+
+
+ Is there a dbt Semantic Layer discussion hub?
+
+ Yes absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer slack channel for all things related to the dbt Semantic Layer.
+
+
+
+
+
+
+
+
+import DeprecationNotice from '/snippets/_sl-deprecation-notice.md';
+
+
+
+## Product architecture
+
+The dbt Semantic Layer product architecture includes four primary components:
+
+| Components | Information | Developer plans | Team plans | Enterprise plans | License |
+| --- | --- | :---: | :---: | :---: | --- |
+| **[dbt project](/docs/build/metrics)** | Define models and metrics in dbt Core. *Note, we will deprecate and no longer support the dbt_metrics package.* | ✅ | ✅ | ✅ | Open source, Core |
+| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL |
+| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅ _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) |
+| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) |
+
+
+
+dbt Semantic Layer integrations will:
+
+- Leverage the Discovery API to fetch a list of objects and their attributes, like metrics
+- Generate a dbt-SQL statement
+- Then query the SQL proxy to evaluate the results of this statement
+
+
diff --git a/website/docs/docs/use-dbt-semantic-layer/tableau.md b/website/docs/docs/use-dbt-semantic-layer/tableau.md
new file mode 100644
index 00000000000..1d283023dda
--- /dev/null
+++ b/website/docs/docs/use-dbt-semantic-layer/tableau.md
@@ -0,0 +1,82 @@
+---
+title: "Tableau (beta)"
+description: "Use Tableau worksheets to query the dbt Semantic Layer and produce dashboards with trusted date."
+tags: [Semantic Layer]
+sidebar_label: "Tableau (beta)"
+---
+
+:::info Beta functionality
+The Tableau integration with the dbt Semantic Layer is a [beta feature](/docs/dbt-versions/product-lifecycles#dbt-cloud).
+:::
+
+
+The Tableau integration allows you to use worksheets to query the Semantic Layer directly and produce your dashboards with trusted data.
+
+This integration provides a live connection to the dbt Semantic Layer through Tableau Desktop or Tableau Server.
+
+## Prerequisites
+
+- You have [configured the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) and are using dbt v1.6 or higher.
+- You must have [Tableau Desktop](https://www.tableau.com/en-gb/products/desktop) version 2021.1 or greater, or Tableau Server.
+ - Note that Tableau Online does not currently support custom connectors natively. If you use Tableau Online, you will only be able to access the connector in Tableau Desktop.
+- Log in to Tableau Desktop (with Online or Server credentials) or have a license for Tableau Server.
+- You need your dbt Cloud host, [Environment ID](/docs/use-dbt-semantic-layer/setup-sl#set-up-dbt-semantic-layer) and [service token](/docs/dbt-cloud-apis/service-tokens) to log in. This account should be set up with the dbt Semantic Layer.
+- You must have a dbt Cloud Team or Enterprise [account](https://www.getdbt.com/pricing) and multi-tenant [deployment](/docs/cloud/about-cloud/regions-ip-addresses). (Single-Tenant coming soon)
+
+
+## Installing the connector
+
+1. Download the GitHub [connector file](https://github.com/dbt-labs/semantic-layer-tableau-connector/releases/download/v1.0.2/dbt_semantic_layer.taco) locally and add it to your default folder:
+
+| Operating system |Tableau Desktop | Tableau Server |
+| ---------------- | -------------- | -------------- |
+| Windows | `C:\Users\\[Windows User]\Documents\My Tableau Repository\Connectors` | `C:\Program Files\Tableau\Connectors` |
+| Mac | `/Users/[user]/Documents/My Tableau Repository/Connectors` | Not applicable |
+| Linux | `/opt/tableau/connectors` | `/opt/tableau/connectors` |
+
+2. Install the [JDBC driver](/docs/dbt-cloud-apis/sl-jdbc) to the folder based on your operating system:
+ - Windows: `C:\Program Files\Tableau\Drivers`
+ - Mac: `~/Library/Tableau/Drivers`
+ - Linux: `/opt/tableau/tableau_driver/jdbc`
+3. Open Tableau Desktop or Tableau Server and find the **dbt Semantic Layer by dbt Labs** connector on the left-hand side. You may need to restart these applications for the connector to be available.
+4. Connect with your Host, Environment ID, and Service Token information dbt Cloud provides during [Semantic Layer configuration](/docs/use-dbt-semantic-layer/setup-sl#:~:text=After%20saving%20it%2C%20you%27ll%20be%20provided%20with%20the%20connection%20information%20that%20allows%20you%20to%20connect%20to%20downstream%20tools).
+ - In Tableau Server, the authentication screen may show "User" & "Password" instead, in which case the User is the Environment ID and the password is the Service Token.
+
+
+## Using the integration
+
+1. **Authentication** — Once you authenticate, the system will direct you to the data source page with all the metrics and dimensions configured in your dbt Semantic Layer.
+2. **Access worksheet** — From there, go directly to a worksheet in the bottom left-hand corner.
+3. **Access metrics and dimensions** — Then, you'll find all the metrics and dimensions that are available to query on the left side of your window.
+
+Visit the [Tableau documentation](https://help.tableau.com/current/pro/desktop/en-us/gettingstarted_overview.htm) to learn more about how to use Tableau worksheets and dashboards.
+
+### Publish from Tableau Desktop to Tableau Server
+
+- **From Desktop to Server** — Like any Tableau workflow, you can publish your workbook from Tableau Desktop to Tableau Server. For step-by-step instructions, visit Tableau's [publishing guide](https://help.tableau.com/current/pro/desktop/en-us/publish_workbooks_share.htm).
+
+
+## Things to note
+
+- All metrics use the "SUM" aggregation type, and this can't be altered. The dbt Semantic Layer controls the aggregation type and it is intentionally fixed. Keep in mind that the underlying aggregation in the dbt Semantic Layer might not be "SUM" (even though "SUM" is Tableau's default).
+- Tableau surfaces all metrics and dimensions from the dbt Semantic Layer on the left-hand side. Note that not all metrics and dimensions can be combined with one another. You will receive an error message if a particular dimension cannot be sliced with a metric (or vice versa).
+ - To display available metrics and dimensions, dbt Semantic Layer returns metadata for a fake table with the dimensions and metrics as 'columns' on this table. Because of this, you can't actually query this table for previews or extracts.
+ - Since this is treated as a table, dbt Semantic Layer can't dynamically change what is available. This means we display _all_ available metrics and dimensions even if a particular metric and dimension combination isn't available.
+
+- Certain Table calculations like "Totals" and "Percent Of" may not be accurate when using metrics aggregated in a non-additive way (such as count distinct).
+- In any of our Semantic Layer interfaces (not only Tableau), you must include a [time dimension](/docs/build/cumulative#limitations) when working with any cumulative metric that has a time window or granularity.
+
+## Unsupported functionality
+
+The following Tableau features aren't supported at this time; however, the dbt Semantic Layer may support some of this functionality in a future release:
+
+- Updating the data source page
+- Using "Extract" mode to view your data
+- Unioning Tables
+- Writing Custom SQL / Initial SQL
+- Table Extensions
+- Cross-Database Joins
+- All functions in Analysis --> Create Calculated Field
+- Filtering on a Date Part time dimension for a Cumulative metric type
+- Changing your date dimension to use "Week Number"
+
diff --git a/website/docs/docs/verified-adapters.md b/website/docs/docs/verified-adapters.md
index 9604d05391c..75c7529c247 100644
--- a/website/docs/docs/verified-adapters.md
+++ b/website/docs/docs/verified-adapters.md
@@ -1,30 +1,24 @@
---
title: "Verified adapters"
id: "verified-adapters"
+hide_table_of_contents: true
---
-The dbt Labs has a rigorous verified adapter program which provides reassurance to users about which adapters can be trusted to use in production, has been tested, and is actively maintained and updated. The process covers aspects of development, documentation, user experience, and maintenance.
+dbt Labs has a rigorous verified adapter program that gives users assurance about which adapters can be trusted for use in production: verified adapters have been tested and are actively maintained and updated. The process covers development, documentation, user experience, and maintenance aspects.
These adapters then earn a "Verified" status so that users can have a certain level of trust and expectation when they use them. The adapters also have maintainers and we recommend using the adapter's verification status to determine its quality and health.
-Here's the list of the verified data platforms that can connect to dbt and its latest version.
+The verification process serves as the on-ramp to integration with dbt Cloud. As such, we restrict applicants to data platform vendors with whom we are already engaged.
-| dbt Cloud setup | CLI installation | latest verified version |
-| ---------------- | ----------------------------------------- | ------------------------ |
-| [Setup AlloyDB](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install AlloyDB](/docs/core/connect-data-platform/alloydb-setup) | (same as `dbt-postgres`) |
-| Not supported | [Install Azure Synapse](/docs/core/connect-data-platform/azuresynapse-setup) | 1.3 :construction: |
-| [Set up BigQuery](/docs/cloud/connect-data-platform/connect-bigquery) | [Install BigQuery](/docs/core/connect-data-platform/bigquery-setup) | 1.4 |
-| [Set up Databricks ](/docs/cloud/connect-data-platform/connect-databricks)| [ Install Databricks](/docs/core/connect-data-platform/databricks-setup) | 1.4 |
-| Not supported | [Install Dremio](/docs/core/connect-data-platform/dremio-setup) | 1.4 :construction: |
-| [Set up Postgres](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install Postgres](/docs/core/connect-data-platform/postgres-setup) | 1.4 |
-| [Set up Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install Redshift](/docs/core/connect-data-platform/redshift-setup) | 1.4 |
-| [Set up Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) | [ Install Snowflake](/docs/core/connect-data-platform/snowflake-setup) | 1.4 |
-| [Set up Spark](/docs/cloud/connect-data-platform/connect-apache-spark) | [Install Spark](/docs/core/connect-data-platform/spark-setup) | 1.4 |
-| [Set up Starburst & Trino](/docs/cloud/connect-data-platform/connect-starburst-trino)| [Installl Starburst & Trino](/docs/core/connect-data-platform/trino-setup) | 1.4 |
+To learn more, refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide.
-:construction:: Verification in progress
+import MSCallout from '/snippets/_microsoft-adapters-soon.md';
-To learn more, see [Verifying a new adapter](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter).
+
+Here are the verified data platforms that can connect to dbt, along with the latest verified version of each.
+import AdaptersVerified from '/snippets/_adapters-verified.md';
+
+
diff --git a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md
index 76d03870478..f8daf393f9b 100644
--- a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md
+++ b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md
@@ -38,7 +38,7 @@ To unlock your account and select a plan, review the following guidance per plan
2. To unlock your account and continue using the Team plan, you need to enter your payment details.
3. Go to **Payment Information** and click **Edit** on the right.
4. Enter your payment details and click **Save**.
-5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉
+5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉
@@ -59,7 +59,7 @@ For commonly asked billings questions, refer to the dbt Cloud [pricing page](htt
How does billing work?
- Team plans are billed monthly on the credit card used to sign up, based on developer seat count. You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings -> Billing page.
+ Team plans are billed monthly on the credit card used to sign up, based on [developer seat count and usage](/docs/cloud/billing). You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings > Billing page.
Enterprise plan customers are billed annually based on the number of developer seats, as well as any additional services + features in your chosen plan.
@@ -75,7 +75,7 @@ For commonly asked billings questions, refer to the dbt Cloud [pricing page](htt
Can I pay by invoice?
- At present, dbt Cloud Team plan payments must be made via credit card, and by default they will be billed monthly based on the number of developer seats.
+ Currently, dbt Cloud Team plan payments must be made with a credit card, and by default they will be billed monthly based on the number of [developer seats and usage](/docs/cloud/billing).
We don’t have any plans to do invoicing for Team plan accounts in the near future, but we do currently support invoices for companies on the dbt Cloud Enterprise plan. Feel free to contact us to build your Enterprise pricing plan.
diff --git a/website/docs/faqs/Accounts/payment-accepted.md b/website/docs/faqs/Accounts/payment-accepted.md
index 2e26063c684..c0e949833a2 100644
--- a/website/docs/faqs/Accounts/payment-accepted.md
+++ b/website/docs/faqs/Accounts/payment-accepted.md
@@ -5,6 +5,6 @@ sidebar_label: 'Can I pay invoice'
id: payment-accepted
---
-Presently for Team plans, self-service dbt Cloud payments must be made via credit card and by default, they will be billed monthly based on the number of active developer seats.
+Currently for Team plans, self-service dbt Cloud payments must be made with a credit card and by default, they will be billed monthly based on the number of [active developer seats and usage](/docs/cloud/billing).
We don't have any plans to do invoicing for self-service teams in the near future, but we *do* currently support invoices for companies on the **dbt Cloud Enterprise plan.** Feel free to [contact us](https://www.getdbt.com/contact) to build your Enterprise pricing.
diff --git a/website/docs/faqs/Accounts/slack.md b/website/docs/faqs/Accounts/slack.md
deleted file mode 100644
index 4faa60fb09a..00000000000
--- a/website/docs/faqs/Accounts/slack.md
+++ /dev/null
@@ -1,8 +0,0 @@
----
-title: How do I set up Slack notifications?
-description: "Instructions on how to set up slack notifications"
-sidebar_label: 'How to set up Slack'
-id: slack
----
-
-
diff --git a/website/docs/faqs/Core/install-python-compatibility.md b/website/docs/faqs/Core/install-python-compatibility.md
index d24466f4990..5c536101f0c 100644
--- a/website/docs/faqs/Core/install-python-compatibility.md
+++ b/website/docs/faqs/Core/install-python-compatibility.md
@@ -17,18 +17,12 @@ The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.
-
+
The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.9, and 3.10
-
-
-As of v1.0, `dbt-core` is compatible with Python versions 3.7, 3.8, and 3.9.
-
-
-
Adapter plugins and their dependencies are not always compatible with the latest version of Python. For example, dbt-snowflake v0.19 is not compatible with Python 3.9, but dbt-snowflake versions 0.20+ are.
New dbt minor versions will add support for new Python3 minor versions as soon as all dependencies can support it. In turn, dbt minor versions will drop support for old Python3 minor versions right before they reach [end of life](https://endoflife.date/python).
diff --git a/website/docs/faqs/Docs/documenting-macros.md b/website/docs/faqs/Docs/documenting-macros.md
index cbc12b988c6..9a2036cd6bf 100644
--- a/website/docs/faqs/Docs/documenting-macros.md
+++ b/website/docs/faqs/Docs/documenting-macros.md
@@ -5,8 +5,6 @@ sidebar_label: 'Document macros'
id: documenting-macros
---
-The `macros:` key is new in 0.16.0.
-
To document macros, use a [schema file](/reference/macro-properties) and nest the configurations under a `macros:` key
## Example
diff --git a/website/docs/faqs/Environments/beta-release.md b/website/docs/faqs/Environments/beta-release.md
deleted file mode 100644
index 5eef07d3510..00000000000
--- a/website/docs/faqs/Environments/beta-release.md
+++ /dev/null
@@ -1,7 +0,0 @@
----
-title: What is a beta release?
-description: "How to try out beta features"
-sidebar_label: 'What is a beta release?'
-id: beta-release
----
-This is a chance to try out brand-new functionality. You get to start planning for use cases that the next minor version will unlock. We get to hear from you about unexpected behavior and nasty bugs, so that the release candidate has more polish and fewer surprises.
diff --git a/website/docs/faqs/Environments/custom-branch-settings.md b/website/docs/faqs/Environments/custom-branch-settings.md
index 95929d2d393..4bc4b85be02 100644
--- a/website/docs/faqs/Environments/custom-branch-settings.md
+++ b/website/docs/faqs/Environments/custom-branch-settings.md
@@ -1,7 +1,7 @@
---
-title: How do I use the `Custom Branch` settings in a dbt Cloud Environment?
+title: How do I use the 'Custom Branch' settings in a dbt Cloud Environment?
description: "Use custom code from your repository"
-sidebar_label: 'Custom Branch settings'
+sidebar_label: 'Custom branch settings'
id: custom-branch-settings
---
@@ -15,12 +15,21 @@ To specify a custom branch:
## Development
-In a development environment, the default branch (commonly the `main` branch) is a read-only branch found in the IDE's connected repositories, which you can use to create development branches. Identifying a custom branch overrides this default behavior. Instead, your custom branch becomes read-only and can be used to create development branches. You will no longer be able to make commits to the custom branch from within the dbt Cloud IDE.
+In a development environment, the default branch (usually named `main`) is a read-only branch in your connected repositories, from which you can create new development branches.
-For example, you can use the `develop` branch of a connected repository. Edit an environment, select **Only run on a custom branch** in **General settings** , enter **develop** as the name of your custom branch.
+Specifying a **Custom branch** overrides the default behavior. It makes the custom branch 'read-only' and enables you to create new development branches from it. This also means you can't edit this custom branch directly.
-
+Only one branch can be read-only, which means when you set up a custom branch, your `main` branch (usually read-only) becomes editable. If you want to protect the `main` branch and prevent any commits on it, you need to set up branch protection rules in your git provider settings. This ensures your `main` branch remains secure and no new commits can be made to it.
+
+For example, if you want to use the `develop` branch of a connected repository:
+
+- Go to an environment and select **Settings** to edit it
+- Select **Only run on a custom branch** in **General settings**
+- Enter **develop** as the name of your custom branch
+- Click **Save**
+
+
## Deployment
-When running jobs in a deployment environment, dbt will clone your project from your connected repository before executing your models. By default, dbt uses the default branch of your repository (commonly the `main` branch). To specify a different version of your project for dbt to execute during job runs in a particular environment, you can edit the Custom Branch setting as shown in the previous steps.
\ No newline at end of file
+When running jobs in a deployment environment, dbt will clone your project from your connected repository before executing your models. By default, dbt uses the default branch of your repository (commonly the `main` branch). To specify a different version of your project for dbt to execute during job runs in a particular environment, you can edit the Custom Branch setting as shown in the previous steps.
diff --git a/website/docs/faqs/Environments/delete-environment-job.md b/website/docs/faqs/Environments/delete-environment-job.md
index b649769f070..eb9ac511a7c 100644
--- a/website/docs/faqs/Environments/delete-environment-job.md
+++ b/website/docs/faqs/Environments/delete-environment-job.md
@@ -8,16 +8,7 @@ id: delete-environment-job
To delete an environment or job in dbt Cloud, you must have a `developer` [license](/docs/cloud/manage-access/seats-and-users) and have the necessary [access permissions](/docs/cloud/manage-access/about-user-access).
-:::info 📌 Delete a job first before deleting environment
-
-Deleting an environment doesn't automatically delete its associated job(s). If you delete an environment first without deleting the job, you won't be able to delete the job since it's without an environment.
-
-To completely delete your environment, you _must_:
-1. First delete all jobs associated with that environment,
-2. Then, delete the environment.
-:::
-
-**Delete a job**
+## Delete a job
To delete a job or multiple jobs in dbt Cloud:
@@ -33,11 +24,11 @@ To delete a job or multiple jobs in dbt Cloud:
5. Confirm your action in the **Confirm Delete** pop-up by clicking **Confirm Delete** in the bottom right to delete the job immediately. This action cannot be undone. However, you can create a new job with the same information if the deletion was made in error.
-Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each individual job.
+Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each job.
-**Delete an environment**
+## Delete an environment
-To delete an environment in dbt Cloud:
+Deleting an environment automatically deletes its associated job(s). If you want to keep those jobs, move them to a different environment first. To delete an environment in dbt Cloud:
1. Click **Deploy** on the navigation header and then click **Environments**
2. Select the Environment you want to delete.
@@ -54,4 +45,4 @@ To delete an environment in dbt Cloud:
Refresh your page, and the deleted environment should now be gone. If you want to delete multiple environments, you'll need to perform these steps to delete each one.
-If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help.
\ No newline at end of file
+If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help.
diff --git a/website/docs/faqs/Git/git-migration.md b/website/docs/faqs/Git/git-migration.md
new file mode 100644
index 00000000000..775ae3679e3
--- /dev/null
+++ b/website/docs/faqs/Git/git-migration.md
@@ -0,0 +1,26 @@
+---
+title: "How to migrate git providers"
+sidebar_label: "How to migrate git providers"
+id: "git-migration"
+hide_table_of_contents: true
+description: "Learn how to migrate git providers in dbt Cloud with minimal disruption."
+tags: [Git]
+---
+
+To migrate from one git provider to another, follow these steps to minimize disruption:
+
+1. Outside of dbt Cloud, you'll need to import your existing repository into your new provider.
+
+   As an example, if you're migrating from GitHub to Azure DevOps, you'll need to import your existing repository (GitHub) into your new git provider (Azure DevOps). For detailed steps on how to do this, refer to your git provider's documentation (such as [GitHub](https://docs.github.com/en/migrations/importing-source-code/using-github-importer/importing-a-repository-with-github-importer), [GitLab](https://docs.gitlab.com/ee/user/project/import/repo_by_url.html), [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/repos/git/import-git-repository?view=azure-devops)).
+
+2. Go back to dbt Cloud and set up your [integration for the new git provider](/docs/cloud/git/connect-github), if needed.
+3. Disconnect the old repository in dbt Cloud by going to **Account Settings** and then **Projects**. Click on the **Repository** link, then click **Edit** and **Disconnect**.
+
+
+
+4. On the same page, connect to the new git provider repository by clicking **Configure Repository**.
+   - If you're using the native integration, you may need to authorize it with OAuth.
+
+5. That's it, you should now be connected to the new git provider! 🎉
+
+Tip: We recommend refreshing your page and the dbt Cloud IDE before performing any actions.
diff --git a/website/docs/faqs/Jinja/jinja-whitespace.md b/website/docs/faqs/Jinja/jinja-whitespace.md
index 49ced7183b7..5e1ec3dc7ac 100644
--- a/website/docs/faqs/Jinja/jinja-whitespace.md
+++ b/website/docs/faqs/Jinja/jinja-whitespace.md
@@ -7,6 +7,6 @@ id: jinja-whitespace
This is known as "whitespace control".
-Use a minus sign (`-`, e.g. `{{- ... -}}`, `{%- ... %}`, `{#- ... -#}`) at the start or end of a block to strip whitespace before or after the block (more docs [here](https://jinja.palletsprojects.com/page/templates/#whitespace-control)). Check out the [tutorial on using Jinja](/guides/advanced/using-jinja#use-whitespace-control-to-tidy-up-compiled-code) for an example.
+Use a minus sign (`-`, e.g. `{{- ... -}}`, `{%- ... %}`, `{#- ... -#}`) at the start or end of a block to strip whitespace before or after the block (more docs [here](https://jinja.palletsprojects.com/page/templates/#whitespace-control)). Check out the [tutorial on using Jinja](/guides/using-jinja#use-whitespace-control-to-tidy-up-compiled-code) for an example.
Take caution: it's easy to fall down a rabbit hole when it comes to whitespace control!
diff --git a/website/docs/faqs/Models/available-materializations.md b/website/docs/faqs/Models/available-materializations.md
index 25ba745a2b2..bf11c92b595 100644
--- a/website/docs/faqs/Models/available-materializations.md
+++ b/website/docs/faqs/Models/available-materializations.md
@@ -5,6 +5,7 @@ sidebar_label: 'Materializations available'
id: available-materializations
---
-dbt ships with four materializations : `view`, `table`, `incremental` and `ephemeral`. Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options.
+dbt ships with five materializations: `view`, `table`, `incremental`, `ephemeral`, and `materialized_view`.
+Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options.
-You can also create your own [custom materializations](/guides/advanced/creating-new-materializations), if required however this is an advanced feature of dbt.
+You can also create your own [custom materializations](/guides/create-new-materializations) if required; however, this is an advanced feature of dbt.
diff --git a/website/docs/faqs/Models/configurable-model-path.md b/website/docs/faqs/Models/configurable-model-path.md
index 6e8861a0693..c34112a5fe1 100644
--- a/website/docs/faqs/Models/configurable-model-path.md
+++ b/website/docs/faqs/Models/configurable-model-path.md
@@ -6,12 +6,6 @@ id: configurable-model-path
---
-
-
-- **v1.0.0:** The config 'source-path' has been deprecated in favor of [`model-paths`](/reference/project-configs/model-paths).
-
-
-
By default, dbt expects the files defining your models to be located in the `models` subdirectory of your project.
To change this, update the [model-paths](reference/project-configs/model-paths.md) configuration in your `dbt_project.yml`
diff --git a/website/docs/faqs/Models/create-dependencies.md b/website/docs/faqs/Models/create-dependencies.md
index 6a01aa18dca..e902d93b018 100644
--- a/website/docs/faqs/Models/create-dependencies.md
+++ b/website/docs/faqs/Models/create-dependencies.md
@@ -44,4 +44,4 @@ Found 2 models, 28 tests, 0 snapshots, 0 analyses, 130 macros, 0 operations, 0 s
Done. PASS=2 WARN=0 ERROR=0 SKIP=0 TOTAL=2
```
-To learn more about building a dbt project, we recommend you complete the [quickstart guide](/quickstarts).
+To learn more about building a dbt project, we recommend you complete the [quickstart guide](/guides).
diff --git a/website/docs/faqs/Models/reference-models-in-another-project.md b/website/docs/faqs/Models/reference-models-in-another-project.md
deleted file mode 100644
index 19f3f52da31..00000000000
--- a/website/docs/faqs/Models/reference-models-in-another-project.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-title: How can I reference models or macros in another project?
-description: "Use packages to add another project to your dbt project"
-sidebar_label: 'Reference models or macros in another project'
-id: reference-models-in-another-project
-
----
-
-You can use [packages](/docs/build/packages) to add another project to your dbt
-project, including other projects you've created. Check out the [docs](/docs/build/packages)
-for more information!
diff --git a/website/docs/faqs/Project/docs-for-multiple-projects.md b/website/docs/faqs/Project/docs-for-multiple-projects.md
deleted file mode 100644
index b7aa1452b39..00000000000
--- a/website/docs/faqs/Project/docs-for-multiple-projects.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-title: Can I render docs for multiple projects?
-description: "Using packages to render docs for multiple projects"
-sidebar_label: 'Render docs for multiple projects'
-id: docs-for-multiple-projects
-
----
-
-Yes! To do this, you'll need to create a "super project" that lists each project as a dependent [package](/docs/build/packages) in a `packages.yml` file. Then run `dbt deps` to install the projects as packages, prior to running `dbt docs generate`.
-
-If you are going down the route of multiple projects, be sure to check out our advice [1](https://discourse.getdbt.com/t/should-i-have-an-organisation-wide-project-a-monorepo-or-should-each-work-flow-have-their-own/666) [2](https://discourse.getdbt.com/t/how-to-configure-your-dbt-repository-one-or-many/2121) on the topic.
diff --git a/website/docs/faqs/Project/example-projects.md b/website/docs/faqs/Project/example-projects.md
index f59d6e56e78..cd58c8832e2 100644
--- a/website/docs/faqs/Project/example-projects.md
+++ b/website/docs/faqs/Project/example-projects.md
@@ -8,7 +8,7 @@ id: example-projects
Yes!
-* **Quickstart Tutorial:** You can build your own example dbt project in the [quickstart guide](/quickstarts)
+* **Quickstart Tutorial:** You can build your own example dbt project in the [quickstart guide](/guides)
* **Jaffle Shop:** A demonstration project (closely related to the tutorial) for a fictional ecommerce store ([source code](https://github.com/dbt-labs/jaffle_shop))
* **MRR Playbook:** A demonstration project that models subscription revenue ([source code](https://github.com/dbt-labs/mrr-playbook), [docs](https://www.getdbt.com/mrr-playbook/#!/overview))
* **Attribution Playbook:** A demonstration project that models marketing attribution ([source code](https://github.com/dbt-labs/attribution-playbook), [docs](https://www.getdbt.com/attribution-playbook/#!/overview))
diff --git a/website/docs/faqs/Project/multiple-resource-yml-files.md b/website/docs/faqs/Project/multiple-resource-yml-files.md
index 422b7beb702..04e1702a162 100644
--- a/website/docs/faqs/Project/multiple-resource-yml-files.md
+++ b/website/docs/faqs/Project/multiple-resource-yml-files.md
@@ -9,4 +9,4 @@ It's up to you:
- Some folks find it useful to have one file per model (or source / snapshot / seed etc)
- Some find it useful to have one per directory, documenting and testing multiple models in one file
-Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview).
+Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](/best-practices/how-we-structure/1-guide-overview).
diff --git a/website/docs/faqs/Project/resource-yml-name.md b/website/docs/faqs/Project/resource-yml-name.md
index 8a6ebe96134..c26cff26474 100644
--- a/website/docs/faqs/Project/resource-yml-name.md
+++ b/website/docs/faqs/Project/resource-yml-name.md
@@ -10,4 +10,4 @@ It's up to you! Here's a few options:
- Use the same name as your directory (assuming you're using sensible names for your directories)
- If you test and document one model (or seed, snapshot, macro etc.) per file, you can give it the same name as the model (or seed, snapshot, macro etc.)
-Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview).
+Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](/best-practices/how-we-structure/1-guide-overview).
diff --git a/website/docs/faqs/Project/structure-a-project.md b/website/docs/faqs/Project/structure-a-project.md
index 5d73f9f25ba..a9ef53f5c8f 100644
--- a/website/docs/faqs/Project/structure-a-project.md
+++ b/website/docs/faqs/Project/structure-a-project.md
@@ -8,4 +8,4 @@ id: structure-a-project
There's no one best way to structure a project! Every organization is unique.
-If you're just getting started, check out how we (dbt Labs) [structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview).
+If you're just getting started, check out how we (dbt Labs) [structure our dbt projects](/best-practices/how-we-structure/1-guide-overview).
diff --git a/website/docs/faqs/Project/which-schema.md b/website/docs/faqs/Project/which-schema.md
index f0634ac8c85..2c21cba3c6a 100644
--- a/website/docs/faqs/Project/which-schema.md
+++ b/website/docs/faqs/Project/which-schema.md
@@ -7,7 +7,7 @@ id: which-schema
---
By default, dbt builds models in your target schema. To change your target schema:
* If you're developing in **dbt Cloud**, these are set for each user when you first use a development environment.
-* If you're developing with the **dbt CLI**, this is the `schema:` parameter in your `profiles.yml` file.
+* If you're developing with **dbt Core**, this is the `schema:` parameter in your `profiles.yml` file.
If you wish to split your models across multiple schemas, check out the docs on [using custom schemas](/docs/build/custom-schemas).
diff --git a/website/docs/faqs/Project/why-not-write-dml.md b/website/docs/faqs/Project/why-not-write-dml.md
index fd2cea7d3ad..210ef4a916d 100644
--- a/website/docs/faqs/Project/why-not-write-dml.md
+++ b/website/docs/faqs/Project/why-not-write-dml.md
@@ -30,4 +30,4 @@ You can test your models, generate documentation, create snapshots, and more!
SQL dialects tend to diverge the most in DML and DDL (rather than in `select` statements) — check out the example [here](/faqs/models/sql-dialect). By writing less SQL, it can make a migration to a new database technology easier.
-If you do need to write custom DML, there are ways to do this in dbt using [custom materializations](/guides/advanced/creating-new-materializations).
+If you do need to write custom DML, there are ways to do this in dbt using [custom materializations](/guides/create-new-materializations).
diff --git a/website/docs/faqs/Runs/checking-logs.md b/website/docs/faqs/Runs/checking-logs.md
index dbfdb6806a1..ff5e6f5cf04 100644
--- a/website/docs/faqs/Runs/checking-logs.md
+++ b/website/docs/faqs/Runs/checking-logs.md
@@ -10,7 +10,7 @@ To check out the SQL that dbt is running, you can look in:
* dbt Cloud:
* Within the run output, click on a model name, and then select "Details"
-* dbt CLI:
+* dbt Core:
* The `target/compiled/` directory for compiled `select` statements
* The `target/run/` directory for compiled `create` statements
* The `logs/dbt.log` file for verbose logging.
diff --git a/website/docs/faqs/Runs/failed-tests.md b/website/docs/faqs/Runs/failed-tests.md
index bfee565ef61..d19023d035d 100644
--- a/website/docs/faqs/Runs/failed-tests.md
+++ b/website/docs/faqs/Runs/failed-tests.md
@@ -10,7 +10,7 @@ To debug a failing test, find the SQL that dbt ran by:
* dbt Cloud:
* Within the test output, click on the failed test, and then select "Details"
-* dbt CLI:
+* dbt Core:
* Open the file path returned as part of the error message.
* Navigate to the `target/compiled/schema_tests` directory for all compiled test queries
diff --git a/website/docs/faqs/Tests/configurable-data-path.md b/website/docs/faqs/Tests/configurable-data-path.md
index 7c4e92f7226..7663d2d3f11 100644
--- a/website/docs/faqs/Tests/configurable-data-path.md
+++ b/website/docs/faqs/Tests/configurable-data-path.md
@@ -6,12 +6,6 @@ id: configurable-data-path
---
-
-
-- **v1.0.0:** The config 'data-paths' has been deprecated in favor of [`seed-paths`](/reference/project-configs/seed-paths).
-
-
-
By default, dbt expects your seed files to be located in the `seeds` subdirectory
of your project.
diff --git a/website/docs/faqs/Tests/custom-test-thresholds.md b/website/docs/faqs/Tests/custom-test-thresholds.md
index 7155b39d25e..34d2eec7494 100644
--- a/website/docs/faqs/Tests/custom-test-thresholds.md
+++ b/website/docs/faqs/Tests/custom-test-thresholds.md
@@ -11,4 +11,4 @@ As of `v0.20.0`, you can use the `error_if` and `warn_if` configs to set custom
For dbt `v0.19.0` and earlier, you could try these possible solutions:
* Setting the [severity](/reference/resource-properties/tests#severity) to `warn`, or:
-* Writing a [custom generic test](/guides/best-practices/writing-custom-generic-tests) that accepts a threshold argument ([example](https://discourse.getdbt.com/t/creating-an-error-threshold-for-schema-tests/966))
+* Writing a [custom generic test](/best-practices/writing-custom-generic-tests) that accepts a threshold argument ([example](https://discourse.getdbt.com/t/creating-an-error-threshold-for-schema-tests/966))
diff --git a/website/docs/faqs/Tests/testing-seeds.md b/website/docs/faqs/Tests/testing-seeds.md
index 93afcab2fa4..3b1b3e0df56 100644
--- a/website/docs/faqs/Tests/testing-seeds.md
+++ b/website/docs/faqs/Tests/testing-seeds.md
@@ -6,8 +6,6 @@ id: testing-seeds
---
-The `seeds:` key is new in 0.16.0. Prior to this, use a `models:` key instead.
-
To test and document seeds, use a [schema file](/reference/configs-and-properties) and nest the configurations under a `seeds:` key
## Example
diff --git a/website/docs/faqs/Warehouse/bq-oauth-drive-scope.md b/website/docs/faqs/Warehouse/bq-oauth-drive-scope.md
new file mode 100644
index 00000000000..ae6da82c47a
--- /dev/null
+++ b/website/docs/faqs/Warehouse/bq-oauth-drive-scope.md
@@ -0,0 +1,8 @@
+---
+title: Why does the BigQuery OAuth application require scopes to Google Drive?
+description: "Learn more about Google Drive scopes in the BigQuery OAuth application"
+sidebar_label: "BigQuery OAuth Drive Scopes"
+id: bq-oauth-drive-scope
+---
+
+BigQuery supports external tables over both personal Google Drive files and shared files. For more information, refer to [Create Google Drive external tables](https://cloud.google.com/bigquery/docs/external-data-drive).
diff --git a/website/docs/faqs/Warehouse/database-privileges.md b/website/docs/faqs/Warehouse/database-privileges.md
index 73e0549f130..3761b81fe67 100644
--- a/website/docs/faqs/Warehouse/database-privileges.md
+++ b/website/docs/faqs/Warehouse/database-privileges.md
@@ -12,8 +12,8 @@ schema¹
* read system views to generate documentation (i.e. views in
`information_schema`)
-On Postgres, Redshift, and Snowflake, use a series of `grants` to ensure that
-your user has the correct privileges.
+On Postgres, Redshift, Databricks, and Snowflake, use a series of `grants` to ensure that
+your user has the correct privileges. Check out [example permissions](/reference/database-permissions/about-database-permissions) for these warehouses.
On BigQuery, use the "BigQuery User" role to assign these privileges.
diff --git a/website/docs/faqs/Warehouse/db-connection-dbt-compile.md b/website/docs/faqs/Warehouse/db-connection-dbt-compile.md
index d8e58155b10..8017da4545b 100644
--- a/website/docs/faqs/Warehouse/db-connection-dbt-compile.md
+++ b/website/docs/faqs/Warehouse/db-connection-dbt-compile.md
@@ -22,7 +22,7 @@ To generate the compiled SQL for many models, dbt needs to run introspective que
These introspective queries include:
-- Populating the [relation cache](/guides/advanced/creating-new-materializations#update-the-relation-cache). Caching speeds up the metadata checks, including whether an [incremental model](/docs/build/incremental-models) already exists in the data platform.
+- Populating the relation cache. For more information, refer to the [Create new materializations](/guides/create-new-materializations) guide. Caching speeds up the metadata checks, including whether an [incremental model](/docs/build/incremental-models) already exists in the data platform.
- Resolving [macros](/docs/build/jinja-macros#macros), such as `run_query` or `dbt_utils.get_column_values` that you're using to template out your SQL. This is because dbt needs to run those queries during model SQL compilation.
Without a data platform connection, dbt can't perform these introspective queries and won't be able to generate the compiled SQL needed for the next steps in the dbt workflow. You can [`parse`](/reference/commands/parse) a project and use the [`list`](/reference/commands/list) resources in the project, without an internet or data platform connection. Parsing a project is enough to produce a [manifest](/reference/artifacts/manifest-json), however, keep in mind that the written-out manifest won't include compiled SQL.
diff --git a/website/docs/guides/adapter-creation.md b/website/docs/guides/adapter-creation.md
new file mode 100644
index 00000000000..8a9145f0258
--- /dev/null
+++ b/website/docs/guides/adapter-creation.md
@@ -0,0 +1,1352 @@
+---
+title: Build, test, document, and promote adapters
+id: adapter-creation
+description: "Create an adapter that connects dbt to your platform, and learn how to maintain and version that adapter."
+hoverSnippet: "Learn how to build, test, document, and promote adapters, as well as how to maintain and version an adapter."
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Adapter creation']
+level: 'Advanced'
+recently_updated: true
+---
+
+## Introduction
+
+Adapters are an essential component of dbt. At their most basic level, they are how dbt connects with the various supported data platforms. At a higher level, dbt Core adapters strive to give analytics engineers more transferable skills as well as standardize how analytics projects are structured. Gone are the days when you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt Core.
+
+Navigating and developing around the nuances of different databases can be daunting, but you are not alone. Visit the [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel for additional help beyond the documentation.
+
+### All databases are not the same
+
+There's a tremendous amount of work that goes into creating a database. Here is a high-level list of typical database layers (from the outermost layer moving inwards):
+- SQL API
+- Client Library / Driver
+- Server Connection Manager
+- Query parser
+- Query optimizer
+- Runtime
+- Storage Access Layer
+- Storage
+
+There's a lot more there than just SQL as a language. Databases (and data warehouses) are so popular because you can abstract away a great deal of the complexity from your brain to the database itself. This enables you to focus more on the data.
+
+dbt allows for further abstraction and standardization of the outermost layers of a database (SQL API, client library, connection manager) into a framework that both:
+ - Opens database technology to less technical users (a large swath of a DBA's role has been automated, similar to how the vast majority of folks with websites today no longer have to be "[webmasters](https://en.wikipedia.org/wiki/Webmaster)").
+ - Enables more meaningful conversations about how data warehousing should be done.
+
+This is where dbt adapters become critical.
+
+### What needs to be adapted?
+
+dbt adapters are responsible for _adapting_ dbt's standard functionality to a particular database. Our prototypical database and adapter are PostgreSQL and dbt-postgres, and most of our adapters are somewhat based on the functionality described in dbt-postgres.
+
+Connecting dbt to a new database will require a new adapter to be built or an existing adapter to be extended.
+
+The outermost layers of a database map roughly to the areas in which the dbt adapter framework encapsulates inter-database differences.
+
+### SQL API
+
+Even amongst ANSI-compliant databases, there are differences in the SQL grammar.
+Here are some categories and examples of SQL statements that can be constructed differently:
+
+
+| Category | Area of differences | Examples |
+|----------------------------------------------|----------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Statement syntax | The use of `IF EXISTS` | - `IF EXISTS, DROP TABLE`<br />- `DROP IF EXISTS` |
+| Workflow definition & semantics | Incremental updates | - `MERGE`<br />- `DELETE; INSERT` |
+| Relation and column attributes/configuration | Database-specific materialization configs | - `DIST = ROUND_ROBIN` (Synapse)<br />- `DIST = EVEN` (Redshift) |
+| Permissioning | Grant statements that can only take one grantee at a time vs those that accept lists of grantees | - `grant SELECT on table dinner.corn to corn_kid, everyone`<br />- `grant SELECT on table dinner.corn to corn_kid; grant SELECT on table dinner.corn to everyone` |
+
+### Python Client Library & Connection Manager
+
+The other big category of inter-database differences comes with how the client connects to the database and executes queries against the connection. To integrate with dbt, a data platform must have a pre-existing Python client library or support ODBC, using a generic Python library like `pyodbc`.
+
+| Category | Area of differences | Examples |
+|------------------------------|--------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------|
+| Credentials & authentication | Authentication | - Username & password<br />- MFA with `boto3` or Okta token |
+| Connection opening/closing | Create a new connection to db | - `psycopg2.connect(connection_string)`<br />- `google.cloud.bigquery.Client(...)` |
+| Inserting local data | Load seed `.csv` files into Python memory | - `google.cloud.bigquery.Client.load_table_from_file(...)` (BigQuery)<br />- `INSERT ... INTO VALUES ...` prepared statement (most other databases) |
+
+
+### How dbt encapsulates and abstracts these differences
+
+Differences between databases are encoded into discrete areas:
+
+| Components | Code Path | Function |
+|------------------|----------------------------------------------------|--------------------------------------------------------------------------------------------|
+| Python Classes | `adapters/<adapter_name>/` | Configuration (see [Python Classes](#python-classes) below) |
+| Macros | `include/<adapter_name>/macros/adapters/` | SQL API & statement syntax (for example, how to create schema or how to get table info) |
+| Materializations | `include/<adapter_name>/macros/materializations/` | Table/view/snapshot workflow definitions |
+
+
+#### Python Classes
+
+These classes implement all the methods responsible for:
+- Connecting to a database and issuing queries.
+- Providing dbt with database-specific configuration information.
+
+| Class | Description |
+|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| AdapterClass | High-level configuration type conversion and any database-specific python methods needed |
+| AdapterCredentials | Typed dictionary of possible profiles and associated methods |
+| AdapterConnectionManager | All the methods responsible for connecting to a database and issuing queries |
+| AdapterRelation | How relation names should be rendered, printed, and quoted. Do relation names use all three parts? `catalog.model_name` (two-part name) or `database.schema.model_name` (three-part name) |
+| AdapterColumn | How names should be rendered, and database-specific properties |
+
+#### Macros
+
+A set of *macros* responsible for generating SQL that is compliant with the target database.
+
+#### Materializations
+
+A set of *materializations* and their corresponding helper macros defined in dbt using jinja and SQL. They codify for dbt how model files should be persisted into the database.
+
+### Adapter Architecture
+
+
+Below is a diagram of how dbt-postgres, the adapter at the center of dbt-core, works.
+
+
+
+## Prerequisites
+
+It is very important that you have the right skills, and understand the level of difficulty required to make an adapter for your data platform.
+
+The more of the questions below you can answer "yes" to, the easier your adapter development (and user) experience will be. See the [New Adapter Information Sheet wiki](https://github.com/dbt-labs/dbt-core/wiki/New-Adapter-Information-Sheet) for even more specific questions.
+
+### Training
+
+- The developer (and any product managers) should ideally have substantial experience as an end user of dbt. If not, we highly advise that you at least take the [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) and [Advanced Materializations](https://courses.getdbt.com/courses/advanced-materializations) courses.
+
+### Database
+
+- Does the database complete transactions fast enough for interactive development?
+- Can you execute SQL against the data platform?
+- Is there a concept of schemas?
+- Does the data platform support ANSI SQL, or at least a subset?
+
+### Driver / Connection Library
+
+- Is there a Python-based driver for interacting with the database that is DB API 2.0 compliant (e.g. Psycopg2 for Postgres, pyodbc for SQL Server)?
+- Does it support prepared statements, multiple statements, or single sign-on token authorization to the data platform?
+
+### Open source software
+
+- Does your organization have an established process for publishing open source software?
+
+It is easiest to build an adapter for dbt when the data platform in question has:
+
+- a conventional ANSI-SQL interface (or as close to it as possible),
+- a mature connection library/SDK that uses ODBC or the Python DB API 2.0, and
+- a way to enable developers to iterate rapidly with both quick reads and writes
+
+### Maintaining your new adapter
+
+When your adapter becomes more popular and people start using it, you may quickly become the maintainer of an increasingly popular open source project. With this new role come some unexpected responsibilities that include not only code maintenance but also working with a community of users and contributors. To help people understand what to expect of your project, you should communicate your intentions early and often in your adapter documentation or README. Answer questions like: Is this experimental work that people should use at their own risk? Or is this production-grade code that you're committed to maintaining into the future?
+
+#### Keeping the code compatible with dbt Core
+
+New minor version releases of `dbt-core` may include changes to the Python interface for adapter plugins, as well as new or updated test cases. The maintainers of `dbt-core` will clearly communicate these changes in documentation and release notes, and they will aim for backwards compatibility whenever possible.
+
+Patch releases of `dbt-core` will _not_ include breaking changes to adapter-facing code. For more details, see ["About dbt Core versions"](/docs/dbt-versions/core).
+
+#### Versioning and releasing your adapter
+
+We strongly encourage you to adopt the following approach when versioning and releasing your plugin:
+
+- The minor version of your plugin should match the minor version in `dbt-core` (e.g. 1.1.x).
+- Aim to release a new version of your plugin for each new minor version of `dbt-core` (once every three months).
+- While your plugin is new, and you're iterating on features, aim to offer backwards compatibility and deprecation notices for at least one minor version. As your plugin matures, aim to leave backwards compatibility and deprecation notices in place until the next major version (dbt Core v2).
+- Release patch versions of your plugins whenever needed. These patch releases should contain fixes _only_.
+
+## Build a new adapter
+
+This step will walk you through creating the necessary adapter classes and macros, and provide some resources to help you validate that your new adapter is working correctly. Make sure you've familiarized yourself with the previous steps in this guide.
+
+Once the adapter is passing most of the functional tests in the previous "Testing a new adapter" step, please let the community know that it is available to use by adding the adapter to the ["Supported Data Platforms"](/docs/supported-data-platforms) page, following the steps given in "Documenting your adapter".
+
+For any questions you may have, don't hesitate to ask in the [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel. The community is very helpful and has likely run into a similar issue before.
+
+### Scaffolding a new adapter
+
+ To create a new adapter plugin from scratch, you can use the [dbt-database-adapter-scaffold](https://github.com/dbt-labs/dbt-database-adapter-scaffold) to trigger an interactive session which will generate a scaffolding for you to build upon.
+
+ Example usage:
+
+ ```shell
+ $ cookiecutter gh:dbt-labs/dbt-database-adapter-scaffold
+ ```
+
+The generated boilerplate starting project will include a basic adapter plugin file structure, examples of macros, high level method descriptions, etc.
+
+One of the most important choices you will make during cookiecutter generation is the `is_sql_adapter` field, a boolean used to correctly apply imports for either a `SQLAdapter` or a `BaseAdapter`. Knowing which one you need requires a deeper knowledge of your selected database, but here are a few good guidelines for the choice:
+
+- Does your database have a complete SQL API? Can it perform tasks using SQL such as creating schemas, dropping schemas, querying an `information_schema` for metadata calls? If so, it is more likely to be a SQLAdapter where you set `is_sql_adapter` to `True`.
+- Most adapters do fall under SQL adapters, which is why we chose `True` as the default value.
+- It is very possible to build out a fully functional `BaseAdapter`. This will require a little more groundwork, as it doesn't come with some of the prebuilt methods the `SQLAdapter` class provides. See `dbt-bigquery` as a good guide.
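+
+For illustration, here's a minimal sketch of roughly how that choice shows up in the plugin's classes. The `MyAdapter*` names follow this guide's naming convention; the exact class skeletons your scaffold produces may differ.
+
+```python
+# Sketch only: assumes is_sql_adapter = True (the common case), so the plugin
+# classes build on dbt's SQL-oriented base classes.
+from dbt.adapters.sql import SQLAdapter, SQLConnectionManager
+
+
+class MyAdapterConnectionManager(SQLConnectionManager):
+    TYPE = "myadapter"
+
+
+class MyAdapterAdapter(SQLAdapter):
+    ConnectionManager = MyAdapterConnectionManager
+
+# If is_sql_adapter were False, these classes would instead subclass
+# dbt.adapters.base.BaseAdapter and BaseConnectionManager, and you would
+# implement more of the metadata and caching behavior yourself.
+```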
+
+### Implementation Details
+
+Regardless of whether you use the cookiecutter template or create the plugin manually, this section will go over each method that you're required to implement. The table below provides a high-level overview of the classes, methods, and macros you may have to define for your data platform.
+
+| file | component | purpose |
+|---------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `./setup.py` | `setup()` function | adapter meta-data (package name, version, author, homepage, etc) |
+| `myadapter/dbt/adapters/myadapter/__init__.py` | `AdapterPlugin` | bundle all the information below into a dbt plugin |
+| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterCredentials` class | parameters to connect to and configure the database, via the chosen Python driver |
+| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterConnectionManager` class | telling dbt how to interact with the database with respect to opening/closing connections, executing queries, and fetching data. Effectively a wrapper around the db API or driver. |
+| `myadapter/dbt/include/myadapter/` | a dbt project of macro "overrides" in the format of "myadapter__" | any differences in SQL syntax for regular db operations will be modified here from the global_project (e.g. "Create Table As Select", "Get all relations in the current schema", etc) |
+| `myadapter/dbt/adapters/myadapter/impl.py` | `MyAdapterConfig` | database- and relation-level configs |
+| `myadapter/dbt/adapters/myadapter/impl.py` | `MyAdapterAdapter` | for changing _how_ dbt performs operations like macros and other needed Python functionality |
+| `myadapter/dbt/adapters/myadapter/column.py` | `MyAdapterColumn` | for defining database-specific column properties such as datatype mappings |
+
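+As a reference point, the `__init__.py` entry in the table above usually amounts to only a few lines that bundle these pieces into an `AdapterPlugin`. The following is a minimal sketch using this guide's `MyAdapter` naming; your generated file may differ slightly.
+
+```python
+# myadapter/dbt/adapters/myadapter/__init__.py (minimal sketch; names are illustrative)
+from dbt.adapters.base import AdapterPlugin
+
+from dbt.adapters.myadapter.connections import MyAdapterConnectionManager, MyAdapterCredentials  # noqa
+from dbt.adapters.myadapter.impl import MyAdapterAdapter
+from dbt.include import myadapter  # the packaged dbt project containing your macro overrides
+
+Plugin = AdapterPlugin(
+    adapter=MyAdapterAdapter,
+    credentials=MyAdapterCredentials,
+    include_path=myadapter.PACKAGE_PATH,
+)
+```
+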
+### Editing `setup.py`
+
+Edit the file at `myadapter/setup.py` and fill in the missing information.
+
+You can skip this step if you passed the arguments for `email`, `url`, `author`, and `dependencies` to the cookiecutter template script. If you plan on having nested macro folder structures, you may need to add entries to `package_data` so your macro source files get installed.
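+
+As a rough, hedged example, a filled-in `setup.py` might look like the sketch below. The package name, version, and the `myadapter-client` driver dependency are placeholders, not real packages; adjust them to your adapter and target `dbt-core` version.
+
+```python
+# myadapter/setup.py: minimal sketch with placeholder metadata and dependencies
+from setuptools import find_namespace_packages, setup
+
+setup(
+    name="dbt-myadapter",
+    version="1.6.0",
+    description="The MyAdapter plugin for dbt",
+    author="Your Name",
+    author_email="you@example.com",
+    url="https://github.com/example/dbt-myadapter",
+    packages=find_namespace_packages(include=["dbt", "dbt.*"]),
+    include_package_data=True,
+    package_data={
+        "dbt": [
+            "include/myadapter/dbt_project.yml",
+            "include/myadapter/macros/*.sql",
+            # add deeper globs here if you nest macro folders
+            "include/myadapter/macros/**/*.sql",
+        ]
+    },
+    install_requires=[
+        "dbt-core~=1.6",     # pin to the dbt-core minor version you target
+        "myadapter-client",  # hypothetical Python driver for your platform
+    ],
+)
+```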
+
+### Editing the connection manager
+
+Edit the connection manager at `myadapter/dbt/adapters/myadapter/connections.py`. This file is defined in the sections below.
+
+#### The Credentials class
+
+The credentials class defines all of the database-specific credentials (e.g. `username` and `password`) that users will need in the [connection profile](/docs/supported-data-platforms) for your new adapter. Each credentials contract should subclass `dbt.adapters.base.Credentials` and be implemented as a Python dataclass.
+
+Note that the base class includes required database and schema fields, as dbt uses those values internally.
+
+For example, if your adapter requires a host, integer port, username string, and password string, but host is the only required field, you'd add definitions for those new properties to the class as types, like this:
+
+
+
+```python
+
+from dataclasses import dataclass
+from typing import Optional
+
+from dbt.adapters.base import Credentials
+
+
+@dataclass
+class MyAdapterCredentials(Credentials):
+ host: str
+ port: int = 1337
+ username: Optional[str] = None
+ password: Optional[str] = None
+
+ @property
+ def type(self):
+ return 'myadapter'
+
+ @property
+ def unique_field(self):
+ """
+ Hashed and included in anonymous telemetry to track adapter adoption.
+ Pick a field that can uniquely identify one team/organization building with this adapter
+ """
+ return self.host
+
+ def _connection_keys(self):
+ """
+ List of keys to display in the `dbt debug` output.
+ """
+ return ('host', 'port', 'database', 'username')
+```
+
+
+
+There are a few things you can do to make it easier for users when connecting to your database:
+
+- Be sure to implement the Credentials' `_connection_keys` method shown above. This method will return the keys that should be displayed in the output of the `dbt debug` command. As a general rule, it's good to return all the arguments used in connecting to the actual database except the password (even optional arguments).
+- Create a `profile_template.yml` to enable configuration prompts for a brand-new user setting up a connection profile via the [`dbt init` command](/reference/commands/init). You will find more details in the following steps.
+- You may also want to define an `ALIASES` mapping on your Credentials class to include any config names you want users to be able to use in place of 'database' or 'schema'. For example if everyone using the MyAdapter database calls their databases "collections", you might do:
+
+
+
+```python
+@dataclass
+class MyAdapterCredentials(Credentials):
+ host: str
+ port: int = 1337
+ username: Optional[str] = None
+ password: Optional[str] = None
+
+ ALIASES = {
+ 'collection': 'database',
+ }
+```
+
+
+
+Then users can use `collection` OR `database` in their `profiles.yml`, `dbt_project.yml`, or `config()` calls to set the database.
+
+#### `ConnectionManager` class methods
+
+Once credentials are configured, you'll need to implement some connection-oriented methods. They are enumerated in the SQLConnectionManager docstring, but an overview will also be provided here.
+
+**Methods to implement:**
+
+- `open`
+- `get_response`
+- `cancel`
+- `exception_handler`
+- `standardize_grants_dict`
+
+##### `open(cls, connection)`
+
+`open()` is a classmethod that gets a connection object (which could be in any state, but will have a `Credentials` object with the attributes you defined above) and moves it to the 'open' state.
+
+Generally this means doing the following:
+ - if the connection is open already, log and return it.
+ - If a database needed changes to the underlying connection before re-use, that would happen here
+ - create a connection handle using the underlying database library using the credentials
+ - on success:
+ - set connection.state to `'open'`
+ - set connection.handle to the handle object
+ - this is what must have a `cursor()` method that returns a cursor!
+ - on error:
+ - set connection.state to `'fail'`
+ - set connection.handle to `None`
+ - raise a `dbt.exceptions.FailedToConnectException` with the error and any other relevant information
+
+For example:
+
+
+
+```python
+ @classmethod
+ def open(cls, connection):
+ if connection.state == 'open':
+ logger.debug('Connection is already open, skipping open.')
+ return connection
+
+ credentials = connection.credentials
+
+ try:
+ handle = myadapter_library.connect(
+ host=credentials.host,
+ port=credentials.port,
+ username=credentials.username,
+ password=credentials.password,
+ catalog=credentials.database
+ )
+ connection.state = 'open'
+ connection.handle = handle
+            return connection
+        except Exception as exc:
+            # On failure, mark the connection as failed and raise an error dbt understands
+            connection.state = 'fail'
+            connection.handle = None
+            raise dbt.exceptions.FailedToConnectException(str(exc))
+```
+
+
+
+##### `get_response(cls, cursor)`
+
+`get_response` is a classmethod that gets a cursor object and returns adapter-specific information about the last executed command. The return value should be an `AdapterResponse` object that includes items such as `code`, `rows_affected`, `bytes_processed`, and a summary `_message` for logging to stdout.
+
+
+
+```python
+    @classmethod
+    def get_response(cls, cursor) -> AdapterResponse:
+        code = cursor.sqlstate or "OK"
+        rows = cursor.rowcount
+        status_message = f"{code} {rows}"
+        return AdapterResponse(
+            _message=status_message,
+            code=code,
+            rows_affected=rows
+        )
+```
+
+
+
+##### `cancel(self, connection)`
+
+`cancel` is an instance method that gets a connection object and attempts to cancel any ongoing queries; how to do so is database dependent. Some databases don't support the concept of cancellation at all. In that case, the method can simply `pass`, and the adapter class should implement an `is_cancelable` classmethod that returns `False`; on Ctrl+C, connections may remain running. This method must be implemented carefully, as the affected connection will likely be in use in a different thread.
+
+
+
+```python
+    def cancel(self, connection):
+        tid = connection.handle.transaction_id()
+        sql = 'select cancel_transaction({})'.format(tid)
+        logger.debug("Cancelling query '{}' ({})".format(connection.name, tid))
+        _, cursor = self.add_query(sql, 'master')
+        res = cursor.fetchone()
+        logger.debug("Canceled query '{}': {}".format(connection.name, res))
+```
+
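+If your database can't cancel queries at all, a minimal sketch is to no-op `cancel` and mark the connection manager as not cancelable:
+
+```python
+    @classmethod
+    def is_cancelable(cls) -> bool:
+        # dbt will warn on Ctrl+C that in-flight queries may keep running
+        return False
+
+    def cancel(self, connection):
+        # nothing to do; see is_cancelable above
+        pass
+```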
+
+
+##### `exception_handler(self, sql, connection_name='master')`
+
+`exception_handler` is an instance method that returns a context manager that will handle exceptions raised by running queries, catch them, log appropriately, and then raise exceptions dbt knows how to handle.
+
+If you use the (highly recommended) `@contextmanager` decorator, you only have to wrap a `yield` inside a `try` block, like so:
+
+
+
+```python
+    @contextmanager
+    def exception_handler(self, sql: str):
+        try:
+            yield
+        except myadapter_library.DatabaseError as exc:
+            self.release()
+            logger.debug('myadapter error: {}'.format(str(exc)))
+            raise dbt.exceptions.DatabaseException(str(exc))
+        except Exception as exc:
+            logger.debug("Error running SQL: {}".format(sql))
+            logger.debug("Rolling back transaction.")
+            self.release()
+            raise dbt.exceptions.RuntimeException(str(exc))
+```
+
+
+
+##### `standardize_grants_dict(self, grants_table: agate.Table) -> dict`
+
+`standardize_grants_dict` is a method that returns the dbt-standardized grants dictionary, matching the way users configure grants in dbt. The input is the result of a `SHOW GRANTS ON {{model}}` call, loaded into an agate table.
+
+If any massaging of the agate table containing the results of `SHOW GRANTS ON {{model}}` can't easily be accomplished in SQL, it can be done here. For example, the SQL to show grants _should_ filter OUT any grants TO the current user/role (e.g. OWNERSHIP). If that's not possible in SQL, it can be done in this method instead.
+
+
+
+```python
+    @available
+    def standardize_grants_dict(self, grants_table: agate.Table) -> dict:
+        """
+        :param grants_table: An agate table containing the query result of
+            the SQL returned by get_show_grant_sql
+        :return: A standardized dictionary matching the `grants` config
+        :rtype: dict
+        """
+        grants_dict: Dict[str, List[str]] = {}
+        for row in grants_table:
+            grantee = row["grantee"]
+            privilege = row["privilege_type"]
+            if privilege in grants_dict.keys():
+                grants_dict[privilege].append(grantee)
+            else:
+                grants_dict.update({privilege: [grantee]})
+        return grants_dict
+```
+
+
+
+### Editing the adapter implementation
+
+Edit the connection manager at `myadapter/dbt/adapters/myadapter/impl.py`
+
+Very little is required to implement the adapter itself. On some adapters, you will not need to override anything. On others, you'll likely need to override some of the `convert_*` classmethods, or override the `is_cancelable` classmethod to return `False`.
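+
+For example, here is a sketch of what overriding one of the `convert_*` classmethods might look like, assuming your platform calls its string and datetime types `varchar` and `timestamp` (adjust the type names to your database):
+
+```python
+    @classmethod
+    def convert_text_type(cls, agate_table: agate.Table, col_idx: int) -> str:
+        # data type to use when dbt needs to create a string column (e.g. for seeds)
+        return "varchar"
+
+    @classmethod
+    def convert_datetime_type(cls, agate_table: agate.Table, col_idx: int) -> str:
+        return "timestamp"
+```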
+
+#### `datenow()`
+
+This classmethod provides the adapter's canonical date function. This is not used, but is still required on all adapters.
+
+
+
+```python
+    @classmethod
+    def date_function(cls):
+        return 'datenow()'
+```
+
+
+
+### Editing SQL logic
+
+dbt implements specific SQL operations using jinja macros. While reasonable defaults are provided for many such operations (like `create_schema`, `drop_schema`, `create_table`, etc), you may need to override one or more of these macros when building a new adapter.
+
+#### Required macros
+
+The following macros must be implemented, but you can override their behavior for your adapter using the "dispatch" pattern described below. Macros marked (required) do not have a valid default implementation, and are required for dbt to operate.
+
+- `alter_column_type` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L37-L55))
+- `check_schema_exists` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L43-L55))
+- `create_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L1-L9))
+- `drop_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L34-L42))
+- `drop_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L12-L20))
+- `get_columns_in_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L1-L8)) (required)
+- `list_relations_without_caching` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L58-L65)) (required)
+- `list_schemas` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L29-L40))
+- `rename_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L56-L65))
+- `truncate_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L45-L53))
+- `current_timestamp` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/freshness.sql#L1-L8)) (required)
+- `copy_grants`
+
+#### Adapter dispatch
+
+Most modern databases support a majority of the standard SQL spec. There are some databases that _do not_ support critical aspects of the SQL spec however, or they provide their own nonstandard mechanisms for implementing the same functionality. To account for these variations in SQL support, dbt provides a mechanism called [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch) for macros. With this feature, macros can be overridden for specific adapters. This makes it possible to implement high-level methods (like "create table") in a database-specific way.
+
+
+
+```jinja2
+
+{# dbt will call this macro by name, providing any arguments #}
+{% macro create_table_as(temporary, relation, sql) -%}
+
+ {# dbt will dispatch the macro call to the relevant macro #}
+ {{ return(
+ adapter.dispatch('create_table_as')(temporary, relation, sql)
+ ) }}
+{%- endmacro %}
+
+
+
+{# If no macro matches the specified adapter, "default" will be used #}
+{% macro default__create_table_as(temporary, relation, sql) -%}
+ ...
+{%- endmacro %}
+
+
+
+{# Example which defines special logic for Redshift #}
+{% macro redshift__create_table_as(temporary, relation, sql) -%}
+ ...
+{%- endmacro %}
+
+
+
+{# Example which defines special logic for BigQuery #}
+{% macro bigquery__create_table_as(temporary, relation, sql) -%}
+ ...
+{%- endmacro %}
+```
+
+
+
+The `adapter.dispatch()` macro takes a second argument, `packages`, which represents a set of "search namespaces" in which to find potential implementations of a dispatched macro. This allows users of community-supported adapters to extend or "shim" dispatched macros from common packages, such as `dbt-utils`, with adapter-specific versions in their own project or other installed packages. See:
+
+- "Shim" package examples: [`spark-utils`](https://github.com/dbt-labs/spark-utils), [`tsql-utils`](https://github.com/dbt-msft/tsql-utils)
+- [`adapter.dispatch` docs](/reference/dbt-jinja-functions/dispatch)
+
+#### Overriding adapter methods
+
+While much of dbt's adapter-specific functionality can be modified in adapter macros, it can also make sense to override adapter methods directly. In this example, assume that a database does not support a `cascade` parameter to `drop schema`. Instead, we can implement an approximation where we drop each relation and then drop the schema.
+
+
+
+```python
+    def drop_schema(self, relation: BaseRelation):
+        relations = self.list_relations(
+            database=relation.database,
+            schema=relation.schema
+        )
+        for rel in relations:
+            self.drop_relation(rel)
+        super().drop_schema(relation)
+```
+
+
+
+#### Grants Macros
+
+See [this GitHub discussion](https://github.com/dbt-labs/dbt-core/discussions/5468) for information on the macros required for `GRANT` statements.
+
+### Other files
+
+#### `profile_template.yml`
+
+In order to enable the [`dbt init` command](/reference/commands/init) to prompt users when setting up a new project and connection profile, you should include a **profile template**. The filepath needs to be `dbt/include/<adapter_name>/profile_template.yml`. It's possible to provide hints, default values, and conditional prompts based on connection methods that require different supporting attributes. Users will also be able to include custom versions of this file in their own projects, with fixed values specific to their organization, to support their colleagues when using your dbt adapter for the first time.
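+
+A minimal sketch, modeled on the adapter templates linked below (the `fixed` and `prompts` sections follow those examples; the specific fields and defaults are placeholders for your adapter):
+
+```yaml
+fixed:
+  type: myadapter
+prompts:
+  host:
+    hint: 'hostname of your instance'
+  port:
+    default: 1337
+    type: 'int'
+  username:
+    hint: 'dev username'
+  password:
+    hint: 'dev password'
+    hide_input: true
+  database:
+    hint: 'default database that dbt will build objects in'
+  schema:
+    hint: 'default schema that dbt will build objects in'
+  threads:
+    hint: '1 or more'
+    type: 'int'
+    default: 1
+```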
+
+See examples:
+
+- [dbt-postgres](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml)
+- [dbt-redshift](https://github.com/dbt-labs/dbt-redshift/blob/main/dbt/include/redshift/profile_template.yml)
+- [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/include/snowflake/profile_template.yml)
+- [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery/blob/main/dbt/include/bigquery/profile_template.yml)
+
+#### `__version__.py`
+
+To ensure that `dbt --version` reports the latest dbt Core version the adapter supports, be sure to include a `__version__.py` file. The filepath will be `dbt/adapters/<adapter_name>/__version__.py`. We recommend starting with the latest dbt Core version; as the adapter is made compatible with later versions, this file will need to be updated. For a sample file, check out this [example](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/adapters/snowflake/__version__.py).
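+
+The file itself is tiny; a sketch (the version string here is just an example):
+
+```python
+version = "1.7.0"
+```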
+
+Note that both of these files are included in the bootstrapped output of the `dbt-database-adapter-scaffold`, so if you used the scaffolding, they are already in place.
+
+## Test your adapter
+
+:::info
+
+Previously, we offered a packaged suite of tests for dbt adapter functionality: [`pytest-dbt-adapter`](https://github.com/dbt-labs/dbt-adapter-tests). We are deprecating that suite, in favor of the newer testing framework outlined in this document.
+
+:::
+
+This document has two sections:
+
+1. Refer to "About the testing framework" for a description of the standard framework that we maintain for using pytest together with dbt. It includes an example that shows the anatomy of a simple test case.
+2. Refer to "Testing your adapter" for a step-by-step guide for using our out-of-the-box suite of "basic" tests, which will validate that your adapter meets a baseline of dbt functionality.
+
+### Testing prerequisites
+
+- Your adapter must be compatible with dbt-core **v1.1** or newer
+- You should be familiar with **pytest**: [https://docs.pytest.org](https://docs.pytest.org)
+
+### About the testing framework
+
+dbt-core offers a standard framework for running pre-built functional tests, and for defining your own tests. The core testing framework is built using `pytest`, a mature and standard library for testing Python projects.
+
+The **[`tests` module](https://github.com/dbt-labs/dbt-core/tree/HEAD/core/dbt/tests)** within `dbt-core` includes basic utilities for setting up pytest + dbt. These are used by all "pre-built" functional tests, and make it possible to quickly write your own tests.
+
+Those utilities allow you to do three basic things:
+
+1. **Quickly set up a dbt "project."** Define project resources via methods such as `models()` and `seeds()`. Use `project_config_update()` to pass configurations into `dbt_project.yml`.
+2. **Define a sequence of dbt commands.** The most important utility is `run_dbt()`, which returns the [results](/reference/dbt-classes#result-objects) of each dbt command. It takes a list of CLI specifiers (subcommand + flags), as well as an optional second argument, `expect_pass=False`, for cases where you expect the command to fail.
+3. **Validate the results of those dbt commands.** For example, `check_relations_equal()` asserts that two database objects have the same structure and content. You can also write your own `assert` statements, by inspecting the results of a dbt command, or querying arbitrary database objects with `project.run_sql()`.
+
+You can see the full suite of utilities, with arguments and annotations, in [`util.py`](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/tests/util.py). You'll also see them crop up across a number of test cases. While all utilities are intended to be reusable, you won't need all of them for every test. In the example below, we'll show a simple test case that uses only a few utilities.
+
+#### Example: a simple test case
+
+This example will show you the anatomy of a test case using dbt + pytest. We will create reusable components, combine them to form a dbt "project", and define a sequence of dbt commands. Then, we'll use Python `assert` statements to ensure those commands succeed (or fail) as we expect.
+
+In ["Getting started running basic tests,"](#getting-started-running-basic-tests) we'll offer step-by-step instructions for installing and configuring `pytest`, so that you can run it on your own machine. For now, it's more important to see how the pieces of a test case fit together.
+
+This example includes a seed, a model, and two tests—one of which will fail.
+
+1. Define Python strings that will represent the file contents in your dbt project. Defining these in a separate file enables you to reuse the same components across different test cases. The pytest name for this type of reusable component is "fixture."
+
+
+
+```python
+# seeds/my_seed.csv
+my_seed_csv = """
+id,name,some_date
+1,Easton,1981-05-20T06:46:51
+2,Lillian,1978-09-03T18:10:33
+3,Jeremiah,1982-03-11T03:59:51
+4,Nolan,1976-05-06T20:21:35
+""".lstrip()
+
+# models/my_model.sql
+my_model_sql = """
+select * from {{ ref('my_seed') }}
+union all
+select null as id, null as name, null as some_date
+"""
+
+# models/my_model.yml
+my_model_yml = """
+version: 2
+models:
+  - name: my_model
+    columns:
+      - name: id
+        tests:
+          - unique
+          - not_null # this test will fail
+"""
+```
+
+
+
+2. Use the "fixtures" to define the project for your test case. These fixtures are always scoped to the **class**, where the class represents one test case—that is, one dbt project or scenario. (The same test case can be used for one or more actual tests, which we'll see in step 3.) Following the default pytest configurations, the file name must begin with `test_`, and the class name must begin with `Test`.
+
+
+
+```python
+import pytest
+from dbt.tests.util import run_dbt
+
+# our file contents
+from tests.functional.example.fixtures import (
+ my_seed_csv,
+ my_model_sql,
+ my_model_yml,
+)
+
+# class must begin with 'Test'
+class TestExample:
+    """
+    Methods in this class will be of two types:
+    1. Fixtures defining the dbt "project" for this test case.
+       These are scoped to the class, and reused for all tests in the class.
+    2. Actual tests, whose names begin with 'test_'.
+       These define sequences of dbt commands and 'assert' statements.
+    """
+
+    # configuration in dbt_project.yml
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {
+            "name": "example",
+            "models": {"+materialized": "view"}
+        }
+
+    # everything that goes in the "seeds" directory
+    @pytest.fixture(scope="class")
+    def seeds(self):
+        return {
+            "my_seed.csv": my_seed_csv,
+        }
+
+    # everything that goes in the "models" directory
+    @pytest.fixture(scope="class")
+    def models(self):
+        return {
+            "my_model.sql": my_model_sql,
+            "my_model.yml": my_model_yml,
+        }
+
+    # continues below
+```
+
+
+
+3. Now that we've set up our project, it's time to define a sequence of dbt commands and assertions. We define one or more methods in the same file, on the same class (`TestExample`), whose names begin with `test_`. These methods share the same setup (project scenario) from above, but they can be run independently by pytest—so they shouldn't depend on each other in any way.
+
+
+
+```python
+    # continued from above
+
+    # The actual sequence of dbt commands and assertions
+    # pytest will take care of all "setup" + "teardown"
+    def test_run_seed_test(self, project):
+        """
+        Seed, then run, then test. We expect one of the tests to fail
+        An alternative pattern is to use pytest "xfail" (see below)
+        """
+        # seed seeds
+        results = run_dbt(["seed"])
+        assert len(results) == 1
+        # run models
+        results = run_dbt(["run"])
+        assert len(results) == 1
+        # test tests
+        results = run_dbt(["test"], expect_pass=False)  # expect failing test
+        assert len(results) == 2
+        # validate that the results include one pass and one failure
+        result_statuses = sorted(r.status for r in results)
+        assert result_statuses == ["fail", "pass"]
+
+    @pytest.mark.xfail
+    def test_build(self, project):
+        """Expect a failing test"""
+        # do it all
+        results = run_dbt(["build"])
+```
+
+
+
+4. Our test is ready to run! The last step is to invoke `pytest` from your command line. We'll walk through the actual setup and configuration of `pytest` in the next section.
+
+
+
+```sh
+$ python3 -m pytest tests/functional/test_example.py
+=========================== test session starts ============================
+platform ... -- Python ..., pytest-..., pluggy-...
+rootdir: ...
+plugins: ...
+
+tests/functional/test_example.py .X [100%]
+
+======================= 1 passed, 1 xpassed in 1.38s =======================
+```
+
+
+
+You can find more ways to run tests, along with a full command reference, in the [pytest usage docs](https://docs.pytest.org/how-to/usage.html).
+
+We've found the `-s` flag (or `--capture=no`) helpful to print logs from the underlying dbt invocations, and to step into an interactive debugger if you've added one. You can also use environment variables to set [global dbt configs](/reference/global-configs/about-global-configs), such as `DBT_DEBUG` (to show debug-level logs).
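+
+For example, assuming the test file from the example above:
+
+```sh
+DBT_DEBUG=1 python3 -m pytest -s tests/functional/test_example.py
+```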
+
+### Testing your adapter
+
+Anyone who installs `dbt-core`, and wishes to define their own test cases, can use the framework presented in the first section. The framework is especially useful for testing standard dbt behavior across different databases.
+
+To that end, we have built and made available a [package of reusable adapter test cases](https://github.com/dbt-labs/dbt-core/tree/HEAD/tests/adapter), for creators and maintainers of adapter plugins. These test cases cover basic expected functionality, as well as functionality that frequently requires different implementations across databases.
+
+For the time being, this package is also located within the `dbt-core` repository, but separate from the `dbt-core` Python package.
+
+### Categories of tests
+
+In the course of creating and maintaining your adapter, it's likely that you will end up implementing tests that fall into three broad categories:
+
+1. **Basic tests** that every adapter plugin is expected to pass. These are defined in `tests.adapter.basic`. Given differences across data platforms, these may require slight modification or reimplementation. Significant overriding or disabling of these tests should only be done with good reason, since each represents basic functionality expected by dbt users. For example, if your adapter does not support incremental models, you should disable the test [by marking it with `skip` or `xfail`](https://docs.pytest.org/en/latest/how-to/skipping.html) (see the sketch after this list), as well as noting that limitation in any documentation, READMEs, and usage guides that accompany your adapter.
+
+2. **Optional tests**, for second-order functionality that is common across plugins, but not required for basic use. Your plugin can opt into these test cases by inheriting existing ones, or reimplementing them with adjustments. For now, this category includes all tests located outside the `basic` subdirectory. More tests will be added as we convert older tests defined on dbt-core and mature plugins to use the standard framework.
+
+3. **Custom tests**, for behavior that is specific to your adapter / data platform. Each data platform has its own specialties and idiosyncrasies. We encourage you to use the same `pytest`-based framework, utilities, and fixtures to write your own custom tests for functionality that is unique to your adapter.
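+
+For example, a sketch of disabling one of the basic test cases for a hypothetical adapter that lacks incremental materializations:
+
+```python
+import pytest
+from dbt.tests.adapter.basic.test_incremental import BaseIncremental
+
+
+# hypothetical: this platform has no incremental materialization support,
+# so skip the corresponding basic test case and document the limitation
+@pytest.mark.skip(reason="myadapter does not support incremental models")
+class TestIncrementalMyAdapter(BaseIncremental):
+    pass
+```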
+
+If you run into an issue with the core framework, or the basic/optional test cases—or if you've written a custom test that you believe would be relevant and useful for other adapter plugin developers—please open an issue or PR in the `dbt-core` repository on GitHub.
+
+### Getting started running basic tests
+
+In this section, we'll walk through the three steps to start running our basic test cases on your adapter plugin:
+
+1. Install dependencies
+2. Set up and configure pytest
+3. Define test cases
+
+### Install dependencies
+
+You should already have a virtual environment with `dbt-core` and your adapter plugin installed. You'll also need to install:
+
+- [`pytest`](https://pypi.org/project/pytest/)
+- [`dbt-tests-adapter`](https://pypi.org/project/dbt-tests-adapter/), the set of common test cases
+- (optional) [`pytest` plugins](https://docs.pytest.org/en/7.0.x/reference/plugin_list.html)--we'll use `pytest-dotenv` below
+
+Or, specify all dependencies in a requirements file (for example, `dev_requirements.txt`) like:
+
+
+```txt
+pytest
+pytest-dotenv
+dbt-tests-adapter
+```
+
+
+
+```sh
+pip install -r dev_requirements.txt
+```
+
+### Set up and configure pytest
+
+First, set yourself up to run `pytest` by creating a file named `pytest.ini` at the root of your repository:
+
+
+
+```ini
+[pytest]
+filterwarnings =
+    ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning
+    ignore:unclosed file .*:ResourceWarning
+env_files =
+    test.env # uses pytest-dotenv plugin
+    # this allows you to store env vars for database connection in a file named test.env
+    # rather than passing them in every CLI command, or setting in `PYTEST_ADDOPTS`
+    # be sure to add "test.env" to .gitignore as well!
+testpaths =
+    tests/functional # name per convention
+```
+
+
+
+Then, create a configuration file within your tests directory (`tests/conftest.py`). In it, you'll want to define all necessary profile configuration for connecting to your data platform in local development and continuous integration. We recommend setting these values with environment variables, since this file will be checked into version control.
+
+
+
+```python
+import pytest
+import os
+
+# Import the standard functional fixtures as a plugin
+# Note: fixtures with session scope need to be local
+pytest_plugins = ["dbt.tests.fixtures.project"]
+
+# The profile dictionary, used to write out profiles.yml
+# dbt will supply a unique schema per test, so we do not specify 'schema' here
+@pytest.fixture(scope="class")
+def dbt_profile_target():
+    return {
+        'type': '',
+        'threads': 1,
+        'host': os.getenv('HOST_ENV_VAR_NAME'),
+        'user': os.getenv('USER_ENV_VAR_NAME'),
+        ...
+    }
+```
+
+
+
+### Define test cases
+
+As in the example above, each test case is defined as a class, and has its own "project" setup. To get started, you can import all basic test cases and try running them without changes.
+
+
+
+```python
+import pytest
+
+from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations
+from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests
+from dbt.tests.adapter.basic.test_singular_tests_ephemeral import BaseSingularTestsEphemeral
+from dbt.tests.adapter.basic.test_empty import BaseEmpty
+from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral
+from dbt.tests.adapter.basic.test_incremental import BaseIncremental
+from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests
+from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols
+from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp
+from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod
+
+class TestSimpleMaterializationsMyAdapter(BaseSimpleMaterializations):
+    pass
+
+
+class TestSingularTestsMyAdapter(BaseSingularTests):
+    pass
+
+
+class TestSingularTestsEphemeralMyAdapter(BaseSingularTestsEphemeral):
+    pass
+
+
+class TestEmptyMyAdapter(BaseEmpty):
+    pass
+
+
+class TestEphemeralMyAdapter(BaseEphemeral):
+    pass
+
+
+class TestIncrementalMyAdapter(BaseIncremental):
+    pass
+
+
+class TestGenericTestsMyAdapter(BaseGenericTests):
+    pass
+
+
+class TestSnapshotCheckColsMyAdapter(BaseSnapshotCheckCols):
+    pass
+
+
+class TestSnapshotTimestampMyAdapter(BaseSnapshotTimestamp):
+    pass
+
+
+class TestBaseAdapterMethod(BaseAdapterMethod):
+    pass
+```
+
+
+
+Finally, run pytest:
+
+```sh
+python3 -m pytest tests/functional
+```
+
+### Modifying test cases
+
+You may need to make slight modifications in a specific test case to get it passing on your adapter. The mechanism to do this is simple: rather than simply inheriting the "base" test with `pass`, you can redefine any of its fixtures or test methods.
+
+For instance, on Redshift, we need to explicitly cast a column in the fixture input seed to use data type `varchar(64)`:
+
+
+
+```python
+import pytest
+from dbt.tests.adapter.basic.files import seeds_base_csv, seeds_added_csv, seeds_newcolumns_csv
+from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols
+
+# set the datatype of the name column in the 'added' seed so it
+# can hold the '_update' that's added
+schema_seed_added_yml = """
+version: 2
+seeds:
+  - name: added
+    config:
+      column_types:
+        name: varchar(64)
+"""
+
+class TestSnapshotCheckColsRedshift(BaseSnapshotCheckCols):
+    # Redshift defines the 'name' column such that it's not big enough
+    # to hold the '_update' added in the test.
+    @pytest.fixture(scope="class")
+    def models(self):
+        return {
+            "base.csv": seeds_base_csv,
+            "added.csv": seeds_added_csv,
+            "seeds.yml": schema_seed_added_yml,
+        }
+```
+
+
+
+As another example, the `dbt-bigquery` adapter asks users to "authorize" replacing a view with a table by supplying the `--full-refresh` flag. The reason: in the table materialization logic, a view by the same name must first be dropped; if the table query fails, the model will be missing.
+
+Knowing this possibility, the "base" test case offers a `require_full_refresh` switch on the `test_config` fixture class. For BigQuery, we'll switch it on:
+
+
+
+```python
+import pytest
+from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations
+
+class TestSimpleMaterializationsBigQuery(BaseSimpleMaterializations):
+    @pytest.fixture(scope="class")
+    def test_config(self):
+        # effect: add '--full-refresh' flag in requisite 'dbt run' step
+        return {"require_full_refresh": True}
+```
+
+
+
+It's always worth asking whether the required modifications represent gaps in perceived or expected dbt functionality. Are these simple implementation details, which any user of this database would understand? Are they limitations worth documenting?
+
+If, on the other hand, they represent poor assumptions in the "basic" test cases, which fail to account for a common pattern in other types of databases, please open an issue or PR in the `dbt-core` repository on GitHub.
+
+### Running with multiple profiles
+
+Some databases support multiple connection methods, which map to genuinely different functionality behind the scenes. For instance, the `dbt-spark` adapter supports connections to Apache Spark clusters _and_ Databricks runtimes, which support additional functionality out of the box, enabled by the Delta file format. One way to handle this is to add a `--profile` command-line option in your `conftest.py` and use it to pick among multiple profile targets, as in the example below.
+
+
+
+```python
+import os
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption("--profile", action="store", default="apache_spark", type=str)
+
+
+# Using @pytest.mark.skip_profile('apache_spark') uses the 'skip_by_profile_type'
+# autouse fixture below
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers",
+        "skip_profile(profile): skip test for the given profile",
+    )
+
+
+@pytest.fixture(scope="session")
+def dbt_profile_target(request):
+    profile_type = request.config.getoption("--profile")
+    if profile_type == "databricks_sql_endpoint":
+        target = databricks_sql_endpoint_target()
+    elif profile_type == "apache_spark":
+        target = apache_spark_target()
+    else:
+        raise ValueError(f"Invalid profile type '{profile_type}'")
+    return target
+
+
+def apache_spark_target():
+    return {
+        "type": "spark",
+        "host": "localhost",
+        ...
+    }
+
+
+def databricks_sql_endpoint_target():
+    return {
+        "type": "spark",
+        "host": os.getenv("DBT_DATABRICKS_HOST_NAME"),
+        ...
+    }
+
+
+@pytest.fixture(autouse=True)
+def skip_by_profile_type(request):
+    profile_type = request.config.getoption("--profile")
+    if request.node.get_closest_marker("skip_profile"):
+        for skip_profile_type in request.node.get_closest_marker("skip_profile").args:
+            if skip_profile_type == profile_type:
+                pytest.skip(f"skipped on '{profile_type}' profile")
+```
+
+
+
+If there are tests that _shouldn't_ run for a given profile, skip them with the custom `skip_profile` marker defined above:
+
+
+
+```python
+# Snapshots require access to the Delta file format, available on our Databricks connection,
+# so let's skip on Apache Spark
+@pytest.mark.skip_profile('apache_spark')
+class TestSnapshotCheckColsSpark(BaseSnapshotCheckCols):
+    @pytest.fixture(scope="class")
+    def project_config_update(self):
+        return {
+            "seeds": {
+                "+file_format": "delta",
+            },
+            "snapshots": {
+                "+file_format": "delta",
+            },
+        }
+```
+
+
+
+Finally:
+
+```sh
+python3 -m pytest tests/functional --profile apache_spark
+python3 -m pytest tests/functional --profile databricks_sql_endpoint
+```
+
+## Document a new adapter
+
+If you've already built and tested your adapter, it's time to document it so the dbt community knows that it exists and how to use it.
+
+### Making your adapter available
+
+Many community members maintain their adapter plugins under open source licenses. If you're interested in doing this, we recommend:
+
+- Hosting on a public git provider (for example, GitHub or Gitlab)
+- Publishing to [PyPI](https://pypi.org/)
+- Adding to the list of ["Supported Data Platforms"](/docs/supported-data-platforms#community-supported) (more info below)
+
+### General Guidelines
+
+To best inform the dbt community of the new adapter, you should contribute to dbt's open-source documentation site, which uses the [Docusaurus project](https://docusaurus.io/). This is the site you're currently on!
+
+### Conventions
+
+Each `.md` file you create needs a header as shown below. The document id will also need to be added to the config file: `website/sidebars.js`.
+
+```md
+---
+title: "Documenting a new adapter"
+id: "documenting-a-new-adapter"
+---
+```
+
+### Single Source of Truth
+
+We ask our adapter maintainers to use the [docs.getdbt.com repo](https://github.com/dbt-labs/docs.getdbt.com) (i.e. this site) as the single-source-of-truth for documentation rather than having to maintain the same set of information in three different places. The adapter repo's `README.md` and the data platform's documentation pages should simply link to the corresponding page on this docs site. Keep reading for more information on what should and shouldn't be included on the dbt docs site.
+
+### Assumed Knowledge
+
+To simplify things, assume the reader of this documentation already knows how both dbt and your data platform work. There's already great material out there for learning both dbt and the data platform. The documentation we're asking you to add should be what a user who is already proficient in both dbt and your data platform would need to know in order to use the two together. Effectively, that boils down to two things: how to connect, and how to configure.
+
+### Topics and Pages to Cover
+
+The following subjects need to be addressed across three pages of this docs site in order for your data platform to be listed in our documentation. After the corresponding pull request is merged, we ask that you link to these pages from your adapter repo's `README` as well as from your product documentation.
+
+To contribute, all you have to do is make the changes listed in the table below.
+
+| How To... | File to change within `/website/docs/` | Action | Info to Include |
+|----------------------|--------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Connect | `/docs/core/connect-data-platform/{MY-DATA-PLATFORM}-setup.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. |
+| Configure | `reference/resource-configs/{MY-DATA-PLATFORM}-configs.md` | Create | What options and configuration specific to your data platform do users need to know? e.g. table distribution and indexing options, column_quoting policy, which incremental strategies are supported |
+| Discover and Install | `docs/supported-data-platforms.md` | Modify | Is it a vendor- or community- supported adapter? How to install Python adapter package? Ideally with pip and PyPI hosted package, but can also use `git+` link to GitHub Repo |
+| Add link to sidebar | `website/sidebars.js` | Modify | Add the document id to the correct location in the sidebar menu |
+
+For example, say I want to document my new adapter, `dbt-ders`. For the "Connect" page, I would create a new Markdown file, `ders-setup.md`, and add it to the `/website/docs/core/connect-data-platform/` directory.
+
+### Example PRs to add new adapter documentation
+
+Below are some recent pull requests made by partners to document their data platform's adapter:
+
+- [TiDB](https://github.com/dbt-labs/docs.getdbt.com/pull/1309)
+- [SingleStore](https://github.com/dbt-labs/docs.getdbt.com/pull/1044)
+- [Firebolt](https://github.com/dbt-labs/docs.getdbt.com/pull/941)
+
+## Promote a new adapter
+
+The most important thing here is recognizing that people are successful in the community when they join, first and foremost, to engage authentically.
+
+What does authentic engagement look like? It’s challenging to define explicit rules. One good rule of thumb is to treat people with dignity and respect.
+
+Contributors to the community should think of contribution _as the end itself,_ not a means toward other business KPIs (leads, community members, etc.). [We are a mission-driven company.](https://www.getdbt.com/dbt-labs/values/) Some ways to know if you’re authentically engaging:
+
+- Is the engagement’s _primary_ purpose sharing knowledge and resources, or building brand engagement?
+- Imagine you didn’t work at the org you do — can you imagine yourself still writing this?
+- Is it written in formal / marketing language, or does it sound like you, the human?
+
+### Who should join the dbt Community Slack?
+
+- People who have insight into what it means to do hands-on [analytics engineering](https://www.getdbt.com/analytics-engineering/) work
+ The dbt Community Slack workspace is fundamentally a place for analytics practitioners to interact with each other — the closer the users are in the community to actual data/analytics engineering work, the more natural their engagement will be (leading to better outcomes for partners and the community).
+
+- DevRel practitioners with a strong analytics focus
+ DevRel practitioners often have a strong analytics background and a good understanding of the community. It’s essential to be sure they are focused on _contributing,_ not on driving community metrics for their partner org (such as signing people up for their Slack or events). The metrics will rise naturally through authentic engagement.
+
+- Founder and executives who are interested in directly engaging with the community
+ This is either incredibly successful or not at all depending on the profile of the founder. Typically, this works best when the founder has a practitioner-level of technical understanding and is interested in joining not to promote, but to learn and hear from users.
+
+- Software Engineers at partner products that are building and supporting integrations with either dbt Core or dbt Cloud
+ This is successful when the engineers are familiar with dbt as a product or at least have taken our training course. Slack is often the place where end-user questions and feedback are initially shared, so it is recommended that someone technical from the team be present. There are also a handful of channels aimed at those building integrations, which tend to be a font of knowledge.
+
+### Who might struggle in the dbt community
+
+- People in marketing roles
+ dbt Slack is not a marketing channel. Attempts to use it as such invariably fall flat and can even lead to people having a negative view of a product. This doesn’t mean that dbt can’t serve marketing objectives, but a long-term commitment to engagement is the only proven method to do this sustainably.
+
+- People in product roles
+ The dbt Community can be an invaluable source of feedback on a product. There are two primary ways this can happen — organically (community members proactively suggesting a new feature) and via direct calls for feedback and user research. Immediate calls for engagement must be done in your dedicated #tools channel. Direct calls should be used sparingly, as they can overwhelm more organic discussions and feedback.
+
+### Who is the audience for an adapter release?
+
+A new adapter is likely to drive huge community interest from several groups of people:
+ - People who are currently using the database that the adapter supports.
+ - People who may be adopting the database in the near future.
+ - People who are interested in dbt development in general.
+
+The database users will be your primary audience and the most helpful in achieving success. Engage them directly in the adapter’s dedicated Slack channel. If one does not exist already, reach out in #channel-requests, and we will get one made for you and include it in an announcement about new channels.
+
+The final group is where non-Slack community engagement becomes important. Twitter and LinkedIn are both great places to interact with a broad audience. A well-orchestrated adapter release can generate impactful and authentic engagement.
+
+### How to message the initial rollout and follow-up content
+
+Tell a story that engages dbt users and the community. Highlight new use cases and functionality unlocked by the adapter in a way that will resonate with each segment.
+
+- Existing users of your technology who are new to dbt
+ - Provide a general overview of the value dbt will deliver to your users. This can lean on dbt's messaging and talking points which are laid out in the [dbt viewpoint.](/community/resources/viewpoint)
+ - Give examples of a rollout that speaks to the overall value of dbt and your product.
+
+- Users who are already familiar with dbt and the community
+ - Consider unique use cases or advantages your adapter provides over existing adapters. Who will be excited about this?
+ - Contribute to the dbt Community and ensure that dbt users on your adapter are well supported (tutorial content, packages, documentation, etc).
+ - Example of a rollout that is compelling for those familiar with dbt: [Firebolt](https://www.linkedin.com/feed/update/urn:li:activity:6879090752459182080/)
+
+### Tactically manage distribution of content about new or existing adapters
+
+There are tactical pieces on how and where to share that help ensure success.
+
+- On slack:
+ - #i-made-this channel — this channel has a policy against “marketing” and “content marketing” posts, but it should be successful if you write your content with the above guidelines in mind. Even with that, it’s important to post here sparingly.
+ - Your own database / tool channel — this is where the people who have opted in to receive communications from you are, and it's always a great place to share things that are relevant to them.
+
+- On social media:
+ - Twitter
+ - LinkedIn
+ - Social media posts _from the author_ or an individual connected to the project tend to have better engagement than posts from a company or organization account.
+ - Ask your partner representative about:
+ - Retweets and shares from the official dbt Labs accounts.
+ - Flagging posts internally at dbt Labs to get individual employees to share.
+
+#### Measuring engagement
+
+You don’t need 1000 people in a channel to succeed, but you need at least a few active participants who can make it feel lived in. If you’re comfortable working in public, this could be members of your team, or it can be a few people who you know that are highly engaged and would be interested in participating. Having even 2 or 3 regulars hanging out in a channel is all that’s needed for a successful start and is, in fact, much more impactful than 250 people that never post.
+
+### How to announce a new adapter
+
+We’d recommend _against_ boilerplate announcements and encourage finding a unique voice. That being said, there are a couple of things that we’d want to include:
+
+- A summary of the value prop of your database / technology for users who aren’t familiar.
+- The personas that might be interested in this news.
+- A description of what the adapter _is_. For example:
+ > With the release of our new dbt adapter, you’ll be able to use dbt to model and transform your data in [name-of-your-org]
+- Particular or unique use cases or functionality unlocked by the adapter.
+- Plans for future / ongoing support / development.
+- The link to the documentation for using the adapter on the dbt Labs docs site.
+- An announcement blog.
+
+#### Announcing new release versions of existing adapters
+
+This can vary substantially depending on the nature of the release, but a good baseline is the type of release messages that [we put out in the #dbt-releases](https://getdbt.slack.com/archives/C37J8BQEL/p1651242161526509) channel.
+
+![Full Release Post](/img/adapter-guide/0-full-release-notes.png)
+
+Breaking this down:
+
+- Visually distinctive announcement - make it clear this is a release
+
+- Short written description of what is in the release
+
+- Links to additional resources
+
+- Implementation instructions:
+
+- Future plans
+
+- Contributor recognition (if applicable)
+
+
+
+## Verify a new adapter
+
+The very first data platform dbt supported was Redshift, followed quickly by Postgres ([dbt-core#174](https://github.com/dbt-labs/dbt-core/pull/174)). In 2017, back when dbt Labs (née Fishtown Analytics) was still a data consultancy, we added support for Snowflake and BigQuery. We also turned dbt's database support into an adapter framework ([dbt-core#259](https://github.com/dbt-labs/dbt-core/pull/259/)), and a plugin system a few years later. For years, dbt Labs specialized in those four data platforms and became experts in them. However, the surface area of all possible databases, their respective nuances, and keeping them up-to-date and bug-free is a Herculean and/or Sisyphean task that couldn't be done by a single person or even a single team! Enter the dbt community, which enables dbt Core to work on more than 30 different databases (32 as of Sep '22)!
+
+Free and open-source tools for the data professional are increasingly abundant. This is by-and-large a _good thing_; however, it requires due diligence that wasn't required in a paid-license, closed-source software world. Before taking a dependency on an open-source project, it is important to determine the answers to the following questions:
+
+1. Does it work?
+2. Does it meet my team's specific use case?
+3. Does anyone "own" the code, or is anyone liable for ensuring it works?
+4. Do bugs get fixed quickly?
+5. Does it stay up-to-date with new Core features?
+6. Is the usage substantial enough to self-sustain?
+7. What risks do I take on by taking a dependency on this library?
+
+These are valid, important questions to answer—especially given that `dbt-core` itself only put out its first stable release (major version v1.0) in December 2021! Indeed, up until now, the majority of new user questions in database-specific channels are some form of:
+
+- "How mature is `dbt-`? Any gotchas I should be aware of before I start exploring?"
+- "has anyone here used `dbt-` for production models?"
+- "I've been playing with `dbt-` -- I was able to install and run my initial experiments. I noticed that there are certain features mentioned on the documentation that are marked as 'not ok' or 'not tested'. What are the risks?
+I'd love to make a statement on my team to adopt DBT [sic], but I'm pretty sure questions will be asked around the possible limitations of the adapter or if there are other companies out there using dbt [sic] with Oracle DB in production, etc."
+
+There has been a tendency to trust the dbt Labs-maintained adapters over community- and vendor-supported adapters, but repo ownership is only one among many indicators of software quality. We aim to help our users feel well-informed as to the caliber of an adapter with a new program.
+
+### Verified by dbt Labs
+
+The adapter verification program aims to quickly indicate to users which adapters can be trusted to use in production. Previously, doing so was uncharted territory for new users and complicated making the business case to their leadership team. We plan to give quality assurances by:
+
+1. appointing a key stakeholder for the adapter repository,
+2. ensuring that the chosen stakeholder fixes bugs and cuts new releases in a timely manner (refer to the "Maintaining your new adapter" step for more information),
+3. demonstrating that it passes our adapter pytest suite tests, and
+4. assuring that it works for us internally and, ideally, for an existing team using the adapter in production.
+
+Every major & minor version of an adapter will be verified internally and given an official :white_check_mark: (custom emoji coming soon) on the ["Supported Data Platforms"](/docs/supported-data-platforms) page.
+
+### How to get an adapter verified?
+
+We envision that data platform vendors will be most interested in having their adapter versions verified; however, we are open to community adapter verification. If interested, please reach out either by email to `partnerships` at `dbtlabs.com` or by posting in the [#adapter-ecosystem Slack channel](https://getdbt.slack.com/archives/C030A0UF5LM).
+
+## Build a trusted adapter
+
+The Trusted adapter program exists to allow adapter maintainers to demonstrate to the dbt community that their adapter is trusted to be used in production.
+
+### What it means to be trusted
+
+By opting into the guidelines below, you agree to them, and we take you at your word. dbt Labs reserves the right to remove an adapter from the Trusted adapter list at any time, should any of the below guidelines not be met.
+
+### Feature Completeness
+
+To be considered for the Trusted Adapter program, the adapter must cover the essential functionality of dbt Core given below, with best effort given to support the entire feature set.
+
+Essential functionality includes (but is not limited to) the following features:
+
+- table, view, and seed materializations
+- dbt tests
+
+The adapter should have the required documentation for connecting to and configuring the data platform. The dbt docs site should be the single source of truth for this information, and these docs should be kept up-to-date.
+
+Proceed to the "Document a new adapter" step for more information.
+
+### Release Cadence
+
+Keeping an adapter up-to-date with dbt Core is an integral part of being a trusted adapter. Therefore, we ask that adapter maintainers:
+
+- Release new minor versions of the adapter, with all tests passing, within four weeks of dbt Core's release cut.
+- Release new major versions of the adapter, with all tests passing, within eight weeks of dbt Core's release cut.
+
+### Community Responsiveness
+
+On a best effort basis, active participation and engagement with the dbt Community across the following forums:
+
+- Being responsive to feedback and supporting user enablement in dbt Community’s Slack workspace
+- Responding with comments to issues raised in the public dbt adapter code repository
+- Merging in code contributions from community members as deemed appropriate
+
+### Security Practices
+
+Trusted adapters will not do any of the following:
+
+- Output to logs or files either access credentials for, or data from, the underlying data platform itself.
+- Make API calls other than those expressly required for using dbt features (adapters may not add additional logging)
+- Obfuscate code and/or functionality so as to avoid detection
+
+Additionally, to avoid supply-chain attacks:
+
+- Use an automated service to keep Python dependencies up-to-date (such as Dependabot or similar),
+- Publish directly to PyPI from the dbt adapter code repository by using a trusted CI/CD process (such as GitHub Actions)
+- Restrict admin access to both the respective code (GitHub) and package (PyPI) repositories
+- Identify and mitigate security vulnerabilities by using a static code analysis tool (such as Snyk) as part of a CI/CD process
+
+### Other considerations
+
+The adapter repository is:
+
+- open-source licensed,
+- published to PyPI, and
+- automatically tested against dbt Labs' provided adapter test suite.
+
+### How to get an adapter added to the Trusted list
+
+Open an issue on the [docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com) using the "Add adapter to Trusted list" template. In addition to contact information, it will ask you to confirm that you agree to the following:
+
+1. My adapter meets the guidelines given above.
+2. I will make a best, reasonable effort to ensure this continues to be the case.
+3. (checkbox) I acknowledge that dbt Labs reserves the right to remove an adapter from the Trusted adapter list at any time, should any of the above guidelines not be met.
+
+The approval workflow is as follows:
+
+1. Create and populate the issue from the template.
+2. dbt Labs will respond as quickly as possible (within four weeks at most, though likely faster).
+3. If approved, dbt Labs will create and merge a pull request to formally add the adapter to the list.
+
+### Getting help for my trusted adapter
+
+Ask your question in the #adapter-ecosystem channel of the dbt Community Slack.
diff --git a/website/docs/guides/airflow-and-dbt-cloud.md b/website/docs/guides/airflow-and-dbt-cloud.md
new file mode 100644
index 00000000000..a3ff59af14e
--- /dev/null
+++ b/website/docs/guides/airflow-and-dbt-cloud.md
@@ -0,0 +1,296 @@
+---
+title: Airflow and dbt Cloud
+id: airflow-and-dbt-cloud
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['dbt Cloud', 'Orchestration']
+level: 'Intermediate'
+recently_updated: true
+---
+
+## Introduction
+
+In some cases, [Airflow](https://airflow.apache.org/) may be the preferred orchestrator for your organization over working fully within dbt Cloud. There are a few reasons your team might be considering using Airflow to orchestrate your dbt jobs:
+
+- Your team is already using Airflow to orchestrate other processes
+- Your team needs to ensure that a [dbt job](https://docs.getdbt.com/docs/dbt-cloud/cloud-overview#schedule-and-run-dbt-jobs-in-production) kicks off before or after another process outside of dbt Cloud
+- Your team needs flexibility to manage more complex scheduling, such as kicking off one dbt job only after another has completed
+- Your team wants to own their own orchestration solution
+- You need code to work right now without starting from scratch
+
+### Prerequisites
+
+- [dbt Cloud Teams or Enterprise account](https://www.getdbt.com/pricing/) (with [admin access](https://docs.getdbt.com/docs/cloud/manage-access/enterprise-permissions)) in order to create a service token. Permissions for service tokens can be found [here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens).
+- A [free Docker account](https://hub.docker.com/signup) in order to sign in to Docker Desktop, which will be installed in the initial setup.
+- A local digital scratchpad for temporarily copy-pasting API keys and URLs
+
+### Airflow + dbt Core
+
+There are [so many great examples](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py) from GitLab through their open source data engineering work. This approach is especially appropriate if you are well-versed in Kubernetes, CI/CD, and Docker task management when building your Airflow pipelines. If this is you and your team, you’re in good hands reading through more details [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/).
+
+### Airflow + dbt Cloud API w/Custom Scripts
+
+This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available [here](https://registry.astronomer.io/providers/dbt%20Cloud/versions/latest).
+
+There are many different permutations of this over time:
+
+- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an airflow DAG based on [custom python API utilities](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py)
+- [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want
+- For more options, check out the [official dbt Docs](/docs/deploy/deployments#airflow) on the various ways teams are running dbt in airflow
+
+These solutions are great, but they can be difficult to trust as your team grows and the management of things like testing, job definitions, secrets, and pipelines increases past your team’s capacity. Roles become blurry (or were never clearly defined at the start!). Both data and analytics engineers start digging through custom logging within each other’s workflows to make heads or tails of where and what the issue really is. Not to mention that when the issue is found, it can be even harder to decide on the best path forward for safely implementing fixes. This complex workflow and unclear delineation of process management results in a lot of misunderstandings and wasted time just trying to get the process to work smoothly!
+
+
+In this guide, you'll learn how to:
+
+1. Create a working local Airflow environment
+2. Invoke a dbt Cloud job with Airflow (with proof!)
+3. Reuse tested and trusted Airflow code for your specific use cases
+
+You’ll also gain a better understanding of how this will:
+
+- Reduce the cognitive load when building and maintaining pipelines
+- Avoid dependency hell (think: `pip install` conflicts)
+- Implement better recoveries from failures
+- Define clearer workflows so that data and analytics engineers work better, together ♥️
+
+
+🙌 Let’s get started! 🙌
+
+## Install the Astro CLI
+
+Astro is a managed software service that includes key features for teams working with Airflow. In order to use Astro, we’ll install the Astro CLI, which will give us access to useful commands for working with Airflow locally. You can read more about Astro [here](https://docs.astronomer.io/astro/).
+
+In this example, we’re using Homebrew to install Astro CLI. Follow the instructions to install the Astro CLI for your own operating system [here](https://docs.astronomer.io/astro/install-cli).
+
+```bash
+brew install astro
+```
+
+
+
+## Install and start Docker Desktop
+
+Docker allows us to spin up an environment with all the apps and dependencies we need for the example.
+
+Follow the instructions [here](https://docs.docker.com/desktop/) to install Docker Desktop for your operating system. Once Docker is installed, ensure you have it up and running for the next steps.
+
+
+
+## Clone the airflow-dbt-cloud repository
+
+Open your terminal and clone the [airflow-dbt-cloud repository](https://github.com/sungchun12/airflow-dbt-cloud.git). This contains example Airflow DAGs that you’ll use to orchestrate your dbt Cloud job. Once cloned, navigate into the `airflow-dbt-cloud` project.
+
+```bash
+git clone https://github.com/sungchun12/airflow-dbt-cloud.git
+cd airflow-dbt-cloud
+```
+
+
+
+## Start the Docker container
+
+You can initialize an Astronomer project in an empty local directory using a Docker container, and then run your project locally using the `start` command.
+
+1. Run the following commands to initialize your project and start your local Airflow deployment:
+
+ ```bash
+ astro dev init
+ astro dev start
+ ```
+
+ When this finishes, you should see a message similar to the following:
+
+ ```bash
+ Airflow is starting up! This might take a few minutes…
+
+ Project is running! All components are now available.
+
+ Airflow Webserver: http://localhost:8080
+ Postgres Database: localhost:5432/postgres
+ The default Airflow UI credentials are: admin:admin
+   The default Postgres DB credentials are: postgres:postgres
+ ```
+
+2. Open the Airflow interface. Launch your web browser and navigate to the address for the **Airflow Webserver** from your output in Step 1.
+
+ This will take you to your local instance of Airflow. You’ll need to log in with the **default credentials**:
+
+ - Username: admin
+ - Password: admin
+
+ ![Airflow login screen](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-login.png)
+
+
+
+## Create a dbt Cloud service token
+
+Create a service token from within dbt Cloud using the instructions [found here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Ensure that you save a copy of the token, as you won't be able to access it later. In this example we use `Account Admin`, but you can also use `Job Admin` for token permissions.
+
+
+
+## Create a dbt Cloud job
+
+In your dbt Cloud account create a job, paying special attention to the information in the bullets below. Additional information for creating a dbt Cloud job can be found [here](/guides/bigquery).
+
+- Configure the job with the commands that you want to include when this job kicks off. Airflow will refer to the job's configuration rather than having the commands explicitly coded in the Airflow DAG, and the job will run a set of commands rather than a single command.
+- Ensure that the schedule is turned **off** since we'll be using Airflow to kick things off.
+- Once you hit `save` on the job, make sure you copy the URL and save it for referencing later. The URL will look similar to this:
+
+```html
+https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/
+```
+
+
+
+## Add your dbt Cloud API token as a secure connection
+
+
+
+Now you have all the working pieces to get up and running with Airflow + dbt Cloud. Let's dive into making this all work together. We will **set up a connection** and **run a DAG in Airflow** that kicks off a dbt Cloud job.
+
+1. Navigate to Admin and click on **Connections**
+
+ ![Airflow connections menu](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-connections-menu.png)
+
+2. Click on the `+` sign to add a new connection, then click on the drop down to search for the dbt Cloud Connection Type
+
+ ![Create connection](/img/guides/orchestration/airflow-and-dbt-cloud/create-connection.png)
+
+ ![Connection type](/img/guides/orchestration/airflow-and-dbt-cloud/connection-type.png)
+
+3. Add in your connection details and your default dbt Cloud account id. This is found in your dbt Cloud URL after the accounts route section (`/accounts/{YOUR_ACCOUNT_ID}`), for example the account with id 16173 would see this in their URL: `https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/`
+
+![dbt Cloud connection details in Airflow](https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2)
+
+## Add your `job_id` and `account_id` config details to the python file
+
+Add your `job_id` and `account_id` config details to the Python file [dbt_cloud_provider_eltml.py](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/dags/dbt_cloud_provider_eltml.py) in the repository you cloned earlier.
+
+1. You’ll find these details within the dbt Cloud job URL, see the comments in the code snippet below for an example.
+
+ ```python
+ # dbt Cloud Job URL: https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/
+ # account_id: 16173
+ #job_id: 65767
+
+ # line 28
+ default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173},
+
+ trigger_dbt_cloud_job_run = DbtCloudRunJobOperator(
+ task_id="trigger_dbt_cloud_job_run",
+ job_id=65767, # line 39
+ check_interval=10,
+ timeout=300,
+ )
+ ```
+
+2. Turn on the DAG and verify the job succeeded after running. Note: the screenshots below are taken from different job runs, but the user experience is consistent.
+
+   ![Airflow DAG run](https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy)
+
+ ![Airflow DAG](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-dag.png)
+
+ ![Task run instance](/img/guides/orchestration/airflow-and-dbt-cloud/task-run-instance.png)
+
+   ![dbt Cloud job run triggered from Airflow](https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS)
+
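+For reference, here's roughly what a minimal DAG using the dbt Cloud provider looks like once these values are filled in. This is an illustrative sketch; the [dbt_cloud_provider_eltml.py](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/dags/dbt_cloud_provider_eltml.py) file in the cloned repo is the version you should actually run, and the DAG id and scheduling arguments below are assumptions:
+
+```python
+from datetime import datetime
+
+from airflow import DAG
+from airflow.providers.dbt.cloud.operators.dbt import DbtCloudRunJobOperator
+
+with DAG(
+    dag_id="dbt_cloud_job_example",
+    start_date=datetime(2023, 1, 1),
+    schedule_interval=None,
+    catchup=False,
+    # The connection id and account id you configured earlier
+    default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173},
+) as dag:
+    trigger_dbt_cloud_job_run = DbtCloudRunJobOperator(
+        task_id="trigger_dbt_cloud_job_run",
+        job_id=65767,  # job_id from your dbt Cloud job URL
+        check_interval=10,
+        timeout=300,
+    )
+```
+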
+## How do I rerun the dbt Cloud job and downstream tasks in my pipeline?
+
+If you have worked with dbt Cloud before, you have likely encountered cases where a job fails. In those cases, you have likely logged into dbt Cloud, investigated the error, and then manually restarted the job.
+
+This section of the guide will show you how to restart the job directly from Airflow. This will specifically run *just* the `trigger_dbt_cloud_job_run` and downstream tasks of the Airflow DAG and not the entire DAG. If only the transformation step fails, you don’t need to re-run the extract and load processes. Let’s jump into how to do that in Airflow.
+
+1. Click on the task
+
+ ![Task DAG view](/img/guides/orchestration/airflow-and-dbt-cloud/task-dag-view.png)
+
+2. Clear the task instance
+
+ ![Clear task instance](/img/guides/orchestration/airflow-and-dbt-cloud/clear-task-instance.png)
+
+ ![Approve clearing](/img/guides/orchestration/airflow-and-dbt-cloud/approve-clearing.png)
+
+3. Watch it rerun in real time
+
+ ![Re-run](/img/guides/orchestration/airflow-and-dbt-cloud/re-run.png)
+
+## Cleaning up
+
+At the end of this guide, make sure you shut down your Docker containers. When you're done using Airflow, use the following command to stop the containers:
+
+```bash
+$ astro dev stop
+
+[+] Running 3/3
+ ⠿ Container airflow-dbt-cloud_e3fe3c-webserver-1 Stopped 7.5s
+ ⠿ Container airflow-dbt-cloud_e3fe3c-scheduler-1 Stopped 3.3s
+ ⠿ Container airflow-dbt-cloud_e3fe3c-postgres-1 Stopped 0.3s
+```
+
+To verify that the deployment has stopped, use the following command:
+
+```bash
+astro dev ps
+```
+
+This should give you an output like this:
+
+```bash
+Name State Ports
+airflow-dbt-cloud_e3fe3c-webserver-1 exited
+airflow-dbt-cloud_e3fe3c-scheduler-1 exited
+airflow-dbt-cloud_e3fe3c-postgres-1 exited
+```
+
+
+
+## Frequently asked questions
+
+### How can we run specific subsections of the dbt DAG in Airflow?
+
+Because of the way we configured the dbt Cloud job in Airflow, you can leave the job definition to your analytics engineers in dbt Cloud. If, for example, you need to run hourly-tagged models every hour and daily-tagged models daily, you can create jobs like `Hourly Run` or `Daily Run` and use the commands `dbt run -s tag:hourly` and `dbt run -s tag:daily` within each, respectively. You only need to grab your dbt Cloud `account_id` and `job_id`, configure them in an Airflow DAG with the code provided, and you're on your way. See more [node selection options](/reference/node-selection/syntax).
+
+### How can I re-run models from the point of failure?
+
+You may want to parse the dbt DAG in Airflow to get the benefit of re-running from the point of failure. However, when you have hundreds of models expanded out in your Airflow DAG, it becomes unwieldy for diagnosis and rerunning because of the overhead that comes with creating such an expansive DAG.
+
+You can’t re-run from failure natively in dbt Cloud today (feature coming!), but you can use a custom rerun parser.
+
+Using a simple Python script coupled with the dbt Cloud provider, you can:
+
+- Avoid managing artifacts in a separate storage bucket (dbt Cloud does this for you)
+- Avoid building your own parsing logic
+- Get clear logs on what models you're rerunning in dbt Cloud (without hard coding step override commands)
+
+Watch the video below to see how it works!
+
+
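+One possible shape for such a script, assuming the dbt Cloud v2 API (this is an illustrative sketch, not necessarily the exact script demonstrated in the video, and the ids below are placeholders):
+
+```python
+import os
+
+import requests
+
+ACCOUNT_ID = 16173      # replace with your dbt Cloud account id
+JOB_ID = 65767          # replace with your dbt Cloud job id
+FAILED_RUN_ID = 123456  # replace with the run you want to recover from
+
+BASE_URL = f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}"
+HEADERS = {"Authorization": f"Token {os.environ['DBT_CLOUD_API_TOKEN']}"}
+
+# dbt Cloud already stores run artifacts, so pull run_results.json for the failed run.
+run_results = requests.get(
+    f"{BASE_URL}/runs/{FAILED_RUN_ID}/artifacts/run_results.json", headers=HEADERS
+).json()
+
+# Collect every node that errored, failed a test, or was skipped downstream.
+failed_nodes = [
+    result["unique_id"].split(".")[-1]
+    for result in run_results["results"]
+    if result["status"] in ("error", "fail", "skipped")
+]
+
+# Kick off the same job again, overriding the steps to rebuild only those nodes.
+requests.post(
+    f"{BASE_URL}/jobs/{JOB_ID}/run/",
+    headers=HEADERS,
+    json={
+        "cause": "Rerun from point of failure",
+        "steps_override": [f"dbt build --select {' '.join(failed_nodes)}"],
+    },
+)
+```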
+
+### Should Airflow run one big dbt job or many dbt jobs?
+
+Overall, we recommend being as purposeful and minimalistic as you can. dbt manages all of the dependencies between models and orchestrates running those dependencies in order, which in turn has benefits for warehouse processing efficiency.
+
+### We want to kick off our dbt jobs after our ingestion tool (such as Fivetran) / data pipelines are done loading data. Any best practices around that?
+
+Our friends at Astronomer answer this question with an example [here](https://registry.astronomer.io/dags/fivetran-dbt-cloud-census).
+
+### How do you set up a CI/CD workflow with Airflow?
+
+Check out these two resources for accomplishing your own CI/CD pipeline:
+
+- [Continuous Integration with dbt Cloud](/docs/deploy/continuous-integration)
+- [Astronomer's CI/CD Example](https://docs.astronomer.io/software/ci-cd/#example-cicd-workflow)
+
+### Can dbt dynamically create tasks in the DAG like Airflow can?
+
+We prefer to keep models bundled vs. unbundled. You can go this route, but if you have hundreds of dbt models, it's more effective to let the dbt Cloud job handle the models and dependencies. Bundling makes for clearer observability when things go wrong - we've seen more success in being able to clearly see issues in a bundled dbt Cloud job than in combing through the nodes of an expansive Airflow DAG. If you still have a use case for this level of control, our friends at Astronomer answer this question [here](https://www.astronomer.io/blog/airflow-dbt-1/)!
+
+### Can you trigger notifications if a dbt job fails with Airflow? Is there any way to access the status of the dbt Job to do that?
+
+Yes, either through [Airflow's email/Slack](https://www.astronomer.io/guides/error-notifications-in-airflow/) functionality by itself or combined with [dbt Cloud's notifications](/docs/deploy/job-notifications), which support email and Slack notifications.
+
+### Are there decision criteria for how to best work with dbt Cloud and Airflow?
+
+Check out this deep dive into planning your dbt Cloud + Airflow implementation [here](https://www.youtube.com/watch?v=n7IIThR8hGk)!
diff --git a/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md b/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md
deleted file mode 100644
index 844c895af98..00000000000
--- a/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md
+++ /dev/null
@@ -1,67 +0,0 @@
----
-title: "dbt Cloud environment best practices"
-id: 1-env-guide-overview
-description: Learn how to configure environments in dbt Cloud.
-displayText: "dbt Cloud environment best practices"
-hoverSnippet: Learn how to configure environments in dbt Cloud.
----
-
-> *How do I manage environments in my dbt Cloud project? How many do I need?*
->
-> *How does my structure map to environments in dbt Cloud?*
->
-> *What do git branches have to do with my dbt Cloud environments?*
->
-
-If these questions keep you up at night, you’ve come to the right place! When it comes to managing your dbt Cloud environments, there is not a one-size-fits-all solution for all teams. In this guide we’ll walk you through a few environment architecture options for dbt Cloud that we’d recommend, and hopefully you find an option that works for you.
-
-## Learning goals
-
-This guide has three main goals:
-
-- Provide our recommendations on managing dbt Cloud environments
-- Illustrate these recommendations with comprehensive examples
-- At each stage, explain *why* we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs
-
-:::info
-☁️ This guide focuses on architecture for **dbt Cloud**. However, similar principles apply for developers using dbt Core. Before diving into this guide we recommend taking a look at our **[dbt Cloud environments](/docs/dbt-cloud-environments)** page for more context.
-
-:::
-
-### How many environments do I really need?
-
-Environments define the way that dbt will execute your code, including:
-
-- The **version of dbt** that will run.
-- The **version of your code** to be executed.
-- The **connection information** for your warehouse.
-- In dbt Cloud, there are **two types of environments:**
- - **Development** — the environment settings in which you work in the IDE on a development branch.
- - **Deployment** — the environment settings in which a dbt Cloud job runs.
-
-In this guide, we’re going to focus on **deployment environments**, which determine how your project is executed when a **dbt Cloud job executes**. When using both approaches, make sure to designate one environment as "Production." This will allow you to use features such as dbt Explorer and cross-project references. Refer to [Set product environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) for details.
-
-Depending on your git workflow and testing strategy, you'll be choosing between one deployment environment or many deployment environments. We provide a high-level overview of how these two deployment strategies work here, but use each section of this guide to get a deep-dive into how these setups differ.
-
-| Setup option | Works well if you | Relative complexity level |
-| --- | --- | --- |
-| One deployment environment | - only scheduled runs for one set of data objects
- development branches are merged directly to main | Low |
-| Many deployment environments | - feature branches move through several promotion stages | High |
-
-### TL;DR — One deployment environment
-
-We usually recommended folks start with the basics; having one deployment environment is usually the simplest and most maintainable approach to start. This approach works well if:
-
-- You only need to have **scheduled jobs running in a single environment** within your data warehouse.
-- You use a **single primary branch** and follow a direct promotion (**Dev —> Prod**) strategy
-
-With this option, your production jobs and your [Slim CI jobs](/docs/deploy/continuous-integration) that ensure code integrity are managed within one single deployment environment.
-
-### TL;DR — Many deployment environments
-This approach adds a bit more complexity and may slow down the development process, but adds a layer of security that can be worth the tradeoff. This approach works well if:
-
-- Your organization maintains **several long-lived git branches** to control how and when changes are tested and promoted to production.
- - Some orgs follow a **Dev —> QA —> Prod release cycle** — if that sounds like your org, this approach is probably right for you.
-- The **output of your dbt project is an input to other systems** and you need to test and validate many changes on a stable, long-lived staging dataset in a pre-production environment.
-
-The two options are explored in more detail in the following sections, including the benefits, trade-offs, the steps required to implement the setup in dbt Cloud.
diff --git a/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md b/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md
deleted file mode 100644
index 89bb05e7c75..00000000000
--- a/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md
+++ /dev/null
@@ -1,61 +0,0 @@
----
-title: "One deployment environment"
-id: 2-one-deployment-environment
-description: Learn how to configure a single deployment environment setup in dbt Cloud.
-displayText: "dbt Cloud environment best practices"
-hoverSnippet: Learn how to configure a single deployment environment setup in dbt Cloud.
----
-import ExpNote from '/snippets/_explorer-beta-note.md';
-
-
-## What this looks like
-
-1. You have a **single *development* environment** where dbt users can access the dbt Cloud IDE and make changes to their code on feature branches created off of your default branch in your repository (most often the `main` branch).
-2. You have a **single *deployment* environment** (let’s call it “Production”) where your scheduled jobs run referencing the `main` branch.
-
-
-
-3. You also have a [**Slim CI job**](/docs/deploy/continuous-integration) that kicks off anytime you open a PR to merge a feature branch into `main`. This Slim CI job can run in your dbt “Production” environment.
-
-:::info
-
-☁️ Slim CI jobs run in a dedicated custom schema for each PR, so there will no collision with your production schemas.
-
-:::
-
-
-
-### Git workflow
-
-
-
-
-1. In the dbt Cloud IDE, developers work on feature branches, created from the `main` branch (`feature_a`, `feature_b`, `feature_c` above)
-2. When code is ready, developer opens a PR to merge feature branch into `main`
-3. [**Slim CI Job**](/docs/deploy/continuous-integration) automatically kicks off, and tests the changes made in the PR
-4. When Slim CI Job is successful and team is ready to deploy changes to Production, the PR is merged directly into the `main` branch. The next time a production job runs, these changes will be incorporated and executed.
-
-### dbt Cloud setup
-
-1. Create your [**development environment**](/docs/dbt-cloud-environments) to power the dbt Cloud IDE. No extra customization needed!
-2. Create your **[production deployment environment](/docs/deploy/deploy-environments)**.
-3. Define your **dbt Cloud jobs** in the production deployment environment from step 2.
- 1. **Production job(s)**: You will need to set up **at least one scheduled job** that deploys your project to your production databases/schemas. You may create multiple jobs based on your business SLAs.
- 2. **Slim CI Job**: Unlike the production jobs, which are triggered via the scheduler, this job will be triggered when PRs are opened in your repository. Refer to [Slim CI jobs](/docs/deploy/slim-ci-jobs) for details.
-
-
-### When this works well
-
-This approach is recommended for most use cases because it enables you to quickly and safely implement code changes in the production environment. It also gives developers the confidence to trust and rely on these changes. With this option, multiple developers can easily contribute to and collaborate on the same codebase with confidence.
-
-:::info
-💡 Check out [Sunrun's Coalesce 2022 talk](https://www.youtube.com/watch?v=vmBAO2XN-fM) on Automating CI/CD in dbt Cloud, where they simplified their CI/CD process from several long-lived branches to a single long-lived main branch with feature branches.
-
-:::
-
-### When this doesn’t work so well
-
-- You have a **formal QA process** before merging code into production.
-- You want to **control when features are released** to production.
-- You need to have scheduled **jobs running in many environments** due to dependencies on outside systems.
- - e.g. Your organization has many applications that consume and test data changes in a lower non-Production environment before changes should be promoted to Production.
diff --git a/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md b/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md
deleted file mode 100644
index cb882d4ac1b..00000000000
--- a/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md
+++ /dev/null
@@ -1,77 +0,0 @@
----
-title: "Many deployment environments"
-id: 3-many-deployment-environments
-description: Learn how to configure a many deployment environment setup in dbt Cloud.
-displayText: "dbt Cloud environment best practices"
-hoverSnippet: Learn how to configure a many deployment environment setup in dbt Cloud.
----
-import ExpNote from '/snippets/_explorer-beta-note.md';
-
-## What this looks like
-
-1. You have a **single *development* environment** where dbt users can access the dbt Cloud IDE and make changes to their code. However, you’ll want to update the **[custom branch settings](faqs/Environments/custom-branch-settings)** to ensure that developers create feature branches off of the a non-production branch. For this example, we’ll refer to this as the `qa` branch.
-2. You have a **QA deployment environment**, running scheduled jobs from the `qa` branch that deploys your dbt project to a pre-production warehouse location.
-3. You have a **Production deployment environment,** running scheduled jobs from the `main` branch that deploys your dbt project to your production warehouse location.
-
-
-
-4. You have **multiple Slim CI jobs** (one in each deployment environment) to ensure changes to each branch are tested.
-
-
-
-### Git workflow
-
-
-
-1. In the dbt Cloud IDE, developers work on feature branches, **created from the `qa` branch** (`feature_a`, `feature_b`, `feature_c` above).
-2. When code is ready, developer opens a PR to merge feature branch into `qa`.
-3. The **first Slim CI Job** automatically kicks off to test the changes introduced in the PR. This job will *defer to a regularly-scheduled job in the QA environment* and run in the QA deployment environment.
-4. When **Slim CI Job is successful** and team is ready to deploy changes, the **PR is merged into `qa`.**
-5. Scheduled jobs run in the QA deployment environment, running on `qa` branch to ensure the new changes work as intended.
-6. When **all feature branches** for a given release (e.g. sprint) have been **successfully merged** to `qa` and are **running without error** in the QA deployment environment, a team member opens a **PR to merge `qa` → `main`.**
-7. The **second Slim CI Job** automatically kicks off to test changes in PR. This job will *defer to a regularly-scheduled job in the Production environment* and run in the Production deployment environment.
-8. When **second Slim CI Job** is successful and team is ready to deploy changes, the **PR is merged into `main`**.
-9. Monitor scheduled jobs in the Production deployment environment that are running on `main` branch. Voila! All changes are released and ready for your stakeholders.
-
-:::info
-💡 Considering a different branching strategy that involves cherry picking? [Maybe reconsider!](https://docs.getdbt.com/blog/the-case-against-git-cherry-picking)
-
-:::
-
-### dbt Cloud setup
-
-1. Create your [**development environment**](/docs/dbt-cloud-environments) to power the dbt Cloud IDE.
-
- Here, we’ll set a **custom branch** so that users in the IDE create their feature branches from `qa` instead of `main`. Click **Only run on a custom branch** in **General settings**, enter `qa` into **Custom Branch.**
-
-2. Set up your **QA [deployment environment](/docs/deploy/deploy-environments)**
-
- Here, we’ll apply the same custom branch settings as the development environment in Step 1. All scheduled jobs in the QA deployment environment will use the code from the `qa` branch during execution.
-
-3. **Define QA jobs**
- 1. **QA job(s)**: You’ll want to create at least one scheduled job, running on a roughly daily cadence. This will allow us to make sure all the code executes without error before you release it to production, and will also power the first Slim CI job.
- 2. **Slim CI Job**: As above, this job will be triggered when PRs are opened in your repository. Enable this option by selecting **Run on Pull Requests?** under the **Continuous Integration(CI)** tab under the **Triggers** section. Since we’re using the custom branch setting in the QA environment, you'll also want to be sure to select the second option **Run only on Custom Branch** (selected by default) — this means that only PRs created against the `qa` branch will trigger this job, rather than any PR at all.
-
- This job will also need to defer to one of the QA jobs created in step 3a. This enables the use of the `state` modifier in your selection syntax to only run changes introduced by your PR.
-
-4. Set up your **Production [deployment environment](/docs/deploy/deploy-environments)**
-
- Here, we’ll *also* use the same custom branch settings as the other environments, but set the custom branch as `main`. Even thought the `main` branch is the default, setting this value enables us to properly set up the CI Job in the next step.
-
-5. **Define production jobs**
- 1. **Production job(s)**: You will need to set up at least one scheduled job that deploys your project to your production databases/schemas. You may create multiple jobs based on your business SLAs.
- 2. **Production Slim CI Job**: As above, this job will be triggered when PRs are opened in your repository. Enable this option by selecting **Run on Pull Requests?** under the **Continuous Integration(CI)** tab under the **Triggers** section. Since we’re using the custom branch setting in the QA environment, we’ll also want to select the second option **Run only on Custom Branch** — this means that only PRs created against the `main` branch will trigger this job, rather than any PR at all.
-
- This job will also need to defer to one of the QA jobs created in step 5a. This enables the use of the `state` modifier in your selection syntax to only run changes introduced by your PR.
-
-### When this works well
-
-This approach works well when it’s critical to **apply user acceptance and integration testing to your project in a pre-production environment**. This approach allows you to have scheduled jobs running in **many environments** on your data warehouse.
-
-### When this doesn’t work so well
-
-This approach may slow down the time it takes to get new feature into production, since it requires additional steps in the deployment process and additional branches to maintain. Keep in mind that adding complexity to your deployment process might cause some slowdown in your release cycle.
-
-## Conclusion
-
-While there’s no single correct answer to how to setup your dbt Cloud environments, they are flexible enough to enable just about any code promotion workflow your organization uses. We would love to hear how you’ve set up your deployment infrastructure in dbt Cloud!
diff --git a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md b/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md
deleted file mode 100644
index 22f8e36190a..00000000000
--- a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md
+++ /dev/null
@@ -1,12 +0,0 @@
----
-title: Now it's your turn
-id: 6-how-we-style-conclusion
----
-
-## BYO Styles
-
-Now that you've seen how we style our dbt projects, it's time to build your own. Feel free to copy this guide and use it as a template for your own project. If you do, we'd love to hear about it! Reach out to us on [the Community Forum](https://discourse.getdbt.com/c/show-and-tell/22) or [Slack](https://www.getdbt.com/community) to share your style guide. We recommend co-locating your style guide with your code to make sure contributors can easily follow it. If you're using GitHub, you can add your style guide to your repository's wiki, or include it in your README.
-
-## Pre-commit hooks
-
-Lastly, to ensure your style guide's automated rules are being followed without additional mental overhead to your team, you can use [pre-commit hooks](https://pre-commit.com/) to automatically check your code for style violations (and often fix them automagically) before it's committed. This is a great way to make sure your style guide is followed by all contributors. We recommend implementing this once you've settled on and published your style guide, and your codebase is conforming to it. This will ensure that all future commits follow the style guide. You can find an excellent set of open source pre-commit hooks for dbt from the community [here in the dbt-checkpoint project](https://github.com/dbt-checkpoint/dbt-checkpoint).
diff --git a/website/docs/quickstarts/bigquery-qs.md b/website/docs/guides/bigquery-qs.md
similarity index 98%
rename from website/docs/quickstarts/bigquery-qs.md
rename to website/docs/guides/bigquery-qs.md
index 84e3b3ae545..c1f632f0621 100644
--- a/website/docs/quickstarts/bigquery-qs.md
+++ b/website/docs/guides/bigquery-qs.md
@@ -1,10 +1,12 @@
---
title: "Quickstart for dbt Cloud and BigQuery"
id: "bigquery"
-time_to_complete: '30 minutes'
-platform: 'dbt-cloud'
+# time_to_complete: '30 minutes' commenting out until we test
+level: 'Beginner'
icon: 'bigquery'
hide_table_of_contents: true
+tags: ['BigQuery', 'dbt Cloud','Quickstart']
+recently_updated: true
---
## Introduction
@@ -33,8 +35,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen
### Related content
- Learn more with [dbt Courses](https://courses.getdbt.com/collections)
-- [dbt Cloud CI job](/docs/deploy/continuous-integration)
-- [Job triggers](/docs/deploy/job-triggers)
+- [CI jobs](/docs/deploy/continuous-integration)
+- [Deploy jobs](/docs/deploy/deploy-jobs)
- [Job notifications](/docs/deploy/job-notifications)
- [Source freshness](/docs/deploy/source-freshness)
@@ -73,7 +75,6 @@ In order to let dbt connect to your warehouse, you'll need to generate a keyfile
1. Start the [GCP credentials wizard](https://console.cloud.google.com/apis/credentials/wizard). Make sure your new project is selected in the header. If you do not see your account or project, click your profile picture to the right and verify you are using the correct email account. For **Credential Type**:
- From the **Select an API** dropdown, choose **BigQuery API**
- Select **Application data** for the type of data you will be accessing
- - Select **No, I’m not using them** and click **Next**.
- Click **Next** to create a new service account.
2. Create a service account for your new project from the [Service accounts page](https://console.cloud.google.com/projectselector2/iam-admin/serviceaccounts?supportedpurview=project). For more information, refer to [Create a service account](https://developers.google.com/workspace/guides/create-credentials#create_a_service_account) in the Google Cloud docs. As an example for this guide, you can:
- Type `dbt-user` as the **Service account name**
diff --git a/website/docs/guides/legacy/building-packages.md b/website/docs/guides/building-packages.md
similarity index 88%
rename from website/docs/guides/legacy/building-packages.md
rename to website/docs/guides/building-packages.md
index 2a6803334d4..641a1c6af6d 100644
--- a/website/docs/guides/legacy/building-packages.md
+++ b/website/docs/guides/building-packages.md
@@ -1,26 +1,38 @@
---
-title: "Building a dbt package" # to do: update this to creating
-id: "building-packages"
+title: Building dbt packages
+id: building-packages
+description: "When you have dbt code that might help others, you can create a package for dbt using a GitHub repository."
+displayText: Building dbt packages
+hoverSnippet: Learn how to create packages for dbt.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['dbt Core']
+level: 'Advanced'
+recently_updated: true
---
-## Assumed knowledge
-This article assumes you are familiar with:
+## Introduction
+
+Creating packages is an **advanced use of dbt**. If you're new to the tool, we recommend that you first use the product for your own analytics before attempting to create a package for others.
+
+### Prerequisites
+
+A strong understanding of:
- [packages](/docs/build/packages)
- administering a repository on GitHub
- [semantic versioning](https://semver.org/)
-Heads up — developing a package is an **advanced use of dbt**. If you're new to the tool, we recommend that you first use the product for your own company's analytics before attempting to create a package.
-
-## 1. Assess whether a package is the right solution
+### Assess whether a package is the right solution
Packages typically contain either:
- macros that solve a particular analytics engineering problem — for example, [auditing the results of a query](https://hub.getdbt.com/dbt-labs/audit_helper/latest/), [generating code](https://hub.getdbt.com/dbt-labs/codegen/latest/), or [adding additional schema tests to a dbt project](https://hub.getdbt.com/calogica/dbt_expectations/latest/).
- models for a common dataset — for example a dataset for software products like [MailChimp](https://hub.getdbt.com/fivetran/mailchimp/latest/) or [Snowplow](https://hub.getdbt.com/dbt-labs/snowplow/latest/), or even models for metadata about your data stack like [Snowflake query spend](https://hub.getdbt.com/gitlabhq/snowflake_spend/latest/) and [the artifacts produced by `dbt run`](https://hub.getdbt.com/tailsdotcom/dbt_artifacts/latest/). In general, there should be a shared set of industry-standard metrics that you can model (e.g. email open rate).
Packages are _not_ a good fit for sharing models that contain business-specific logic, for example, writing code for marketing attribution, or monthly recurring revenue. Instead, consider sharing a blog post and a link to a sample repo, rather than bundling this code as a package (here's our blog post on [marketing attribution](https://blog.getdbt.com/modeling-marketing-attribution/) as an example).
-## 2. Create your new project
-:::note Using the CLI for package development
-We tend to use the CLI for package development. The development workflow often involves installing a local copy of your package in another dbt project — at present dbt Cloud is not designed for this workflow.
+## Create your new project
+:::note Using the command line for package development
+We tend to use the command line interface for package development. The development workflow often involves installing a local copy of your package in another dbt project — at present dbt Cloud is not designed for this workflow.
:::
1. Use the [dbt init](/reference/commands/init) command to create a new dbt project, which will be your package:
@@ -33,15 +45,15 @@ $ dbt init [package_name]
¹Currently, our package registry only supports packages that are hosted in GitHub.
-## 3. Develop your package
+## Develop your package
We recommend that first-time package authors first develop macros and models for use in their own dbt project. Once your new package is created, you can get to work on moving them across, implementing some additional package-specific design patterns along the way.
When working on your package, we often find it useful to install a local copy of the package in another dbt project — this workflow is described [here](https://discourse.getdbt.com/t/contributing-to-an-external-dbt-package/657).
-### Follow our best practices
+### Follow best practices
_Modeling packages only_
-Use our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md), our article on [how we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), and our [best practices](best-practices) for all of our advice on how to build your dbt project.
+Use our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md), our article on [how we structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview), and our [best practices](best-practices) for all of our advice on how to build your dbt project.
This is where it comes in especially handy to have worked on your own dbt project previously.
@@ -103,7 +115,7 @@ Over time, we've developed a set of useful GitHub artifacts that make administer
- Descriptions of the main models included in the package ([example](https://github.com/dbt-labs/snowplow))
- GitHub templates, including PR templates and issue templates ([example](https://github.com/dbt-labs/dbt-audit-helper/tree/master/.github))
-## 4. Add integration tests
+## Add integration tests
_Optional_
We recommend that you implement integration tests to confirm that the package works as expected — this is an even _more_ advanced step, so you may find that you build up to this.
@@ -125,7 +137,7 @@ packages:
-4. Add resources to the package (seeds, models, tests) so that you can successfully run your project, and compare the output with what you expect. The exact appraoch here will vary depending on your packages. In general you will find that you need to:
+4. Add resources to the package (seeds, models, tests) so that you can successfully run your project, and compare the output with what you expect. The exact approach here will vary depending on your packages. In general you will find that you need to:
- Add mock data via a [seed](/docs/build/seeds) with a few sample (anonymized) records. Configure the `integration_tests` project to point to the seeds instead of raw data tables.
- Add more seeds that represent the expected output of your models, and use the [dbt_utils.equality](https://github.com/dbt-labs/dbt-utils#equality-source) test to confirm the output of your package, and the expected output matches.
@@ -134,7 +146,7 @@ packages:
5. (Optional) Use a CI tool, like CircleCI or GitHub Actions, to automate running your dbt project when you open a new Pull Request. For inspiration, check out one of our [CircleCI configs](https://github.com/dbt-labs/snowplow/blob/main/.circleci/config.yml), which runs tests against our four main warehouses. Note: this is an advanced step — if you are going down this path, you may find it useful to say hi on [dbt Slack](https://community.getdbt.com/).
-## 5. Deploy the docs for your package
+## Deploy the docs for your package
_Optional_
A dbt docs site can help a prospective user of your package understand the code you've written. As such, we recommend that you deploy the site generated by `dbt docs generate` and link to the deployed site from your package.
@@ -147,12 +159,13 @@ The easiest way we've found to do this is to use [GitHub Pages](https://pages.gi
4. Enable GitHub pages on the repo in the settings tab, and point it to the “docs” subdirectory
4. GitHub should then deploy the docs at `.github.io/`, like so: [fivetran.github.io/dbt_ad_reporting](https://fivetran.github.io/dbt_ad_reporting/)
-## 6. Release your package
+## Release your package
Create a new [release](https://docs.github.com/en/github/administering-a-repository/managing-releases-in-a-repository) once you are ready for others to use your work! Be sure to use [semantic versioning](https://semver.org/) when naming your release.
In particular, if new changes will cause errors for users of earlier versions of the package, be sure to use _at least_ a minor release (e.g. go from `0.1.1` to `0.2.0`).
The release notes should contain an overview of the changes introduced in the new version. Be sure to call out any changes that break the existing interface!
-## 7. Add the package to hub.getdbt.com
+## Add the package to hub.getdbt.com
+
Our package registry, [hub.getdbt.com](https://hub.getdbt.com/), gets updated by the [hubcap script](https://github.com/dbt-labs/hubcap). To add your package to hub.getdbt.com, create a PR on the [hubcap repository](https://github.com/dbt-labs/hubcap) to include it in the `hub.json` file.
diff --git a/website/docs/quickstarts/codespace-qs.md b/website/docs/guides/codespace-qs.md
similarity index 93%
rename from website/docs/quickstarts/codespace-qs.md
rename to website/docs/guides/codespace-qs.md
index 3cd048c97a4..7712ed8f8e8 100644
--- a/website/docs/quickstarts/codespace-qs.md
+++ b/website/docs/guides/codespace-qs.md
@@ -1,9 +1,11 @@
---
-title: "Quickstart for dbt Core using GitHub Codespaces"
+title: Quickstart for dbt Core using GitHub Codespaces
id: codespace
platform: 'dbt-core'
icon: 'fa-github'
+level: 'Beginner'
hide_table_of_contents: true
+tags: ['dbt Core','Quickstart']
---
## Introduction
@@ -19,10 +21,10 @@ dbt Labs provides a [GitHub Codespace](https://docs.github.com/en/codespaces/ove
## Related content
-- [Create a GitHub repository](/quickstarts/manual-install?step=2)
-- [Build your first models](/quickstarts/manual-install?step=3)
-- [Test and document your project](/quickstarts/manual-install?step=4)
-- [Schedule a job](/quickstarts/manual-install?step=5)
+- [Create a GitHub repository](/guides/manual-install?step=2)
+- [Build your first models](/guides/manual-install?step=3)
+- [Test and document your project](/guides/manual-install?step=4)
+- [Schedule a job](/guides/manual-install?step=5)
- Learn more with [dbt Courses](https://courses.getdbt.com/collections)
## Create a codespace
diff --git a/website/docs/guides/advanced/creating-new-materializations.md b/website/docs/guides/create-new-materializations.md
similarity index 95%
rename from website/docs/guides/advanced/creating-new-materializations.md
rename to website/docs/guides/create-new-materializations.md
index d3081ea8e20..1ad7d202de6 100644
--- a/website/docs/guides/advanced/creating-new-materializations.md
+++ b/website/docs/guides/create-new-materializations.md
@@ -1,12 +1,18 @@
---
-title: "Creating new materializations"
-id: "creating-new-materializations"
+title: "Create new materializations"
+id: create-new-materializations
description: Learn how to create your own materializations.
displayText: Creating new materializations
hoverSnippet: Learn how to create your own materializations.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['dbt Core']
+level: 'Advanced'
+recently_updated: true
---
-## Overview
+## Introduction
The model materializations you're familiar with, `table`, `view`, and `incremental` are implemented as macros in a package that's distributed along with dbt. You can check out the [source code for these materializations](https://github.com/dbt-labs/dbt-core/tree/main/core/dbt/include/global_project/macros/materializations). If you need to create your own materializations, reading these files is a good place to start. Continue reading below for a deep-dive into dbt materializations.
@@ -110,13 +116,6 @@ Be sure to `commit` the transaction in the `cleanup` phase of the materializatio
### Update the Relation cache
-
-:::info New in 0.15.0
-
-The ability to synchronize the Relation cache is new in dbt v0.15.0
-
-:::
-
Materializations should [return](/reference/dbt-jinja-functions/return) the list of Relations that they have created at the end of execution. dbt will use this list of Relations to update the relation cache in order to reduce the number of queries executed against the database's `information_schema`. If a list of Relations is not returned, then dbt will raise a Deprecation Warning and infer the created relation from the model's configured database, schema, and alias.
@@ -172,13 +171,6 @@ For more information on the `config` dbt Jinja function, see the [config](/refer
## Materialization precedence
-
-:::info New in 0.15.1
-
-The materialization resolution order was poorly defined in versions of dbt prior to 0.15.1. Please use this guide for versions of dbt greater than or equal to 0.15.1.
-
-:::
-
dbt will pick the materialization macro in the following order (lower takes priority):
1. global project - default
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md b/website/docs/guides/custom-cicd-pipelines.md
similarity index 57%
rename from website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md
rename to website/docs/guides/custom-cicd-pipelines.md
index d618f9eec64..672c6e6dab8 100644
--- a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md
+++ b/website/docs/guides/custom-cicd-pipelines.md
@@ -1,12 +1,63 @@
---
-title: Run a dbt Cloud job on merge
-id: 3-dbt-cloud-job-on-merge
+title: Customizing CI/CD with custom pipelines
+id: custom-cicd-pipelines
+description: "Learn the benefits of version-controlled analytics code and custom pipelines in dbt for enhanced code testing and workflow automation during the development process."
+displayText: Learn version-controlled code, custom pipelines, and enhanced code testing.
+hoverSnippet: Learn version-controlled code, custom pipelines, and enhanced code testing.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['dbt Cloud', 'Orchestration', 'CI']
+level: 'Intermediate'
+recently_updated: true
---
-This job will take a bit more to setup, but is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts presented here can be generalized and used in whatever way best suits your use case.
+## Introduction
+
+One of the core tenets of dbt is that analytic code should be version controlled. This provides a ton of benefit to your organization in terms of collaboration, code consistency, stability, and the ability to roll back to a prior version. Your code hosting platform also provides an additional benefit that is often overlooked or underutilized. Some of you may have experience using dbt Cloud's [webhook functionality](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) to run a job when a PR is created. This is a fantastic capability, and it meets most use cases for testing your code before merging to production. However, there are circumstances when an organization needs additional functionality, like running workflows on every commit (linting), or running workflows after a merge is complete. In this article, we will show you how to set up custom pipelines to lint your project and trigger a dbt Cloud job via the API.
+
+A note on parlance in this article, since each code hosting platform uses different terms for similar concepts: the terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch.
+
+
+### What are pipelines?
+
+Pipelines (which are known by many names, such as workflows, actions, or build steps) are a series of pre-defined jobs that are triggered by specific events in your repository (PR created, commit pushed, branch merged, etc). Those jobs can do pretty much anything your heart desires assuming you have the proper security access and coding chops.
+
+Jobs are executed on [runners](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#runners), which are virtual servers. The runners come pre-configured with Ubuntu Linux, macOS, or Windows. That means the commands you execute are determined by the operating system of your runner. You'll see how this comes into play later in the setup, but for now just remember that your code is executed on virtual servers that are typically hosted by the code hosting platform.
+
+![Diagram of how pipelines work](/img/guides/orchestration/custom-cicd-pipelines/pipeline-diagram.png)
+
+Please note, runners hosted by your code hosting platform provide a certain amount of free time. After that, billing charges may apply depending on how your account is set up. You also have the ability to host your own runners. That is beyond the scope of this article, but check out the links below for more information if you're interested in setting that up:
+
+- Repo-hosted runner billing information:
+ - [GitHub](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions)
+ - [GitLab](https://docs.gitlab.com/ee/ci/pipelines/cicd_minutes.html)
+ - [Bitbucket](https://bitbucket.org/product/features/pipelines#)
+- Self-hosted runner information:
+ - [GitHub](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners)
+ - [GitLab](https://docs.gitlab.com/runner/)
+ - [Bitbucket](https://support.atlassian.com/bitbucket-cloud/docs/runners/)
+
+Additionally, if you’re using the free tier of GitLab you can still follow this guide, but it may ask you to provide a credit card to verify your account. You’ll see something like this the first time you try to run a pipeline:
+
+![Warning from GitLab showing payment information is required](/img/guides/orchestration/custom-cicd-pipelines/gitlab-cicd-payment-warning.png)
-The setup below shows how to call the dbt Cloud API to run a job every time there is a push to your main branch (The branch where pull requests are typically merged. Commonly referred to as the main, primary, or master branch, but can be named differently).
+### How to set up pipelines
+
+This guide provides details for multiple code hosting platforms. Where steps are unique, they are presented without a selection option. If code is specific to a platform (for example, GitHub, GitLab, or Bitbucket) you will see a selection option for each.
+
+Pipelines can be triggered by various events. The [dbt Cloud webhook](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) process already triggers a run if you want to run your jobs on a merge request, so this guide focuses on running pipelines for every push and when PRs are merged. Since pushes happen frequently in a project, we’ll keep this job super simple and fast by linting with SQLFluff. The pipeline that runs on merge requests will run less frequently, and can be used to call the dbt Cloud API to trigger a specific job. This can be helpful if you have specific requirements that need to happen when code is updated in production, like running a `--full-refresh` on all impacted incremental models.
+
+Here’s a quick look at what this pipeline will accomplish:
+
+![Diagram showing the pipelines to be created and the programs involved](/img/guides/orchestration/custom-cicd-pipelines/pipeline-programs-diagram.png)
+
+## Run a dbt Cloud job on merge
+
+This job will take a bit more to set up, but it is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts presented here can be generalized and used in whatever way best suits your use case.
+
+The setup below shows how to call the dbt Cloud API to run a job every time there's a push to your main branch (the branch where pull requests are typically merged, commonly referred to as the main, primary, or master branch, but it can be named differently).
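+
+For orientation, the underlying API request that this pipeline will make looks roughly like the following sketch (the account and job ids are placeholders, and the scripts used later in this guide wrap the same kind of call):
+
+```python
+import os
+
+import requests
+
+ACCOUNT_ID = 16173  # replace with your dbt Cloud account id
+JOB_ID = 65767      # replace with the dbt Cloud job to run on merge
+
+# Trigger the job through the dbt Cloud v2 API, authenticating with the
+# DBT_API_KEY secret you'll store in your repository in the next steps.
+response = requests.post(
+    f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}/jobs/{JOB_ID}/run/",
+    headers={"Authorization": f"Token {os.environ['DBT_API_KEY']}"},
+    json={"cause": "Triggered by a merge to the main branch"},
+)
+response.raise_for_status()
+print(f"Started dbt Cloud run {response.json()['data']['id']}")
+```
+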
### 1. Get your dbt Cloud API key
@@ -28,7 +79,7 @@ Here’s a video showing the steps as well:
### 2. Put your dbt Cloud API key into your repo
-This next part will happen in you code hosting platform. We need to save your API key from above into a repository secret so the job we create can access it. It is **not** recommended to ever save passwords or API keys in your code, so this step ensures that your key stays secure, but is still usable for your pipelines.
+This next part will happen in your code hosting platform. We need to save your API key from above into a repository secret so the job we create can access it. It is **not** recommended to ever save passwords or API keys in your code, so this step ensures that your key stays secure, but is still usable for your pipelines.
-In GitHub:
-
- Open up your repository where you want to run the pipeline (the same one that houses your dbt project)
- Click *Settings* to open up the repository options
- On the left click the *Security* dropdown
- From that list, click on *Actions*
- Towards the middle of the screen, click the *New repository secret* button
- It will ask you for a name, so let’s call ours `DBT_API_KEY`
- - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
+ - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
- In the *Value* section, paste in the key you copied from dbt Cloud
- Click *Add secret* and you’re all set!
@@ -62,23 +111,21 @@ Here’s a video showing these steps:
-In GitLab:
-
- Open up your repository where you want to run the pipeline (the same one that houses your dbt project)
- Click *Settings* > *CI/CD*
- Under the *Variables* section, click *Expand,* then click *Add variable*
- It will ask you for a name, so let’s call ours `DBT_API_KEY`
- - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
+ - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
- In the *Value* section, paste in the key you copied from dbt Cloud
- Make sure the check box next to *Protect variable* is unchecked, and the box next to *Mask variable* is selected (see below)
- - “Protected” means that the variable is only available in pipelines that run on protected branches or protected tags - that won’t work for us because we want to run this pipeline on multiple branches. “Masked” means that it will be available to your pipeline runner, but will be masked in the logs.
-
+ - “Protected” means that the variable is only available in pipelines that run on protected branches or protected tags - that won’t work for us because we want to run this pipeline on multiple branches. “Masked” means that it will be available to your pipeline runner, but will be masked in the logs.
+
![View of the GitLab window for entering DBT_API_KEY](/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-gitlab.png)
-
+
Here’s a video showing these steps:
-
+
-
+
@@ -91,7 +138,7 @@ In Azure:
- Select *Starter pipeline* (this will be updated later in Step 4)
- Click on *Variables* and then *New variable*
- In the *Name* field, enter the `DBT_API_KEY`
- - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
+ - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
- In the *Value* section, paste in the key you copied from dbt Cloud
- Make sure the check box next to *Keep this value secret* is checked. This will mask the value in logs, and you won't be able to see the value for the variable in the UI.
- Click *OK* and then *Save* to save the variable
@@ -99,7 +146,7 @@ In Azure:
-
+
In Bitbucket:
@@ -108,16 +155,16 @@ In Bitbucket:
- In the left menu, click *Repository Settings*
- Scroll to the bottom of the left menu, and select *Repository variables*
- In the *Name* field, input `DBT_API_KEY`
- - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
+ - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.**
- In the *Value* section, paste in the key you copied from dbt Cloud
- Make sure the check box next to *Secured* is checked. This will mask the value in logs, and you won't be able to see the value for the variable in the UI.
- Click *Add* to save the variable
-
+
![View of the Bitbucket window for entering DBT_API_KEY](/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-bitbucket.png)
-
+
Here’s a video showing these steps:
-
+
@@ -304,13 +351,12 @@ run-dbt-cloud-job:
-
For this new job, open the existing Azure pipeline you created above and select the *Edit* button. We'll want to edit the corresponding Azure pipeline YAML file with the appropriate configuration, instead of the starter code, along with including a `variables` section to pass in the required variables.
-Copy the below YAML file into your Azure pipeline and update the variables below to match your setup based on the comments in the file. It's worth noting that we changed the `trigger` section so that it will run **only** when there are pushes to a branch named `main` (like a PR merged to your main branch).
+Copy the below YAML file into your Azure pipeline and update the variables below to match your setup based on the comments in the file. It's worth noting that we changed the `trigger` section so that it will run **only** when there are pushes to a branch named `main` (like a PR merged to your main branch).
Read through [Azure's docs](https://learn.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops) on these filters for additional use cases.
@@ -406,13 +452,12 @@ pipelines:
-
### 5. Test your new action
-Now that you have a shiny new action, it’s time to test it out! Since this change is setup to only run on merges to your default branch, you’ll need to create and merge this change into your main branch. Once you do that, you’ll see a new pipeline job has been triggered to run the dbt Cloud job you assigned in the variables section.
+Now that you have a shiny new action, it’s time to test it out! Since this change is set up to only run on merges to your default branch, you’ll need to create and merge this change into your main branch. Once you do that, you’ll see a new pipeline job has been triggered to run the dbt Cloud job you assigned in the variables section.
Additionally, you’ll see the job in the run history of dbt Cloud. It should be fairly easy to spot because it will say it was triggered by the API, and the *INFO* section will have the branch you used for this guide.
@@ -454,3 +499,140 @@ Additionally, you’ll see the job in the run history of dbt Cloud. It should be
+
+## Run a dbt Cloud job on pull request
+
+If your git provider is not one with a native integration with dbt Cloud, but you still want to take advantage of CI builds, you've come to the right spot! With just a bit of work it's possible to setup a job that will run a dbt Cloud job when a pull request (PR) is created.
+
+:::info Run on PR
+
+If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/ci-jobs).
+This section is only for those projects that connect to their git repository using an SSH key.
+
+:::
+
+The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-5 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)**
+
+### 1. Create a pipeline job that runs when PRs are created
+
+
+
+
+For this job, we'll set it up using the `bitbucket-pipelines.yml` file as in the prior step. The YAML file will look pretty similar to our earlier job, but we’ll pass in the required variables to the Python script using `export` statements. Update this section to match your setup based on the comments in the file.
+
+**What is this pipeline going to do?**
+The setup below will trigger a dbt Cloud job to run every time a PR is opened in this repository. It will also run a fresh version of the pipeline for every commit that is made on the PR until it is merged.
+For example: If you open a PR, it will run the pipeline. If you then decide additional changes are needed, and commit/push to the PR branch, a new pipeline will run with the updated code.
+
+The following variables control this job:
+
+- `DBT_JOB_BRANCH`: Tells the dbt Cloud job to run the code in the branch that created this PR
+- `DBT_JOB_SCHEMA_OVERRIDE`: Tells the dbt Cloud job to build into a custom target schema
+ - The format of this will look like: `DBT_CLOUD_PR_{REPO_KEY}_{PR_NUMBER}`
+
+```yaml
+image: python:3.11.1
+
+
+pipelines:
+ # This job will run when pull requests are created in the repository
+ pull-requests:
+ '**':
+ - step:
+ name: 'Run dbt Cloud PR Job'
+ script:
+            # Check to only build if the PR destination is main (or another branch of your choosing).
+            # Comment out or remove the line below if you want to run on all PRs regardless of destination branch.
+            - if [ "${BITBUCKET_PR_DESTINATION_BRANCH}" != "main" ]; then printf 'PR destination is not main, exiting.'; exit; fi
+ - export DBT_URL="https://cloud.getdbt.com"
+ - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job"
+ - export DBT_JOB_BRANCH=$BITBUCKET_BRANCH
+ - export DBT_JOB_SCHEMA_OVERRIDE="DBT_CLOUD_PR_"$BITBUCKET_PROJECT_KEY"_"$BITBUCKET_PR_ID
+ - export DBT_ACCOUNT_ID=00000 # enter your account id here
+ - export DBT_PROJECT_ID=00000 # enter your project id here
+ - export DBT_PR_JOB_ID=00000 # enter your job id here
+ - python python/run_and_monitor_dbt_job.py
+```
+
+
+
+
+### 2. Confirm the pipeline runs
+
+Now that you have a new pipeline, it's time to run it and make sure it works. Since this only triggers when a PR is created, you'll need to create a new PR on a branch that contains the code above. Once you do that, you should see a pipeline that looks like this:
+
+
+
+
+Bitbucket pipeline:
+![dbt run on PR job in Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png)
+
+dbt Cloud job:
+![dbt Cloud job showing it was triggered by Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png)
+
+
+
+
+### 3. Handle those extra schemas in your database
+
+As noted above, when the PR job runs it will create a new schema based on the PR. To avoid having your database overwhelmed with PR schemas, consider adding a "cleanup" job to your dbt Cloud account. This job can run on a scheduled basis to clean up any PR schemas that haven't been updated or used recently.
+
+Add this as a macro to your project. It takes two arguments that let you control which schemas get dropped:
+
+- `age_in_days`: The number of days since the schema was last altered before it should be dropped (default 10 days)
+- `database_to_clean`: The name of the database to remove schemas from
+
+```sql
+{#
+ This macro finds PR schemas older than a set date and drops them
+ The macro defaults to 10 days old, but can be configured with the input argument age_in_days
+ Sample usage with different date:
+ dbt run-operation pr_schema_cleanup --args "{'database_to_clean': 'analytics','age_in_days':'15'}"
+#}
+{% macro pr_schema_cleanup(database_to_clean, age_in_days=10) %}
+
+ {% set find_old_schemas %}
+ select
+ 'drop schema {{ database_to_clean }}.'||schema_name||';'
+ from {{ database_to_clean }}.information_schema.schemata
+ where
+ catalog_name = '{{ database_to_clean | upper }}'
+ and schema_name ilike 'DBT_CLOUD_PR%'
+ and last_altered <= (current_date() - interval '{{ age_in_days }} days')
+ {% endset %}
+
+ {% if execute %}
+
+ {{ log('Schema drop statements:' ,True) }}
+
+ {% set schema_drop_list = run_query(find_old_schemas).columns[0].values() %}
+
+ {% for schema_to_drop in schema_drop_list %}
+ {% do run_query(schema_to_drop) %}
+ {{ log(schema_to_drop ,True) }}
+ {% endfor %}
+
+ {% endif %}
+
+{% endmacro %}
+```
+
+This macro goes into a dbt Cloud job that is run on a schedule. The command will look like this (text below for copy/paste):
+![dbt Cloud job showing the run operation command for the cleanup macro](/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png)
+`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"`
+
+## Consider risk of conflicts when using multiple orchestration tools
+
+Running dbt Cloud jobs through a CI/CD pipeline is a form of job orchestration. If you also run jobs using dbt Cloud’s built-in scheduler, you now have two orchestration tools running jobs. The risk is that they can conflict: if you trigger pipelines on certain actions while also running scheduled jobs in dbt Cloud, you will likely run into job clashes. The more tools you have, the more work it takes to make sure they all talk to each other.
+
+That being said, if **the only reason you want to use pipelines is for adding a lint check or run on merge**, you might decide the pros outweigh the cons and go with a hybrid approach. Just keep in mind that if two processes try to run the same job at the same time, dbt Cloud will queue the jobs and run them one after the other. It’s a balancing act, but it can be managed with diligence to ensure you’re orchestrating jobs in a manner that doesn’t conflict.
diff --git a/website/docs/quickstarts/databricks-qs.md b/website/docs/guides/databricks-qs.md
similarity index 98%
rename from website/docs/quickstarts/databricks-qs.md
rename to website/docs/guides/databricks-qs.md
index 1222ef2a7d5..5a0c5536e7f 100644
--- a/website/docs/quickstarts/databricks-qs.md
+++ b/website/docs/guides/databricks-qs.md
@@ -1,9 +1,11 @@
---
title: "Quickstart for dbt Cloud and Databricks"
id: "databricks"
-platform: 'dbt-cloud'
+level: 'Beginner'
icon: 'databricks'
hide_table_of_contents: true
+recently_updated: true
+tags: ['dbt Cloud', 'Quickstart','Databricks']
---
## Introduction
@@ -30,8 +32,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen
### Related content
- Learn more with [dbt Courses](https://courses.getdbt.com/collections)
-- [dbt Cloud CI job](/docs/deploy/continuous-integration)
-- [Job triggers](/docs/deploy/job-triggers)
+- [CI jobs](/docs/deploy/continuous-integration)
+- [Deploy jobs](/docs/deploy/deploy-jobs)
- [Job notifications](/docs/deploy/job-notifications)
- [Source freshness](/docs/deploy/source-freshness)
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md b/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md
deleted file mode 100644
index 0959dbee707..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md
+++ /dev/null
@@ -1,100 +0,0 @@
----
-title: "What are adapters? Why do we need them?"
-id: "1-what-are-adapters"
----
-
-Adapters are an essential component of dbt. At their most basic level, they are how dbt Core connects with the various supported data platforms. At a higher level, dbt Core adapters strive to give analytics engineers more transferable skills as well as standardize how analytics projects are structured. Gone are the days when you had to learn a new language or flavor of SQL when you moved to a new job with a different data platform. That is the power of adapters in dbt Core.
-
- Navigating and developing around the nuances of different databases can be daunting, but you are not alone. Visit [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel for additional help beyond the documentation.
-
-## All databases are not the same
-
-There's a tremendous amount of work that goes into creating a database. Here is a high-level list of typical database layers (from the outermost layer moving inwards):
-- SQL API
-- Client Library / Driver
-- Server Connection Manager
-- Query parser
-- Query optimizer
-- Runtime
-- Storage Access Layer
-- Storage
-
-There's a lot more there than just SQL as a language. Databases (and data warehouses) are so popular because you can abstract away a great deal of the complexity from your brain to the database itself. This enables you to focus more on the data.
-
-dbt allows for further abstraction and standardization of the outermost layers of a database (SQL API, client library, connection manager) into a framework that both:
- - Opens database technology to less technical users (a large swath of a DBA's role has been automated, similar to how the vast majority of folks with websites today no longer have to be "[webmasters](https://en.wikipedia.org/wiki/Webmaster)").
- - Enables more meaningful conversations about how data warehousing should be done.
-
-This is where dbt adapters become critical.
-
-## What needs to be adapted?
-
-dbt adapters are responsible for _adapting_ dbt's standard functionality to a particular database. Our prototypical database and adapter are PostgreSQL and dbt-postgres, and most of our adapters are somewhat based on the functionality described in dbt-postgres.
-
-Connecting dbt to a new database will require a new adapter to be built or an existing adapter to be extended.
-
-The outermost layers of a database map roughly to the areas in which the dbt adapter framework encapsulates inter-database differences.
-
-### SQL API
-
-Even amongst ANSI-compliant databases, there are differences in the SQL grammar.
-Here are some categories and examples of SQL statements that can be constructed differently:
-
-
-| Category | Area of differences | Examples |
-|----------------------------------------------|--------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Statement syntax                              | The use of `IF EXISTS`                                                                             | - `IF EXISTS, DROP TABLE`<br/>- `DROP IF EXISTS`                                                                                                                                 |
-| Workflow definition & semantics               | Incremental updates                                                                                | - `MERGE`<br/>- `DELETE; INSERT`                                                                                                                                                 |
-| Relation and column attributes/configuration  | Database-specific materialization configs                                                          | - `DIST = ROUND_ROBIN` (Synapse)<br/>- `DIST = EVEN` (Redshift)                                                                                                                  |
-| Permissioning                                 | Grant statements that can only take one grantee at a time vs those that accept lists of grantees   | - `grant SELECT on table dinner.corn to corn_kid, everyone`<br/>- `grant SELECT on table dinner.corn to corn_kid; grant SELECT on table dinner.corn to everyone`                 |
-
-### Python Client Library & Connection Manager
-
-The other big category of inter-database differences comes with how the client connects to the database and executes queries against the connection. To integrate with dbt, a data platform must have a pre-existing python client library or support ODBC, using a generic python library like pyodbc.
-
-| Category | Area of differences | Examples |
-|------------------------------|-------------------------------------------|-------------------------------------------------------------------------------------------------------------|
-| Credentials & authentication | Authentication                             | - Username & password<br/>- MFA with `boto3` or Okta token                                                                                           |
-| Connection opening/closing   | Create a new connection to db              | - `psycopg2.connect(connection_string)`<br/>- `google.cloud.bigquery.Client(...)`                                                                    |
-| Inserting local data         | Load seed `.csv` files into Python memory  | - `google.cloud.bigquery.Client.load_table_from_file(...)` (BigQuery)<br/>- `INSERT ... INTO VALUES ...` prepared statement (most other databases)   |
-
-
-## How dbt encapsulates and abstracts these differences
-
-Differences between databases are encoded into discrete areas:
-
-| Components | Code Path | Function |
-|------------------|---------------------------------------------------|-------------------------------------------------------------------------------|
-| Python Classes   | `adapters/`                                        | Configuration (see [Python Classes](#python-classes) below)                             |
-| Macros | `include//macros/adapters/` | SQL API & statement syntax (for example, how to create schema or how to get table info) |
-| Materializations | `include//macros/materializations/` | Table/view/snapshot/ workflow definitions |
-
-
-### Python Classes
-
-These classes implement all the methods responsible for:
-- Connecting to a database and issuing queries.
-- Providing dbt with database-specific configuration information.
-
-| Class | Description |
-|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| AdapterClass | High-level configuration type conversion and any database-specific python methods needed |
-| AdapterCredentials | Typed dictionary of possible profiles and associated methods |
-| AdapterConnectionManager | All the methods responsible for connecting to a database and issuing queries |
-| AdapterRelation | How relation names should be rendered, printed, and quoted. Do relation names use all three parts? `catalog.model_name` (two-part name) or `database.schema.model_name` (three-part name) |
-| AdapterColumn | How names should be rendered, and database-specific properties |
-
-### Macros
-
-A set of *macros* responsible for generating SQL that is compliant with the target database.
-
-### Materializations
-
-A set of *materializations* and their corresponding helper macros defined in dbt using jinja and SQL. They codify for dbt how model files should be persisted into the database.
-
-## Adapter Architecture
-
-
-Below is a diagram of how dbt-postgres, the adapter at the center of dbt-core, works.
-
-
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md
deleted file mode 100644
index 28cd8935937..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md
+++ /dev/null
@@ -1,52 +0,0 @@
----
-title: "Prerequisites for a new adapter"
-id: "2-prerequisites-for-a-new-adapter"
----
-
-To learn what adapters are and the role they serve, see [What are adapters?](1-what-are-adapters)
-
-It is very important to make sure that you have the right skills, and to understand the level of difficulty involved in making an adapter for your data platform.
-
-## Pre-Requisite Data Warehouse Features
-
-The more of the questions below you can answer "yes" to, the easier your adapter development (and user) experience will be. See the [New Adapter Information Sheet wiki](https://github.com/dbt-labs/dbt-core/wiki/New-Adapter-Information-Sheet) for even more specific questions.
-
-### Training
-- the developer (and any product managers) ideally will have substantial experience as an end user of dbt. If not, it is highly advised that you at least take the [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) and [Advanced Materializations](https://courses.getdbt.com/courses/advanced-materializations) courses.
-
-### Database
-- Does the database complete transactions fast enough for interactive development?
-- Can you execute SQL against the data platform?
-- Is there a concept of schemas?
-- Does the data platform support ANSI SQL, or at least a subset?
-### Driver / Connection Library
-- Is there a Python-based driver for interacting with the database that is DB API 2.0 compliant (e.g. psycopg2 for Postgres, pyodbc for SQL Server)?
-- Does it support: prepared statements, multiple statements, or single sign on token authorization to the data platform?
-
-### Open source software
-- Does your organization have an established process for publishing open source software?
-
-
-It is easiest to build an adapter for dbt when the data platform in question has:
-- a conventional ANSI-SQL interface (or as close to it as possible),
-- a mature connection library/SDK that uses ODBC or the Python DB API 2.0, and
-- a way to enable developers to iterate rapidly with both quick reads and writes
-
-
-## Maintaining your new adapter
-
-When your adapter becomes more popular, and people start using it, you may quickly become the maintainer of an increasingly popular open source project. With this new role come some unexpected responsibilities that include not only code maintenance, but also working with a community of users and contributors. To help people understand what to expect of your project, you should communicate your intentions early and often in your adapter documentation or README. Answer questions like: Is this experimental work that people should use at their own risk? Or is this production-grade code that you're committed to maintaining into the future?
-
-### Keeping the code compatible with dbt Core
-
-New minor version releases of `dbt-core` may include changes to the Python interface for adapter plugins, as well as new or updated test cases. The maintainers of `dbt-core` will clearly communicate these changes in documentation and release notes, and they will aim for backwards compatibility whenever possible.
-
-Patch releases of `dbt-core` will _not_ include breaking changes to adapter-facing code. For more details, see ["About dbt Core versions"](/docs/dbt-versions/core).
-
-### Versioning and releasing your adapter
-
-We strongly encourage you to adopt the following approach when versioning and releasing your plugin:
-- The minor version of your plugin should match the minor version in `dbt-core` (e.g. 1.1.x).
-- Aim to release a new version of your plugin for each new minor version of `dbt-core` (once every three months).
-- While your plugin is new, and you're iterating on features, aim to offer backwards compatibility and deprecation notices for at least one minor version. As your plugin matures, aim to leave backwards compatibility and deprecation notices in place until the next major version (dbt Core v2).
-- Release patch versions of your plugins whenever needed. These patch releases should contain fixes _only_.
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md
deleted file mode 100644
index 43826ca4b1d..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md
+++ /dev/null
@@ -1,416 +0,0 @@
----
-title: "Building a new adapter"
-id: "3-building-a-new-adapter"
----
-
-:::tip
-Before you build your adapter, we strongly encourage you to first learn dbt as an end user, learn [what adapters are and the role they serve](1-what-are-adapters), as well as review the [data platform prerequisites](2-prerequisites-for-a-new-adapter)
-:::
-
-
-This guide will walk you through creating the necessary adapter classes and macros, and provide some resources to help you validate that your new adapter is working correctly. Once the adapter is passing most of the functional tests (see ["Testing a new adapter"](4-testing-a-new-adapter)), please let the community know that it is available to use by adding the adapter to the ["Supported Data Platforms"](/docs/supported-data-platforms) page by following the steps given in [Documenting your adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter).
-
-For any questions you may have, don't hesitate to ask in the [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel. The community is very helpful and likely has experienced a similar issue as you.
-
-## Scaffolding a new adapter
- To create a new adapter plugin from scratch, you can use the [dbt-database-adapter-scaffold](https://github.com/dbt-labs/dbt-database-adapter-scaffold) to trigger an interactive session which will generate a scaffolding for you to build upon.
-
- Example usage:
-
- ```
- $ cookiecutter gh:dbt-labs/dbt-database-adapter-scaffold
- ```
-
-The generated boilerplate starting project will include a basic adapter plugin file structure, examples of macros, high level method descriptions, etc.
-
-One of the most important choices you will make during the cookiecutter generation revolves around the `is_sql_adapter` field, which is a boolean used to correctly apply imports for either a `SQLAdapter` or a `BaseAdapter`. Knowing which you will need requires a deeper knowledge of your selected database, but a few good guides for the choice are:
-- Does your database have a complete SQL API? Can it perform tasks using SQL such as creating schemas, dropping schemas, querying an `information_schema` for metadata calls? If so, it is more likely to be a SQLAdapter where you set `is_sql_adapter` to `True`.
-- Most adapters do fall under SQL adapters, which is why we chose `True` as the default value.
-- It is entirely possible to build out a fully functional `BaseAdapter`. This will require a little more groundwork, as it doesn't come with some of the prebuilt methods the `SQLAdapter` class provides. See `dbt-bigquery` as a good guide.
-
-## Implementation Details
-
-Regardless of whether you use the cookiecutter template or create the plugin manually, this section will go over each method that you are required to implement. The table below provides a high-level overview of the classes, methods, and macros you may have to define for your data platform.
-
-| file | component | purpose |
-|---------------------------------------------------|-------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `./setup.py` | `setup()` function | adapter meta-data (package name, version, author, homepage, etc) |
-| `myadapter/dbt/adapters/myadapter/__init__.py` | `AdapterPlugin` | bundle all the information below into a dbt plugin |
-| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterCredentials` class                                       | parameters to connect to and configure the database, via the chosen Python driver                                                                                                     |
-| `myadapter/dbt/adapters/myadapter/connections.py` | `MyAdapterConnectionManager` class                                 | telling dbt how to interact with the database w.r.t. opening/closing connections, executing queries, and fetching data. Effectively a wrapper around the db API or driver.            |
-| `myadapter/dbt/include/myadapter/`                | a dbt project of macro "overrides" in the format of "myadapter__"  | any differences in SQL syntax for regular db operations will be modified here from the global_project (e.g. "Create Table As Select", "Get all relations in the current schema", etc) |
-| `myadapter/dbt/adapters/myadapter/impl.py`        | `MyAdapterConfig`                                                  | database- and relation-level configs                                                                                                                                                   |
-| `myadapter/dbt/adapters/myadapter/impl.py`        | `MyAdapterAdapter`                                                 | for changing _how_ dbt performs operations like macros and other needed Python functionality                                                                                           |
-| `myadapter/dbt/adapters/myadapter/column.py`      | `MyAdapterColumn`                                                  | for defining database-specific column behavior, such as datatype mappings                                                                                                              |
-
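-For the `AdapterPlugin` entry in the table above, here is a minimal sketch of how the pieces are typically bundled together in `__init__.py` (the `MyAdapter*` names and module paths are the hypothetical ones used throughout this table; real adapters such as dbt-postgres follow the same pattern):
-
-```python
-# myadapter/dbt/adapters/myadapter/__init__.py -- a minimal sketch; the class and
-# module names are the hypothetical "myadapter" ones used in the table above
-from dbt.adapters.base import AdapterPlugin
-
-from dbt.adapters.myadapter.connections import MyAdapterConnectionManager  # noqa
-from dbt.adapters.myadapter.connections import MyAdapterCredentials
-from dbt.adapters.myadapter.impl import MyAdapterAdapter
-
-from dbt.include import myadapter
-
-Plugin = AdapterPlugin(
-    adapter=MyAdapterAdapter,
-    credentials=MyAdapterCredentials,
-    include_path=myadapter.PACKAGE_PATH,
-)
-```
-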
-### Editing `setup.py`
-
-Edit the file at `myadapter/setup.py` and fill in the missing information.
-
-You can skip this step if you passed the arguments for `email`, `url`, `author`, and `dependencies` to the cookiecutter template script. If you plan on having nested macro folder structures, you may need to add entries to `package_data` so your macro source files get installed.
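-
-If you are writing `setup.py` by hand, a minimal sketch might look like the following (the package name, versions, and driver dependency are illustrative assumptions only):
-
-```python
-# myadapter/setup.py -- a minimal sketch; names, versions, and the driver
-# dependency below are illustrative assumptions only
-from setuptools import find_namespace_packages, setup
-
-setup(
-    name="dbt-myadapter",
-    version="1.6.0",
-    description="The MyAdapter plugin for dbt",
-    author="Your Name",
-    author_email="you@example.com",
-    url="https://github.com/example/dbt-myadapter",
-    packages=find_namespace_packages(include=["dbt", "dbt.*"]),
-    include_package_data=True,
-    install_requires=[
-        "dbt-core~=1.6.0",
-        "myadapter-driver>=2.0",  # hypothetical database driver
-    ],
-)
-```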
-
-### Editing the connection manager
-
-Edit the connection manager at `myadapter/dbt/adapters/myadapter/connections.py`. This file is defined in the sections below.
-
-#### The Credentials class
-
-The credentials class defines all of the database-specific credentials (e.g. `username` and `password`) that users will need in the [connection profile](/docs/supported-data-platforms) for your new adapter. Each credentials contract should subclass dbt.adapters.base.Credentials, and be implemented as a python dataclass.
-
-Note that the base class includes required database and schema fields, as dbt uses those values internally.
-
-For example, if your adapter requires a host, integer port, username string, and password string, but host is the only required field, you'd add definitions for those new properties to the class as types, like this:
-
-
-
-```python
-
-from dataclasses import dataclass
-from typing import Optional
-
-from dbt.adapters.base import Credentials
-
-
-@dataclass
-class MyAdapterCredentials(Credentials):
- host: str
- port: int = 1337
- username: Optional[str] = None
- password: Optional[str] = None
-
- @property
- def type(self):
- return 'myadapter'
-
- @property
- def unique_field(self):
- """
- Hashed and included in anonymous telemetry to track adapter adoption.
- Pick a field that can uniquely identify one team/organization building with this adapter
- """
- return self.host
-
- def _connection_keys(self):
- """
- List of keys to display in the `dbt debug` output.
- """
- return ('host', 'port', 'database', 'username')
-```
-
-
-
-There are a few things you can do to make it easier for users when connecting to your database:
-- Be sure to implement the Credentials' `_connection_keys` method shown above. This method will return the keys that should be displayed in the output of the `dbt debug` command. As a general rule, it's good to return all the arguments used in connecting to the actual database except the password (even optional arguments).
-- Create a `profile_template.yml` to enable configuration prompts for a brand-new user setting up a connection profile via the [`dbt init` command](/reference/commands/init). See more details [below](#other-files).
-- You may also want to define an `ALIASES` mapping on your Credentials class to include any config names you want users to be able to use in place of 'database' or 'schema'. For example if everyone using the MyAdapter database calls their databases "collections", you might do:
-
-
-
-```python
-@dataclass
-class MyAdapterCredentials(Credentials):
- host: str
- port: int = 1337
- username: Optional[str] = None
- password: Optional[str] = None
-
- ALIASES = {
- 'collection': 'database',
- }
-```
-
-
-
-Then users can use `collection` OR `database` in their `profiles.yml`, `dbt_project.yml`, or `config()` calls to set the database.
-
-#### `ConnectionManager` class methods
-
-Once credentials are configured, you'll need to implement some connection-oriented methods. They are enumerated in the SQLConnectionManager docstring, but an overview will also be provided here.
-
-**Methods to implement:**
-- `open`
-- `get_response`
-- `cancel`
-- `exception_handler`
-- `standardize_grants_dict`
-
-##### `open(cls, connection)`
-
-`open()` is a classmethod that gets a connection object (which could be in any state, but will have a `Credentials` object with the attributes you defined above) and moves it to the 'open' state.
-
-Generally this means doing the following:
- - if the connection is open already, log and return it.
- - If a database needed changes to the underlying connection before re-use, that would happen here
- - create a connection handle using the underlying database library using the credentials
- - on success:
- - set connection.state to `'open'`
- - set connection.handle to the handle object
- - this is what must have a `cursor()` method that returns a cursor!
- - on error:
- - set connection.state to `'fail'`
- - set connection.handle to `None`
- - raise a `dbt.exceptions.FailedToConnectException` with the error and any other relevant information
-
-For example:
-
-
-
-```python
- @classmethod
- def open(cls, connection):
- if connection.state == 'open':
- logger.debug('Connection is already open, skipping open.')
- return connection
-
- credentials = connection.credentials
-
- try:
- handle = myadapter_library.connect(
- host=credentials.host,
- port=credentials.port,
- username=credentials.username,
- password=credentials.password,
- catalog=credentials.database
- )
- connection.state = 'open'
- connection.handle = handle
-            return connection
-        except myadapter_library.DatabaseError as exc:
-            # on error (per the list above): mark the connection as failed and raise
-            connection.state = 'fail'
-            connection.handle = None
-            raise dbt.exceptions.FailedToConnectException(str(exc))
-```
-
-
-
-##### `get_response(cls, cursor)`
-
-`get_response` is a classmethod that gets a cursor object and returns adapter-specific information about the last executed command. The return value should be an `AdapterResponse` object that includes items such as `code`, `rows_affected`, `bytes_processed`, and a summary `_message` for logging to stdout.
-
-
-
-```python
- @classmethod
- def get_response(cls, cursor) -> AdapterResponse:
- code = cursor.sqlstate or "OK"
- rows = cursor.rowcount
- status_message = f"{code} {rows}"
- return AdapterResponse(
- _message=status_message,
- code=code,
- rows_affected=rows
- )
-```
-
-
-
-##### `cancel(self, connection)`
-
-`cancel` is an instance method that gets a connection object and attempts to cancel any ongoing queries, which is database dependent. Some databases don't support the concept of cancellation; in that case, the method can simply `pass`, and the adapter class should implement an `is_cancelable` classmethod that returns `False` (on Ctrl+C, connections may remain running). This method must be implemented carefully, as the affected connection will likely be in use in a different thread.
-
-
-
-```python
- def cancel(self, connection):
- tid = connection.handle.transaction_id()
- sql = 'select cancel_transaction({})'.format(tid)
-        logger.debug("Cancelling query '{}' ({})".format(connection.name, tid))
-        _, cursor = self.add_query(sql, 'master')
-        res = cursor.fetchone()
-        logger.debug("Canceled query '{}': {}".format(connection.name, res))
-```
-
-
-
-##### `exception_handler(self, sql, connection_name='master')`
-
-`exception_handler` is an instance method that returns a context manager that will handle exceptions raised by running queries, catch them, log appropriately, and then raise exceptions dbt knows how to handle.
-
-If you use the (highly recommended) `@contextmanager` decorator, you only have to wrap a `yield` inside a `try` block, like so:
-
-
-
-```python
- @contextmanager
-    def exception_handler(self, sql: str, connection_name='master'):
- try:
- yield
- except myadapter_library.DatabaseError as exc:
- self.release(connection_name)
-
-            logger.debug('myadapter error: {}'.format(str(exc)))
- raise dbt.exceptions.DatabaseException(str(exc))
- except Exception as exc:
- logger.debug("Error running SQL: {}".format(sql))
- logger.debug("Rolling back transaction.")
- self.release(connection_name)
- raise dbt.exceptions.RuntimeException(str(exc))
-```
-
-
-
-##### `standardize_grants_dict(self, grants_table: agate.Table) -> dict`
-
-`standardize_grants_dict` is a method that returns the dbt-standardized grants dictionary that matches how users configure grants now in dbt. The input is the result of a `SHOW GRANTS ON {{model}}` call, loaded into an agate table.
-
-If any massaging of the agate table containing the results of `SHOW GRANTS ON {{model}}` can't easily be accomplished in SQL, it can be done in this method. For example, the SQL to show grants *should* filter OUT any grants TO the current user/role (e.g. OWNERSHIP). If that's not possible in SQL, it can be done in this method instead.
-
-
-
-```python
- @available
- def standardize_grants_dict(self, grants_table: agate.Table) -> dict:
- """
- :param grants_table: An agate table containing the query result of
- the SQL returned by get_show_grant_sql
- :return: A standardized dictionary matching the `grants` config
- :rtype: dict
- """
- grants_dict: Dict[str, List[str]] = {}
- for row in grants_table:
- grantee = row["grantee"]
- privilege = row["privilege_type"]
- if privilege in grants_dict.keys():
- grants_dict[privilege].append(grantee)
- else:
- grants_dict.update({privilege: [grantee]})
- return grants_dict
-```
-
-
-
-### Editing the adapter implementation
-
-Edit the adapter implementation at `myadapter/dbt/adapters/myadapter/impl.py`
-
-Very little is required to implement the adapter itself. On some adapters, you will not need to override anything. On others, you'll likely need to override some of the `convert_*` classmethods, or override the `is_cancelable` classmethod to return `False`.
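-
-If it helps to visualize, here is a minimal sketch of such overrides, assuming a hypothetical adapter built on `SQLAdapter` (the specific methods shown are illustrative, not required for every adapter):
-
-```python
-# myadapter/dbt/adapters/myadapter/impl.py -- a minimal sketch; "myadapter" and the
-# choice of overrides below are illustrative assumptions
-import agate
-
-from dbt.adapters.sql import SQLAdapter
-from dbt.adapters.myadapter.connections import MyAdapterConnectionManager
-
-
-class MyAdapterAdapter(SQLAdapter):
-    ConnectionManager = MyAdapterConnectionManager
-
-    @classmethod
-    def is_cancelable(cls) -> bool:
-        # this hypothetical platform cannot cancel in-flight queries
-        return False
-
-    @classmethod
-    def convert_text_type(cls, agate_table: agate.Table, col_idx: int) -> str:
-        # map seed text columns to this platform's string type
-        return "string"
-```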
-
-
-#### `datenow()`
-
-This classmethod provides the adapter's canonical date function. It isn't actually used anywhere, but it is still required on all adapters.
-
-
-
-```python
- @classmethod
- def date_function(cls):
- return 'datenow()'
-```
-
-
-
-### Editing SQL logic
-
-dbt implements specific SQL operations using jinja macros. While reasonable defaults are provided for many such operations (like `create_schema`, `drop_schema`, `create_table`, etc), you may need to override one or more of these macros when building a new adapter.
-
-#### Required macros
-
-The following macros must be implemented, but you can override their behavior for your adapter using the "dispatch" pattern described below. Macros marked (required) do not have a valid default implementation, and are required for dbt to operate.
-
-- `alter_column_type` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L37-L55))
-- `check_schema_exists` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L43-L55))
-- `create_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L1-L9))
-- `drop_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L34-L42))
-- `drop_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L12-L20))
-- `get_columns_in_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L1-L8)) (required)
-- `list_relations_without_caching` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L58-L65)) (required)
-- `list_schemas` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L29-L40))
-- `rename_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L56-L65))
-- `truncate_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L45-L53))
-- `current_timestamp` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/freshness.sql#L1-L8)) (required)
-- `copy_grants`
-
-#### Adapter dispatch
-
-Most modern databases support a majority of the standard SQL spec. There are some databases that _do not_ support critical aspects of the SQL spec however, or they provide their own nonstandard mechanisms for implementing the same functionality. To account for these variations in SQL support, dbt provides a mechanism called [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch) for macros. With this feature, macros can be overridden for specific adapters. This makes it possible to implement high-level methods (like "create table") in a database-specific way.
-
-
-
-```jinja2
-
-{# dbt will call this macro by name, providing any arguments #}
-{% macro create_table_as(temporary, relation, sql) -%}
-
- {# dbt will dispatch the macro call to the relevant macro #}
- {{ return(
- adapter.dispatch('create_table_as')(temporary, relation, sql)
- ) }}
-{%- endmacro %}
-
-
-
-{# If no macro matches the specified adapter, "default" will be used #}
-{% macro default__create_table_as(temporary, relation, sql) -%}
- ...
-{%- endmacro %}
-
-
-
-{# Example which defines special logic for Redshift #}
-{% macro redshift__create_table_as(temporary, relation, sql) -%}
- ...
-{%- endmacro %}
-
-
-
-{# Example which defines special logic for BigQuery #}
-{% macro bigquery__create_table_as(temporary, relation, sql) -%}
- ...
-{%- endmacro %}
-```
-
-
-
-The `adapter.dispatch()` macro takes a second argument, `packages`, which represents a set of "search namespaces" in which to find potential implementations of a dispatched macro. This allows users of community-supported adapters to extend or "shim" dispatched macros from common packages, such as `dbt-utils`, with adapter-specific versions in their own project or other installed packages. See:
-- "Shim" package examples: [`spark-utils`](https://github.com/dbt-labs/spark-utils), [`tsql-utils`](https://github.com/dbt-msft/tsql-utils)
-- [`adapter.dispatch` docs](/reference/dbt-jinja-functions/dispatch)
-
-#### Overriding adapter methods
-
-While much of dbt's adapter-specific functionality can be modified in adapter macros, it can also make sense to override adapter methods directly. In this example, assume that a database does not support a `cascade` parameter to `drop schema`. Instead, we can implement an approximation where we drop each relation and then drop the schema.
-
-
-
-```python
- def drop_schema(self, relation: BaseRelation):
- relations = self.list_relations(
- database=relation.database,
- schema=relation.schema
- )
-        for rel in relations:
-            self.drop_relation(rel)
-        super().drop_schema(relation)
-```
-
-
-
-#### Grants Macros
-
-See [this GitHub discussion](https://github.com/dbt-labs/dbt-core/discussions/5468) for information on the macros required for `GRANT` statements.
-
-### Other files
-
-#### `profile_template.yml`
-
-In order to enable the [`dbt init` command](/reference/commands/init) to prompt users when setting up a new project and connection profile, you should include a **profile template**. The filepath needs to be `dbt/include//profile_template.yml`. It's possible to provide hints, default values, and conditional prompts based on connection methods that require different supporting attributes. Users will also be able to include custom versions of this file in their own projects, with fixed values specific to their organization, to support their colleagues when using your dbt adapter for the first time.
-
-See examples:
-- [dbt-postgres](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml)
-- [dbt-redshift](https://github.com/dbt-labs/dbt-redshift/blob/main/dbt/include/redshift/profile_template.yml)
-- [dbt-snowflake](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/include/snowflake/profile_template.yml)
-- [dbt-bigquery](https://github.com/dbt-labs/dbt-bigquery/blob/main/dbt/include/bigquery/profile_template.yml)
-
-#### `__version__.py`
-
-To assure that `dbt --version` provides the latest dbt core version the adapter supports, be sure to include a `__version__.py` file. The filepath will be `dbt/adapters//__version__.py`. We recommend using the latest dbt core version, and as the adapter is made compatible with later versions, this file will need to be updated. For a sample file, check out this [example](https://github.com/dbt-labs/dbt-snowflake/blob/main/dbt/adapters/snowflake/__version__.py).
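-
-A minimal sketch of the file (the version string below is only an assumption; use the dbt-core minor version your adapter actually supports):
-
-```python
-# dbt/adapters/myadapter/__version__.py -- a minimal sketch; "myadapter" and the
-# version string are illustrative assumptions
-version = "1.6.0"
-```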
-
-Note that both of these files are included in the bootstrapped output of the `dbt-database-adapter-scaffold`, so if you used the scaffolding, they are already in place.
-
-## Testing your new adapter
-
-This has moved to its own page: ["Testing a new adapter"](4-testing-a-new-adapter)
-
-## Documenting your new adapter
-
-This has moved to its own page: ["Documenting a new adapter"](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter)
-
-## Maintaining your new adapter
-
-This has moved to a new spot: ["Maintaining your new adapter"](2-prerequisites-for-a-new-adapter#maintaining-your-new-adapter)
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md
deleted file mode 100644
index b1b5072670a..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md
+++ /dev/null
@@ -1,499 +0,0 @@
----
-title: "Testing a new adapter"
-id: "4-testing-a-new-adapter"
----
-
-:::info
-
-Previously, we offered a packaged suite of tests for dbt adapter functionality: [`pytest-dbt-adapter`](https://github.com/dbt-labs/dbt-adapter-tests). We are deprecating that suite, in favor of the newer testing framework outlined in this document.
-
-:::
-
-This document has two sections:
-
-1. "[About the testing framework](#about-the-testing-framework)" describes the standard framework that we maintain for using pytest together with dbt. It includes an example that shows the anatomy of a simple test case.
-2. "[Testing your adapter](#testing-your-adapter)" offers a step-by-step guide for using our out-of-the-box suite of "basic" tests, which will validate that your adapter meets a baseline of dbt functionality.
-
-## Prerequisites
-
-- Your adapter must be compatible with dbt-core **v1.1** or newer
-- You should be familiar with **pytest**: https://docs.pytest.org/
-
-## About the testing framework
-
-dbt-core offers a standard framework for running pre-built functional tests, and for defining your own tests. The core testing framework is built using `pytest`, a mature and standard library for testing Python projects.
-
-The **[`tests` module](https://github.com/dbt-labs/dbt-core/tree/HEAD/core/dbt/tests)** within `dbt-core` includes basic utilities for setting up pytest + dbt. These are used by all "pre-built" functional tests, and make it possible to quickly write your own tests.
-
-Those utilities allow you to do three basic things:
-1. **Quickly set up a dbt "project."** Define project resources via methods such as `models()` and `seeds()`. Use `project_config_update()` to pass configurations into `dbt_project.yml`.
-2. **Define a sequence of dbt commands.** The most important utility is `run_dbt()`, which returns the [results](/reference/dbt-classes#result-objects) of each dbt command. It takes a list of CLI specifiers (subcommand + flags), as well as an optional second argument, `expect_pass=False`, for cases where you expect the command to fail.
-3. **Validate the results of those dbt commands.** For example, `check_relations_equal()` asserts that two database objects have the same structure and content. You can also write your own `assert` statements, by inspecting the results of a dbt command, or querying arbitrary database objects with `project.run_sql()`.
-
-You can see the full suite of utilities, with arguments and annotations, in [`util.py`](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/tests/util.py). You'll also see them crop up across a number of test cases. While all utilities are intended to be reusable, you won't need all of them for every test. In the example below, we'll show a simple test case that uses only a few utilities.
-
-### Example: a simple test case
-
-This example will show you the anatomy of a test case using dbt + pytest. We will create reusable components, combine them to form a dbt "project", and define a sequence of dbt commands. Then, we'll use Python `assert` statements to ensure those commands succeed (or fail) as we expect.
-
-In ["Getting started running basic tests,"](#getting-started-running-basic-tests) we'll offer step-by-step instructions for installing and configuring `pytest`, so that you can run it on your own machine. For now, it's more important to see how the pieces of a test case fit together.
-
-This example includes a seed, a model, and two tests—one of which will fail.
-
-1. Define Python strings that will represent the file contents in your dbt project. Defining these in a separate file enables you to reuse the same components across different test cases. The pytest name for this type of reusable component is "fixture."
-
-
-
-```python
-# seeds/my_seed.csv
-my_seed_csv = """
-id,name,some_date
-1,Easton,1981-05-20T06:46:51
-2,Lillian,1978-09-03T18:10:33
-3,Jeremiah,1982-03-11T03:59:51
-4,Nolan,1976-05-06T20:21:35
-""".lstrip()
-
-# models/my_model.sql
-my_model_sql = """
-select * from {{ ref('my_seed') }}
-union all
-select null as id, null as name, null as some_date
-"""
-
-# models/my_model.yml
-my_model_yml = """
-version: 2
-models:
- - name: my_model
- columns:
- - name: id
- tests:
- - unique
- - not_null # this test will fail
-"""
-```
-
-
-
-2. Use the "fixtures" to define the project for your test case. These fixtures are always scoped to the **class**, where the class represents one test case—that is, one dbt project or scenario. (The same test case can be used for one or more actual tests, which we'll see in step 3.) Following the default pytest configurations, the file name must begin with `test_`, and the class name must begin with `Test`.
-
-
-
-```python
-import pytest
-from dbt.tests.util import run_dbt
-
-# our file contents
-from tests.functional.example.fixtures import (
- my_seed_csv,
- my_model_sql,
- my_model_yml,
-)
-
-# class must begin with 'Test'
-class TestExample:
- """
- Methods in this class will be of two types:
- 1. Fixtures defining the dbt "project" for this test case.
- These are scoped to the class, and reused for all tests in the class.
- 2. Actual tests, whose names begin with 'test_'.
- These define sequences of dbt commands and 'assert' statements.
- """
-
- # configuration in dbt_project.yml
- @pytest.fixture(scope="class")
- def project_config_update(self):
- return {
- "name": "example",
- "models": {"+materialized": "view"}
- }
-
- # everything that goes in the "seeds" directory
- @pytest.fixture(scope="class")
- def seeds(self):
- return {
- "my_seed.csv": my_seed_csv,
- }
-
- # everything that goes in the "models" directory
- @pytest.fixture(scope="class")
- def models(self):
- return {
- "my_model.sql": my_model_sql,
- "my_model.yml": my_model_yml,
- }
-
- # continues below
-```
-
-
-
-3. Now that we've set up our project, it's time to define a sequence of dbt commands and assertions. We define one or more methods in the same file, on the same class (`TestExample`), whose names begin with `test_`. These methods share the same setup (project scenario) from above, but they can be run independently by pytest—so they shouldn't depend on each other in any way.
-
-
-
-```python
- # continued from above
-
- # The actual sequence of dbt commands and assertions
- # pytest will take care of all "setup" + "teardown"
- def test_run_seed_test(self, project):
- """
- Seed, then run, then test. We expect one of the tests to fail
- An alternative pattern is to use pytest "xfail" (see below)
- """
- # seed seeds
- results = run_dbt(["seed"])
- assert len(results) == 1
- # run models
- results = run_dbt(["run"])
- assert len(results) == 1
- # test tests
- results = run_dbt(["test"], expect_pass = False) # expect failing test
- assert len(results) == 2
- # validate that the results include one pass and one failure
- result_statuses = sorted(r.status for r in results)
- assert result_statuses == ["fail", "pass"]
-
- @pytest.mark.xfail
- def test_build(self, project):
- """Expect a failing test"""
- # do it all
- results = run_dbt(["build"])
-```
-
-
-
-4. Our test is ready to run! The last step is to invoke `pytest` from your command line. We'll walk through the actual setup and configuration of `pytest` in the next section.
-
-
-
-```sh
-$ python3 -m pytest tests/functional/test_example.py
-=========================== test session starts ============================
-platform ... -- Python ..., pytest-..., pluggy-...
-rootdir: ...
-plugins: ...
-
-tests/functional/test_example.py .X [100%]
-
-======================= 1 passed, 1 xpassed in 1.38s =======================
-```
-
-
-
-You can find more ways to run tests, along with a full command reference, in the [pytest usage docs](https://docs.pytest.org/how-to/usage.html).
-
-We've found the `-s` flag (or `--capture=no`) helpful to print logs from the underlying dbt invocations, and to step into an interactive debugger if you've added one. You can also use environment variables to set [global dbt configs](/reference/global-configs/about-global-configs), such as `DBT_DEBUG` (to show debug-level logs).
-
-## Testing your adapter
-
-Anyone who installs `dbt-core`, and wishes to define their own test cases, can use the framework presented in the first section. The framework is especially useful for testing standard dbt behavior across different databases.
-
-To that end, we have built and made available a [package of reusable adapter test cases](https://github.com/dbt-labs/dbt-core/tree/HEAD/tests/adapter), for creators and maintainers of adapter plugins. These test cases cover basic expected functionality, as well as functionality that frequently requires different implementations across databases.
-
-For the time being, this package is also located within the `dbt-core` repository, but separate from the `dbt-core` Python package.
-
-### Categories of tests
-
-In the course of creating and maintaining your adapter, it's likely that you will end up implementing tests that fall into three broad categories:
-
-1. **Basic tests** that every adapter plugin is expected to pass. These are defined in `tests.adapter.basic`. Given differences across data platforms, these may require slight modification or reimplementation. Significantly overriding or disabling these tests should only be done with good reason, since each represents basic functionality expected by dbt users. For example, if your adapter does not support incremental models, you should disable the test [by marking it with `skip` or `xfail`](https://docs.pytest.org/en/latest/how-to/skipping.html) (see the sketch after this list), as well as noting that limitation in any documentation, READMEs, and usage guides that accompany your adapter.
-
-2. **Optional tests**, for second-order functionality that is common across plugins, but not required for basic use. Your plugin can opt into these test cases by inheriting existing ones, or reimplementing them with adjustments. For now, this category includes all tests located outside the `basic` subdirectory. More tests will be added as we convert older tests defined on dbt-core and mature plugins to use the standard framework.
-
-3. **Custom tests**, for behavior that is specific to your adapter / data platform. Each has its own specialties and idiosyncrasies. We encourage you to use the same `pytest`-based framework, utilities, and fixtures to write your own custom tests for functionality that is unique to your adapter.
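-
-Here's a minimal sketch of what disabling a basic test case can look like, assuming a hypothetical `dbt-myadapter` plugin that doesn't support incremental models:
-
-```python
-import pytest
-
-from dbt.tests.adapter.basic.test_incremental import BaseIncremental
-
-
-# hypothetical example: skip the inherited basic test because this
-# adapter does not support incremental models
-@pytest.mark.skip(reason="dbt-myadapter does not support incremental models")
-class TestIncrementalMyAdapter(BaseIncremental):
-    pass
-```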
-
-If you run into an issue with the core framework, or the basic/optional test cases—or if you've written a custom test that you believe would be relevant and useful for other adapter plugin developers—please open an issue or PR in the `dbt-core` repository on GitHub.
-
-## Getting started running basic tests
-
-In this section, we'll walk through the three steps to start running our basic test cases on your adapter plugin:
-
-1. Install dependencies
-2. Set up and configure pytest
-3. Define test cases
-
-### Install dependencies
-
-You should already have a virtual environment with `dbt-core` and your adapter plugin installed. You'll also need to install:
-- [`pytest`](https://pypi.org/project/pytest/)
-- [`dbt-tests-adapter`](https://pypi.org/project/dbt-tests-adapter/), the set of common test cases
-- (optional) [`pytest` plugins](https://docs.pytest.org/en/7.0.x/reference/plugin_list.html)--we'll use `pytest-dotenv` below
-
-Or specify all dependencies in a requirements file like:
-
-
-```txt
-pytest
-pytest-dotenv
-dbt-tests-adapter
-```
-
-
-```sh
-pip install -r dev_requirements.txt
-```
-
-### Set up and configure pytest
-
-First, set yourself up to run `pytest` by creating a file named `pytest.ini` at the root of your repository:
-
-
-
-```python
-[pytest]
-filterwarnings =
- ignore:.*'soft_unicode' has been renamed to 'soft_str'*:DeprecationWarning
- ignore:unclosed file .*:ResourceWarning
-env_files =
- test.env # uses pytest-dotenv plugin
- # this allows you to store env vars for database connection in a file named test.env
- # rather than passing them in every CLI command, or setting in `PYTEST_ADDOPTS`
- # be sure to add "test.env" to .gitignore as well!
-testpaths =
- tests/functional # name per convention
-```
-
-
-
-Then, create a configuration file within your tests directory. In it, you'll want to define all necessary profile configuration for connecting to your data platform in local development and continuous integration. We recommend setting these values with environment variables, since this file will be checked into version control.
-
-
-
-```python
-import pytest
-import os
-
-# Import the standard functional fixtures as a plugin
-# Note: fixtures with session scope need to be local
-pytest_plugins = ["dbt.tests.fixtures.project"]
-
-# The profile dictionary, used to write out profiles.yml
-# dbt will supply a unique schema per test, so we do not specify 'schema' here
-@pytest.fixture(scope="class")
-def dbt_profile_target():
- return {
- 'type': '',
- 'threads': 1,
- 'host': os.getenv('HOST_ENV_VAR_NAME'),
- 'user': os.getenv('USER_ENV_VAR_NAME'),
- ...
- }
-```
-
-
-
-### Define test cases
-
-As in the example above, each test case is defined as a class, and has its own "project" setup. To get started, you can import all basic test cases and try running them without changes.
-
-
-
-```python
-import pytest
-
-from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations
-from dbt.tests.adapter.basic.test_singular_tests import BaseSingularTests
-from dbt.tests.adapter.basic.test_singular_tests_ephemeral import BaseSingularTestsEphemeral
-from dbt.tests.adapter.basic.test_empty import BaseEmpty
-from dbt.tests.adapter.basic.test_ephemeral import BaseEphemeral
-from dbt.tests.adapter.basic.test_incremental import BaseIncremental
-from dbt.tests.adapter.basic.test_generic_tests import BaseGenericTests
-from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols
-from dbt.tests.adapter.basic.test_snapshot_timestamp import BaseSnapshotTimestamp
-from dbt.tests.adapter.basic.test_adapter_methods import BaseAdapterMethod
-
-class TestSimpleMaterializationsMyAdapter(BaseSimpleMaterializations):
- pass
-
-
-class TestSingularTestsMyAdapter(BaseSingularTests):
- pass
-
-
-class TestSingularTestsEphemeralMyAdapter(BaseSingularTestsEphemeral):
- pass
-
-
-class TestEmptyMyAdapter(BaseEmpty):
- pass
-
-
-class TestEphemeralMyAdapter(BaseEphemeral):
- pass
-
-
-class TestIncrementalMyAdapter(BaseIncremental):
- pass
-
-
-class TestGenericTestsMyAdapter(BaseGenericTests):
- pass
-
-
-class TestSnapshotCheckColsMyAdapter(BaseSnapshotCheckCols):
- pass
-
-
-class TestSnapshotTimestampMyAdapter(BaseSnapshotTimestamp):
- pass
-
-
-class TestBaseAdapterMethod(BaseAdapterMethod):
- pass
-```
-
-
-
-
-Finally, run pytest:
-```sh
-python3 -m pytest tests/functional
-```
-
-### Modifying test cases
-
-You may need to make slight modifications to a specific test case to get it passing on your adapter. The mechanism to do this is simple: rather than inheriting the "base" test with `pass`, you can redefine any of its fixtures or test methods.
-
-For instance, on Redshift, we need to explicitly cast a column in the fixture input seed to use data type `varchar(64)`:
-
-
-
-```python
-import pytest
-from dbt.tests.adapter.basic.files import seeds_base_csv, seeds_added_csv, seeds_newcolumns_csv
-from dbt.tests.adapter.basic.test_snapshot_check_cols import BaseSnapshotCheckCols
-
-# set the datatype of the name column in the 'added' seed so it
-# can hold the '_update' that's added
-schema_seed_added_yml = """
-version: 2
-seeds:
- - name: added
- config:
- column_types:
- name: varchar(64)
-"""
-
-class TestSnapshotCheckColsRedshift(BaseSnapshotCheckCols):
- # Redshift defines the 'name' column such that it's not big enough
- # to hold the '_update' added in the test.
- @pytest.fixture(scope="class")
- def models(self):
- return {
- "base.csv": seeds_base_csv,
- "added.csv": seeds_added_csv,
- "seeds.yml": schema_seed_added_yml,
- }
-```
-
-
-
-As another example, the `dbt-bigquery` adapter asks users to "authorize" replacing an existing view with a table by supplying the `--full-refresh` flag. The reason: in the table materialization logic, a view by the same name must first be dropped; if the table query then fails, the model will be missing.
-
-Knowing this possibility, the "base" test case offers a `require_full_refresh` switch on the `test_config` fixture class. For BigQuery, we'll switch it on:
-
-
-
-```python
-import pytest
-from dbt.tests.adapter.basic.test_base import BaseSimpleMaterializations
-
-class TestSimpleMaterializationsBigQuery(BaseSimpleMaterializations):
- @pytest.fixture(scope="class")
- def test_config(self):
- # effect: add '--full-refresh' flag in requisite 'dbt run' step
- return {"require_full_refresh": True}
-```
-
-
-
-It's always worth asking whether the required modifications represent gaps in perceived or expected dbt functionality. Are these simple implementation details, which any user of this database would understand? Are they limitations worth documenting?
-
-If, on the other hand, they represent poor assumptions in the "basic" test cases, which fail to account for a common pattern in other types of databases, please open an issue or PR in the `dbt-core` repository on GitHub.
-
-### Running with multiple profiles
-
-Some databases support multiple connection methods, which map to meaningfully different functionality behind the scenes. For instance, the `dbt-spark` adapter supports connections to Apache Spark clusters _and_ Databricks runtimes, the latter of which supports additional functionality out of the box, enabled by the Delta file format. You can handle this in your `conftest.py` by adding a `--profile` command-line option and profile-specific fixtures, as in the example below:
-
-
-
-```python
-import os
-
-import pytest
-
-
-def pytest_addoption(parser):
- parser.addoption("--profile", action="store", default="apache_spark", type=str)
-
-
-# Using @pytest.mark.skip_profile('apache_spark') uses the 'skip_by_profile_type'
-# autouse fixture below
-def pytest_configure(config):
- config.addinivalue_line(
- "markers",
- "skip_profile(profile): skip test for the given profile",
- )
-
-@pytest.fixture(scope="session")
-def dbt_profile_target(request):
-    profile_type = request.config.getoption("--profile")
-    if profile_type == "databricks_sql_endpoint":
- target = databricks_sql_endpoint_target()
- elif profile_type == "apache_spark":
- target = apache_spark_target()
- else:
- raise ValueError(f"Invalid profile type '{profile_type}'")
- return target
-
-def apache_spark_target():
- return {
- "type": "spark",
- "host": "localhost",
- ...
- }
-
-def databricks_sql_endpoint_target():
- return {
- "type": "spark",
- "host": os.getenv("DBT_DATABRICKS_HOST_NAME"),
- ...
- }
-
-@pytest.fixture(autouse=True)
-def skip_by_profile_type(request):
- profile_type = request.config.getoption("--profile")
- if request.node.get_closest_marker("skip_profile"):
- for skip_profile_type in request.node.get_closest_marker("skip_profile").args:
- if skip_profile_type == profile_type:
-                pytest.skip(f"skipped on '{profile_type}' profile")
-```
-
-
-
-If there are tests that _shouldn't_ run for a given profile, use the `skip_profile` marker defined above:
-
-
-
-```python
-# Snapshots require access to the Delta file format, available on our Databricks connection,
-# so let's skip on Apache Spark
-@pytest.mark.skip_profile('apache_spark')
-class TestSnapshotCheckColsSpark(BaseSnapshotCheckCols):
- @pytest.fixture(scope="class")
- def project_config_update(self):
- return {
- "seeds": {
- "+file_format": "delta",
- },
- "snapshots": {
- "+file_format": "delta",
- }
- }
-```
-
-
-
-Finally, run the test suite once per profile:
-```sh
-python3 -m pytest tests/functional --profile apache_spark
-python3 -m pytest tests/functional --profile databricks_sql_endpoint
-```
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md
deleted file mode 100644
index f8335dfcbc4..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md
+++ /dev/null
@@ -1,65 +0,0 @@
----
-title: "Documenting a new adapter"
-id: "5-documenting-a-new-adapter"
----
-
-If you've already [built](3-building-a-new-adapter), and [tested](4-testing-a-new-adapter) your adapter, it's time to document it so the dbt community will know that it exists and how to use it.
-
-## Making your adapter available
-
-Many community members maintain their adapter plugins under open source licenses. If you're interested in doing this, we recommend:
-- Hosting on a public git provider (for example, GitHub or GitLab)
-- Publishing to [PyPI](https://pypi.org/)
-- Adding to the list of ["Supported Data Platforms"](/docs/supported-data-platforms#community-supported) (more info below)
-
-## General Guidelines
-
-To best inform the dbt community of the new adapter, you should contribute to dbt's open-source documentation site, which uses the [Docusaurus project](https://docusaurus.io/). This is the site you're currently on!
-
-### Conventions
-
-Each `.md` file you create needs a header as shown below. The document id will also need to be added to the config file: `website/sidebars.js`.
-
-```md
----
-title: "Documenting a new adapter"
-id: "documenting-a-new-adapter"
----
-```
-
-### Single Source of Truth
-
-We ask our adapter maintainers to use the [docs.getdbt.com repo](https://github.com/dbt-labs/docs.getdbt.com) (i.e. this site) as the single-source-of-truth for documentation rather than having to maintain the same set of information in three different places. The adapter repo's `README.md` and the data platform's documentation pages should simply link to the corresponding page on this docs site. Keep reading for more information on what should and shouldn't be included on the dbt docs site.
-
-### Assumed Knowledge
-
-To simplify things, assume the reader of this documentation already knows how both dbt and your data platform work. There's already great material out there for learning both dbt and the data platform. The documentation we're asking you to add should be what a user who is already proficient in both dbt and your data platform would need to know in order to use both. Effectively, that boils down to two things: how to connect, and how to configure.
-
-
-## Topics and Pages to Cover
-
-
-The following subjects need to be addressed across three pages of this docs site for your data platform to be listed on our documentation. After the corresponding pull request is merged, we ask that you link to these pages from your adapter repo's `README` as well as from your product documentation.
-
- To contribute, all you have to do is make the changes listed in the table below.
-
-
-
-
-| How To... | File to change within `/website/docs/` | Action | Info to Include |
-|----------------------|--------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| Connect | `/docs/core/connect-data-platform/{MY-DATA-PLATFORM}-setup.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. |
-| Configure | `reference/resource-configs/{MY-DATA-PLATFORM}-configs.md` | Create | What options and configuration specific to your data platform do users need to know? e.g. table distribution and indexing options, column_quoting policy, which incremental strategies are supported |
-| Discover and Install | `docs/supported-data-platforms.md` | Modify | Is it a vendor- or community- supported adapter? How to install Python adapter package? Ideally with pip and PyPI hosted package, but can also use `git+` link to GitHub Repo |
-| Add link to sidebar | `website/sidebars.js` | Modify | Add the document id to the correct location in the sidebar menu |
-
-For example, say I want to document my new adapter, `dbt-ders`. For the "Connect" page, I will make a new Markdown file, `ders-setup.md`, and add it to the `/website/docs/core/connect-data-platform/` directory.
-
-
-## Example PRs to add new adapter documentation
-
-Below are some recent pull requests made by partners to document their data platform's adapter:
-
-- [TiDB](https://github.com/dbt-labs/docs.getdbt.com/pull/1309)
-- [SingleStore](https://github.com/dbt-labs/docs.getdbt.com/pull/1044)
-- [Firebolt](https://github.com/dbt-labs/docs.getdbt.com/pull/941)
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md
deleted file mode 100644
index 9bf2f949bef..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md
+++ /dev/null
@@ -1,120 +0,0 @@
----
-title: "Promoting a new adapter"
-id: "6-promoting-a-new-adapter"
----
-
-## Model for engagement in the dbt community
-
-The most important thing here is recognizing that people are successful in the community when they join, first and foremost, to engage authentically.
-
-What does authentic engagement look like? It’s challenging to define explicit rules. One good rule of thumb is to treat people with dignity and respect.
-
-Contributors to the community should think of contribution *as the end itself,* not a means toward other business KPIs (leads, community members, etc.). [We are a mission-driven company.](https://www.getdbt.com/dbt-labs/values/) Some ways to know if you’re authentically engaging:
-
-- Is the engagement's *primary* purpose to share knowledge and resources, or to build brand engagement?
-- Imagine you didn’t work at the org you do — can you imagine yourself still writing this?
-- Is it written in formal / marketing language, or does it sound like you, the human?
-
-## Who should join the dbt community slack
-
-### People who have insight into what it means to do hands-on [analytics engineering](https://www.getdbt.com/analytics-engineering/) work
-
-The dbt Community Slack workspace is fundamentally a place for analytics practitioners to interact with each other — the closer the users are in the community to actual data/analytics engineering work, the more natural their engagement will be (leading to better outcomes for partners and the community).
-
-### DevRel practitioners with strong focus
-
-DevRel practitioners often have a strong analytics background and a good understanding of the community. It's essential to be sure they are focused on *contributing,* not on driving community metrics for their partner org (such as signing people up for their Slack or events). The metrics will rise naturally through authentic engagement.
-
-### Founder and executives who are interested in directly engaging with the community
-
-This either works incredibly well or not at all, depending on the profile of the founder. Typically, it works best when the founder has practitioner-level technical understanding and is interested in joining not to promote, but to learn and hear from users.
-
-### Software Engineers at partner products that are building and supporting integrations with either dbt Core or dbt Cloud
-
-This is successful when the engineers are familiar with dbt as a product or have at least taken our training course. The Slack is often a place where end-user questions and feedback are initially shared, so it is recommended that someone technical from the team be present. There are also a handful of channels aimed at those building integrations, which tend to be a fount of knowledge.
-
-### Who might struggle in the dbt community
-#### People in marketing roles
-dbt Slack is not a marketing channel. Attempts to use it as such invariably fall flat and can even lead to people having a negative view of a product. This doesn’t mean that dbt can’t serve marketing objectives, but a long-term commitment to engagement is the only proven method to do this sustainably.
-
-#### People in product roles
-The dbt Community can be an invaluable source of feedback on a product. There are two primary ways this can happen — organically (community members proactively suggesting a new feature) and via direct calls for feedback and user research. Immediate calls for engagement must be done in your dedicated #tools channel. Direct calls should be used sparingly, as they can overwhelm more organic discussions and feedback.
-
-## Who is the audience for an adapter release
-
-A new adapter is likely to drive huge community interest from several groups of people:
-- People who are currently using the database that the adapter is supporting
-- People who may be adopting the database in the near future.
-- People who are interested in dbt development in general.
-
-The database users will be your primary audience and the most helpful in achieving success. Engage them directly in the adapter’s dedicated Slack channel. If one does not exist already, reach out in #channel-requests, and we will get one made for you and include it in an announcement about new channels.
-
-The final group is where non-Slack community engagement becomes important. Twitter and LinkedIn are both great places to interact with a broad audience. A well-orchestrated adapter release can generate impactful and authentic engagement.
-
-## How to message the initial rollout and follow-up content
-
-Tell a story that engages dbt users and the community. Highlight new use cases and functionality unlocked by the adapter in a way that will resonate with each segment.
-
-### Existing users of your technology who are new to dbt
-
-- Provide a general overview of the value dbt will deliver to your users. This can lean on dbt's messaging and talking points which are laid out in the [dbt viewpoint.](/community/resources/viewpoint)
- - Give examples of a rollout that speaks to the overall value of dbt and your product.
-
-### Users who are already familiar with dbt and the community
-- Consider unique use cases or advantages your adapter provides over existing adapters. Who will be excited for this?
-- Contribute to the dbt Community and ensure that dbt users on your adapter are well supported (tutorial content, packages, documentation, etc).
-- Example of a rollout that is compelling for those familiar with dbt: [Firebolt](https://www.linkedin.com/feed/update/urn:li:activity:6879090752459182080/)
-
-## Tactically manage distribution of content about new or existing adapters
-
-There are tactical pieces on how and where to share that help ensure success.
-
-### On slack:
-- #i-made-this channel — this channel has a policy against “marketing” and “content marketing” posts, but it should be successful if you write your content with the above guidelines in mind. Even with that, it’s important to post here sparingly.
-- Your own database / tool channel — this is where people have opted in to receive communications from you, and it's always a great place to share things that are relevant to them.
-
-### On social media:
-- Twitter
-- LinkedIn
-- Social media posts *from the author* or an individual connected to the project tend to have better engagement than posts from a company or organization account.
-- Ask your partner representative about:
- - Retweets and shares from the official dbt Labs accounts.
- - Flagging posts internally at dbt Labs to get individual employees to share.
-
-## Measuring engagement
-
-You don’t need 1000 people in a channel to succeed, but you need at least a few active participants who can make it feel lived in. If you’re comfortable working in public, this could be members of your team, or it can be a few people who you know that are highly engaged and would be interested in participating. Having even 2 or 3 regulars hanging out in a channel is all that’s needed for a successful start and is, in fact, much more impactful than 250 people that never post.
-
-## How to announce a new adapter
-
-We’d recommend *against* boilerplate announcements and encourage finding a unique voice. That being said, there are a couple of things that we’d want to include:
-
-- A summary of the value prop of your database / technology for users who aren’t familiar.
-- The personas that might be interested in this news.
-- A description of what the adapter *is*. For example:
-  > With the release of our new dbt adapter, you'll be able to use dbt to model and transform your data in [name-of-your-org]
-- Particular or unique use cases or functionality unlocked by the adapter.
-- Plans for future / ongoing support / development.
-- The link to the documentation for using the adapter on the dbt Labs docs site.
-- An announcement blog.
-
-## Announcing new release versions of existing adapters
-
-This can vary substantially depending on the nature of the release but a good baseline is the types of release messages that [we put out in the #dbt-releases](https://getdbt.slack.com/archives/C37J8BQEL/p1651242161526509) channel.
-
-![Full Release Post](/img/adapter-guide/0-full-release-notes.png)
-
-Breaking this down:
-
-- Visually distinctive announcement - make it clear this is a release
-
-- Short written description of what is in the release
-
-- Links to additional resources
-
-- Implementation instructions
-
-- Future plans
-
-- Contributor recognition (if applicable)
-
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md
deleted file mode 100644
index 6310569dfad..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md
+++ /dev/null
@@ -1,41 +0,0 @@
----
-title: "Verifying a new adapter"
-id: "7-verifying-a-new-adapter"
----
-
-## Why verify an adapter?
-
-The very first data platform dbt supported was Redshift, followed quickly by Postgres ([dbt-core#174](https://github.com/dbt-labs/dbt-core/pull/174)). In 2017, back when dbt Labs (née Fishtown Analytics) was still a data consultancy, we added support for Snowflake and BigQuery. We also turned dbt's database support into an adapter framework ([dbt-core#259](https://github.com/dbt-labs/dbt-core/pull/259/)), and a plugin system a few years later. For years, dbt Labs specialized in those four data platforms and became experts in them. However, the surface area of all possible databases, their respective nuances, and keeping them up-to-date and bug-free is a Herculean and/or Sisyphean task that couldn't be done by a single person or even a single team! Enter the dbt community, which enables dbt Core to work on more than 30 different databases (32 as of Sep '22)!
-
-Free and open-source tools for the data professional are increasingly abundant. This is by-and-large a *good thing*, however it requires due diligence that wasn't required in a paid-license, closed-source software world. Before taking a dependency on an open-source project, it is important to determine the answer to the following questions:
-
-1. Does it work?
-2. Does it meet my team's specific use case?
-3. Does anyone "own" the code, or is anyone liable for ensuring it works?
-4. Do bugs get fixed quickly?
-5. Does it stay up-to-date with new Core features?
-6. Is the usage substantial enough to self-sustain?
-7. What risks do I take on by taking a dependency on this library?
-
-These are valid, important questions to answer—especially given that `dbt-core` itself only put out its first stable release (major version v1.0) in December 2021! Indeed, up until now, the majority of new user questions in database-specific channels are some form of:
-- "How mature is `dbt-`? Any gotchas I should be aware of before I start exploring?"
-- "has anyone here used `dbt-` for production models?"
-- "I've been playing with `dbt-` -- I was able to install and run my initial experiments. I noticed that there are certain features mentioned on the documentation that are marked as 'not ok' or 'not tested'. What are the risks?
-I'd love to make a statement on my team to adopt DBT [sic], but I'm pretty sure questions will be asked around the possible limitations of the adapter or if there are other companies out there using dbt [sic] with Oracle DB in production, etc."
-
-There has been a tendency to trust the dbt Labs-maintained adapters over community- and vendor-supported adapters, but repo ownership is only one among many indicators of software quality. We aim to help our users feel well-informed as to the caliber of an adapter with a new program.
-
-## Verified by dbt Labs
-
-The adapter verification program aims to quickly indicate to users which adapters can be trusted to use in production. Previously, doing so was uncharted territory for new users and complicated making the business case to their leadership team. We plan to give quality assurances by:
-1. appointing a key stakeholder for the adapter repository,
-2. ensuring that the chosen stakeholder fixes bugs and cuts new releases in a timely manner (see ["Maintaining your new adapter"](2-prerequisites-for-a-new-adapter#maintaining-your-new-adapter)),
-3. demonstrating that it passes our adapter pytest suite tests,
-4. assuring that it works for us internally and ideally an existing team using the adapter in production .
-
-
-Every major & minor version of an adapter will be verified internally and given an official :white_check_mark: (custom emoji coming soon) on the ["Supported Data Platforms"](/docs/supported-data-platforms) page.
-
-## How to get an adapter verified?
-
-We envision that data platform vendors will be most interested in having their adapter versions verified, however we are open to community adapter verification. If interested, please reach out to `partnerships` at `dbtlabs.com` or post in the [#adapter-ecosystem Slack channel](https://getdbt.slack.com/archives/C030A0UF5LM).
diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development b/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development
deleted file mode 100644
index 8b137891791..00000000000
--- a/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md
deleted file mode 100644
index b03cb2ca013..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md
+++ /dev/null
@@ -1,38 +0,0 @@
----
-title: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake"
-id: "1-overview-dbt-python-snowpark"
-description: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake"
----
-
-The focus of this workshop will be to demonstrate how we can use both *SQL and python together* in the same workflow to run *both analytics and machine learning models* on dbt Cloud.
-
-All code in today’s workshop can be found on [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1).
-
-## What you'll use during the lab
-
-- A [Snowflake account](https://trial.snowflake.com/) with ACCOUNTADMIN access
-- A [dbt Cloud account](https://www.getdbt.com/signup/)
-
-## What you'll learn
-
-- How to build scalable data transformation pipelines using dbt and Snowflake with SQL and Python
-- How to copy data into Snowflake from a public S3 bucket
-
-## What you need to know
-
-- Basic to intermediate SQL and Python.
-- Basic understanding of dbt fundamentals. We recommend the [dbt Fundamentals course](https://courses.getdbt.com/collections) if you're interested.
-- High-level understanding of the machine learning process (encoding, training, testing)
-- Simple ML algorithms — we will use logistic regression to keep the focus on the *workflow*, not algorithms!
-
-## What you'll build
-
-- A set of data analytics and prediction pipelines using Formula 1 data leveraging dbt and Snowflake, making use of best practices like data quality tests and code promotion between environments
-- We will create insights for:
- 1. Finding the lap time average and rolling average through the years (is it generally trending up or down)?
- 2. Which constructor has the fastest pit stops in 2021?
-    3. Predicting the position of each driver using a decade of data (2010 - 2020)
-
-As inputs, we are going to leverage Formula 1 datasets hosted on a dbt Labs public S3 bucket. We will create a Snowflake Stage for our CSV files then use Snowflake’s `COPY INTO` function to copy the data in from our CSV files into tables. The Formula 1 is available on [Kaggle](https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020). The data is originally compiled from the [Ergast Developer API](http://ergast.com/mrd/).
-
-Overall we are going to set up the environments, build scalable pipelines in dbt, establish data tests, and promote code to production.
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md
deleted file mode 100644
index 446981214e3..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md
+++ /dev/null
@@ -1,150 +0,0 @@
----
-title: "Python transformations!"
-id: "10-python-transformations"
-description: "Python transformations"
----
-
-Up until now, SQL has been driving the project (car pun intended) for data cleaning and hierarchical joining. Now it’s time for Python to take the wheel (car pun still intended) for the rest of our lab! For more information about running Python models in dbt, check out our [docs](/docs/build/python-models). To learn more about how dbt Python models work under the hood, check out [Snowpark for Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html), which makes running dbt Python models possible.
-
-There are quite a few differences between SQL and Python in terms of the dbt syntax and DDL, so we’ll be breaking our code and model runs down further for our Python models.
-
-## Pit stop analysis
-
-First, we want to find out: which constructor had the fastest pit stops in 2021? (constructor is a Formula 1 team that builds or “constructs” the car).
-
-1. Create a new file called `fastest_pit_stops_by_constructor.py` in our `aggregates` folder (this is the first time we are using the `.py` extension!).
-2. Copy the following code into the file:
- ```python
- import numpy as np
- import pandas as pd
-
- def model(dbt, session):
- # dbt configuration
- dbt.config(packages=["pandas","numpy"])
-
- # get upstream data
- pit_stops_joined = dbt.ref("pit_stops_joined").to_pandas()
-
- # provide year so we do not hardcode dates
- year=2021
-
- # describe the data
- pit_stops_joined["PIT_STOP_SECONDS"] = pit_stops_joined["PIT_STOP_MILLISECONDS"]/1000
- fastest_pit_stops = pit_stops_joined[(pit_stops_joined["RACE_YEAR"]==year)].groupby(by="CONSTRUCTOR_NAME")["PIT_STOP_SECONDS"].describe().sort_values(by='mean')
- fastest_pit_stops.reset_index(inplace=True)
- fastest_pit_stops.columns = fastest_pit_stops.columns.str.upper()
-
- return fastest_pit_stops.round(2)
- ```
-
-3. Let’s break down what this code is doing step by step:
-    - First, we are importing the Python libraries that we are using. A *library* is a reusable chunk of code that someone else wrote that you may want to include in your programs/projects. We are using `numpy` and `pandas` in this Python model. This is similar to a dbt *package*, but our Python libraries do *not* persist across the entire project.
-    - Defining a function called `model` with the parameters `dbt` and `session`. The parameter `dbt` is a class compiled by dbt, which enables you to run your Python code in the context of your dbt project and DAG. The parameter `session` is a class representing your Snowflake connection to the Python backend. The `model` function *must return a single DataFrame*. You can see that all the data transformation happening is within the body of the `model` function that the `return` statement is tied to.
- - Then, within the context of our dbt model library, we are passing in a configuration of which packages we need using `dbt.config(packages=["pandas","numpy"])`.
- - Use the `.ref()` function to retrieve the data frame `pit_stops_joined` that we created in our last step using SQL. We cast this to a pandas dataframe (by default it's a Snowpark Dataframe).
- - Create a variable named `year` so we aren’t passing a hardcoded value.
- - Generate a new column called `PIT_STOP_SECONDS` by dividing the value of `PIT_STOP_MILLISECONDS` by 1000.
-    - Create our final data frame `fastest_pit_stops` that holds the records where year is equal to our year variable (2021 in this case), then group the data frame by `CONSTRUCTOR_NAME` and use the `describe()` and `sort_values()` methods to sort by the mean pit stop time. This will make the first row in the new aggregated data frame the team with the fastest pit stops over an entire competition year.
- - Finally, it resets the index of the `fastest_pit_stops` data frame. The `reset_index()` method allows you to reset the index back to the default 0, 1, 2, etc indexes. By default, this method will keep the "old" indexes in a column named "index"; to avoid this, use the drop parameter. Think of this as keeping your data “flat and square” as opposed to “tiered”. If you are new to Python, now might be a good time to [learn about indexes for 5 minutes](https://towardsdatascience.com/the-basics-of-indexing-and-slicing-python-lists-2d12c90a94cf) since it's the foundation of how Python retrieves, slices, and dices data. The `inplace` argument means we override the existing data frame permanently. Not to fear! This is what we want to do to avoid dealing with multi-indexed dataframes!
- - Convert our Python column names to all uppercase using `.upper()`, so Snowflake recognizes them.
- - Finally we are returning our dataframe with 2 decimal places for all the columns using the `round()` method.
-4. Zooming out a bit, what are we doing differently here in Python from our typical SQL code:
- - Method chaining is a technique in which multiple methods are called on an object in a single statement, with each method call modifying the result of the previous one. The methods are called in a chain, with the output of one method being used as the input for the next one. The technique is used to simplify the code and make it more readable by eliminating the need for intermediate variables to store the intermediate results.
- - The way you see method chaining in Python is the syntax `.().()`. For example, `.describe().sort_values(by='mean')` where the `.describe()` method is chained to `.sort_values()`.
- - The `.describe()` method is used to generate various summary statistics of the dataset. It's used on pandas dataframe. It gives a quick and easy way to get the summary statistics of your dataset without writing multiple lines of code.
- - The `.sort_values()` method is used to sort a pandas dataframe or a series by one or multiple columns. The method sorts the data by the specified column(s) in ascending or descending order. It is the pandas equivalent to `order by` in SQL.
-
- We won’t go as in depth for our subsequent scripts, but will continue to explain at a high level what new libraries, functions, and methods are doing.
-
-5. Build the model using the UI which will **execute**:
- ```bash
- dbt run --select fastest_pit_stops_by_constructor
- ```
- in the command bar.
-
-    Let’s look at some details of our first Python model to see what our model executed. There are two major differences we can see while running a Python model compared to an SQL model:
-
- - Our Python model was executed as a stored procedure. Snowflake needs a way to know that it's meant to execute this code in a Python runtime, instead of interpreting in a SQL runtime. We do this by creating a Python stored proc, called by a SQL command.
-    - The `snowflake-snowpark-python` library has been picked up to execute our Python code. Even though this wasn’t explicitly stated, it's picked up by the dbt class object because we need the Snowpark package to run Python!
-
- Python models take a bit longer to run than SQL models, however we could always speed this up by using [Snowpark-optimized Warehouses](https://docs.snowflake.com/en/user-guide/warehouses-snowpark-optimized.html) if we wanted to. Our data is sufficiently small, so we won’t worry about creating a separate warehouse for Python versus SQL files today.
-
-
- The rest of our **Details** output gives us information about how dbt and Snowpark for Python are working together to define class objects and apply a specific set of methods to run our models.
-
- So which constructor had the fastest pit stops in 2021? Let’s look at our data to find out!
-
-6. We can't preview Python models directly, so let’s create a new file using the **+** button or the Control-n shortcut to create a new scratchpad.
-7. Reference our Python model:
- ```sql
- select * from {{ ref('fastest_pit_stops_by_constructor') }}
- ```
- and preview the output:
-
-
-    Not only did Red Bull have the fastest average pit stops by nearly 40 seconds, they also had the smallest standard deviation, meaning they are both the fastest and most consistent team in pit stops. By using the `.describe()` method we were able to avoid verbose SQL that would require us to write a line of code per column and repetitively use the `PERCENTILE_CONT()` function.
-
- Now we want to find the lap time average and rolling average through the years (is it generally trending up or down)?
-
-8. Create a new file called `lap_times_moving_avg.py` in our `aggregates` folder.
-9. Copy the following code into the file:
- ```python
- import pandas as pd
-
- def model(dbt, session):
- # dbt configuration
- dbt.config(packages=["pandas"])
-
- # get upstream data
- lap_times = dbt.ref("int_lap_times_years").to_pandas()
-
- # describe the data
- lap_times["LAP_TIME_SECONDS"] = lap_times["LAP_TIME_MILLISECONDS"]/1000
- lap_time_trends = lap_times.groupby(by="RACE_YEAR")["LAP_TIME_SECONDS"].mean().to_frame()
- lap_time_trends.reset_index(inplace=True)
- lap_time_trends["LAP_MOVING_AVG_5_YEARS"] = lap_time_trends["LAP_TIME_SECONDS"].rolling(5).mean()
- lap_time_trends.columns = lap_time_trends.columns.str.upper()
-
- return lap_time_trends.round(1)
- ```
-
-10. Breaking down our code a bit:
- - We’re only using the `pandas` library for this model and casting it to a pandas data frame `.to_pandas()`.
-    - Generate a new column called `LAP_TIME_SECONDS` by dividing the value of `LAP_TIME_MILLISECONDS` by 1000.
- - Create the final dataframe. Get the lap time per year. Calculate the mean series and convert to a data frame.
- - Reset the index.
- - Calculate the rolling 5 year mean.
- - Round our numeric columns to one decimal place.
-11. Now, run this model by using the UI **Run model** or
- ```bash
- dbt run --select lap_times_moving_avg
- ```
- in the command bar.
-
-12. Once again, preview the output of our data using the same steps as for our `fastest_pit_stops_by_constructor` model.
-
-
- We can see that it looks like lap times are getting consistently faster over time. Then in 2010 we see an increase occur! Using outside subject matter context, we know that significant rule changes were introduced to Formula 1 in 2010 and 2011 causing slower lap times.
-
-13. Now is a good time to checkpoint and commit our work to Git. Click **Commit and push** and give your commit a message like `aggregate python models` before moving on.
-
-## The dbt model, .source(), .ref() and .config() functions
-
-Let’s take a step back before starting machine learning to both review and go more in-depth on the methods that make running dbt Python models possible. If you want to know more beyond this lab’s explanation, read the documentation [here](/docs/build/python-models?version=1.3). A short sketch tying these pieces together follows the list below.
-
-- The `model(dbt, session)` function. For starters, each Python model lives in a `.py` file in your `models/` folder. It defines a function named `model()`, which takes two parameters:
- - dbt — A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG.
- - session — A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames and to write DataFrames back to tables. In PySpark, by convention, the SparkSession is named spark, and available globally. For consistency across platforms, we always pass it into the model function as an explicit argument called session.
-- The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame.
-- `.source()` and `.ref()` functions. Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. If you want to read directly from a raw source table, use `dbt.source()`. We saw this in our earlier section using SQL with the source function. These functions have the same execution, but with different syntax. Use the `dbt.ref()` method within a Python model to read data from other models (SQL or Python). These methods return DataFrames pointing to the upstream source, model, seed, or snapshot.
-- `.config()`. Just like SQL models, there are three ways to configure Python models:
- - In a dedicated `.yml` file, within the `models/` directory
- - Within the model's `.py` file, using the `dbt.config()` method
- - Calling the `dbt.config()` method will set configurations for your model within your `.py` file, similar to the `{{ config() }} macro` in `.sql` model files:
- ```python
- def model(dbt, session):
-
- # setting configuration
- dbt.config(materialized="table")
- ```
- - There's a limit to how complex you can get with the `dbt.config()` method. It accepts only literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `.config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the config property in a [YAML file](/reference/resource-properties/config). Learn more about configurations [here](/reference/model-configs).
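-
-To tie these together, here's a minimal sketch of a Python model that reads from both a source and an upstream model. The source name (`formula1`, `lap_times`) is purely illustrative; `int_lap_times_years` is the model we built earlier:
-
-```python
-import pandas as pd
-
-def model(dbt, session):
-    # inline configuration; only literal values are allowed here
-    dbt.config(materialized="table", packages=["pandas"])
-
-    # read a raw source table (hypothetical source name) and an upstream model
-    raw_lap_times = dbt.source("formula1", "lap_times").to_pandas()
-    lap_times_years = dbt.ref("int_lap_times_years").to_pandas()
-
-    # both DataFrames are now available to transform; return a single DataFrame
-    return lap_times_years
-```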
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md
deleted file mode 100644
index bde163b59db..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md
+++ /dev/null
@@ -1,225 +0,0 @@
----
-title: "Machine Learning prep: cleaning, encoding, and splits, oh my!"
-id: "11-machine-learning-prep"
-description: "Machine Learning prep"
----
-Now that we’ve gained insights and business intelligence about Formula 1 at a descriptive level, we want to extend our capabilities into prediction. We’re going to take the scenario where we censor the data: we’ll pretend we only have earlier data available to train a model, and then apply that model to later data. In practice, this means we’ll take data from 2010-2019 to train our model and then predict 2020 data.
-
-In this section, we’ll be preparing our data to predict the final race position of a driver.
-
-At a high level we’ll be:
-
-- Creating new prediction features and filtering our dataset to active drivers
-- Encoding our data (algorithms like numbers) and simplifying our target variable called `position`
-- Splitting our dataset into training, testing, and validation
-
-## ML data prep
-
-1. To keep our project organized, we’ll need to create two new subfolders in our `ml` directory. Under the `ml` folder, make the subfolders `prep` and `train_predict`.
-2. Create a new file under `ml/prep` called `ml_data_prep`. Copy the following code into the file and **Save**.
- ```python
- import pandas as pd
-
- def model(dbt, session):
- # dbt configuration
- dbt.config(packages=["pandas"])
-
- # get upstream data
- fct_results = dbt.ref("fct_results").to_pandas()
-
- # provide years so we do not hardcode dates in filter command
- start_year=2010
- end_year=2020
-
- # describe the data for a full decade
- data = fct_results.loc[fct_results['RACE_YEAR'].between(start_year, end_year)]
-
-        # convert the position column from a string to a numeric type
- data['POSITION'] = data['POSITION'].astype(float)
-
- # we cannot have nulls if we want to use total pit stops
- data['TOTAL_PIT_STOPS_PER_RACE'] = data['TOTAL_PIT_STOPS_PER_RACE'].fillna(0)
-
- # some of the constructors changed their name over the year so replacing old names with current name
- mapping = {'Force India': 'Racing Point', 'Sauber': 'Alfa Romeo', 'Lotus F1': 'Renault', 'Toro Rosso': 'AlphaTauri'}
- data['CONSTRUCTOR_NAME'].replace(mapping, inplace=True)
-
- # create confidence metrics for drivers and constructors
- dnf_by_driver = data.groupby('DRIVER').sum()['DNF_FLAG']
- driver_race_entered = data.groupby('DRIVER').count()['DNF_FLAG']
- driver_dnf_ratio = (dnf_by_driver/driver_race_entered)
- driver_confidence = 1-driver_dnf_ratio
- driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence))
-
- dnf_by_constructor = data.groupby('CONSTRUCTOR_NAME').sum()['DNF_FLAG']
- constructor_race_entered = data.groupby('CONSTRUCTOR_NAME').count()['DNF_FLAG']
- constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered)
- constructor_relaiblity = 1-constructor_dnf_ratio
- constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity))
-
- data['DRIVER_CONFIDENCE'] = data['DRIVER'].apply(lambda x:driver_confidence_dict[x])
- data['CONSTRUCTOR_RELAIBLITY'] = data['CONSTRUCTOR_NAME'].apply(lambda x:constructor_relaiblity_dict[x])
-
- #removing retired drivers and constructors
- active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes',
- 'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull',
- 'Haas F1 Team']
- active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz',
- 'Valtteri Bottas', 'Lance Stroll', 'George Russell',
- 'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen',
- 'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat',
- 'Max Verstappen', 'Pierre Gasly', 'Alexander Albon',
- 'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi',
- 'Romain Grosjean','Nicholas Latifi']
-
- # create flags for active drivers and constructors so we can filter downstream
- data['ACTIVE_DRIVER'] = data['DRIVER'].apply(lambda x: int(x in active_drivers))
- data['ACTIVE_CONSTRUCTOR'] = data['CONSTRUCTOR_NAME'].apply(lambda x: int(x in active_constructors))
-
- return data
- ```
-3. As usual, let’s break down what we are doing in this Python model:
- - We’re first referencing our upstream `fct_results` table and casting it to a pandas dataframe.
- - Filtering on years 2010-2020 since we’ll need to clean all our data we are using for prediction (both training and testing).
-    - Filling in empty data for `total_pit_stops` and mapping old constructor names to their current names to avoid erroneous predictions
- - ⚠️ You might be wondering why we didn’t do this upstream in our `fct_results` table! The reason for this is that we want our machine learning cleanup to reflect the year 2020 for our predictions and give us an up-to-date team name. However, for business intelligence purposes we can keep the historical data at that point in time. Instead of thinking of one table as “one source of truth” we are creating different datasets fit for purpose: one for historical descriptions and reporting and another for relevant predictions.
- - Create new confidence features for drivers and constructors
- - Generate flags for the constructors and drivers that were active in 2020
-4. Execute the following in the command bar:
- ```bash
- dbt run --select ml_data_prep
- ```
-5. There are more aspects we could consider for this project, such as normalizing the driver confidence by the number of races entered. Including this would help account for a driver’s history and consider whether they are a new or long-time driver. We’re going to keep it simple for now, but these are some of the ways we can expand and improve our machine learning dbt projects. Breaking down our machine learning prep model:
- - Lambda functions — We use some lambda functions to transform our data without having to create a fully-fledged function using the `def` notation. So what exactly are lambda functions?
- - In Python, a lambda function is a small, anonymous function defined using the keyword "lambda". Lambda functions are used to perform a quick operation, such as a mathematical calculation or a transformation on a list of elements. They are often used in conjunction with higher-order functions, such as `apply`, `map`, `filter`, and `reduce`.
- - `.apply()` method — We used `.apply()` to pass our functions into our lambda expressions to the columns and perform this multiple times in our code. Let’s explain apply a little more:
- - The `.apply()` function in the pandas library is used to apply a function to a specified axis of a DataFrame or a Series. In our case the function we used was our lambda function!
-    - The `.apply()` function takes two arguments: the first is the function to be applied, and the second is the axis along which the function should be applied. In our case we call `.apply()` on a single column (a Series), so the function is applied to each element of that column and we don’t need to specify an axis. A short toy example follows this list.
-6. Let’s look at the preview of our clean dataframe after running our `ml_data_prep` model:
-
-
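-Here's a quick toy example (with made-up confidence values) showing how a lambda passed to `.apply()` transforms each element of a single column, the same pattern our `ml_data_prep` model uses:
-
-```python
-import pandas as pd
-
-# made-up confidence values, just to illustrate the pattern
-confidence = {"Lewis Hamilton": 0.98, "Max Verstappen": 0.95}
-
-drivers = pd.DataFrame({"DRIVER": ["Lewis Hamilton", "Max Verstappen", "Lewis Hamilton"]})
-
-# each row's driver name is looked up in the dictionary, element by element
-drivers["DRIVER_CONFIDENCE"] = drivers["DRIVER"].apply(lambda x: confidence[x])
-
-print(drivers)  # each row now carries the confidence value for its driver
-```
-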
-## Covariate encoding
-
-In this next part, we’ll be performing covariate encoding. Breaking down this phrase a bit, a *covariate* is a variable that is relevant to the outcome of a study or experiment, and *encoding* refers to the process of converting data (such as text or categorical variables) into a numerical format that can be used as input for a model. This is necessary because most machine learning algorithms can only work with numerical data. Algorithms don’t speak languages, have eyes to see images, etc. so we encode our data into numbers so algorithms can perform tasks by using calculations they otherwise couldn’t.
-
-🧠 We’ll think about this as : “algorithms like numbers”.
-
-1. Create a new file under `ml/prep` called `covariate_encoding` copy the code below and save.
- ```python
- import pandas as pd
- import numpy as np
- from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
- from sklearn.linear_model import LogisticRegression
-
- def model(dbt, session):
- # dbt configuration
- dbt.config(packages=["pandas","numpy","scikit-learn"])
-
- # get upstream data
- data = dbt.ref("ml_data_prep").to_pandas()
-
- # list out covariates we want to use in addition to outcome variable we are modeling - position
- covariates = data[['RACE_YEAR','CIRCUIT_NAME','GRID','CONSTRUCTOR_NAME','DRIVER','DRIVERS_AGE_YEARS','DRIVER_CONFIDENCE','CONSTRUCTOR_RELAIBLITY','TOTAL_PIT_STOPS_PER_RACE','ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR', 'POSITION']]
-
- # filter covariates on active drivers and constructors
- # use fil_cov as short for "filtered_covariates"
- fil_cov = covariates[(covariates['ACTIVE_DRIVER']==1)&(covariates['ACTIVE_CONSTRUCTOR']==1)]
-
- # Encode categorical variables using LabelEncoder
-        # TODO: we'll update this to use one-hot encoding (OHE) in the future for non-ordinal variables!
- le = LabelEncoder()
- fil_cov['CIRCUIT_NAME'] = le.fit_transform(fil_cov['CIRCUIT_NAME'])
- fil_cov['CONSTRUCTOR_NAME'] = le.fit_transform(fil_cov['CONSTRUCTOR_NAME'])
- fil_cov['DRIVER'] = le.fit_transform(fil_cov['DRIVER'])
- fil_cov['TOTAL_PIT_STOPS_PER_RACE'] = le.fit_transform(fil_cov['TOTAL_PIT_STOPS_PER_RACE'])
-
-        # Simplify target variable "position" to represent 3 meaningful categories in Formula 1
- # 1. Podium position 2. Points for team 3. Nothing - no podium or points!
- def position_index(x):
- if x<4:
- return 1
- if x>10:
- return 3
- else :
- return 2
-
- # we are dropping the columns that we filtered on in addition to our training variable
-        encoded_data = fil_cov.drop(['ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR'], axis=1)
- encoded_data['POSITION_LABEL']= encoded_data['POSITION'].apply(lambda x: position_index(x))
-        encoded_data_grouped_target = encoded_data.drop(['POSITION'], axis=1)
-
- return encoded_data_grouped_target
- ```
-2. Execute the following in the command bar:
- ```bash
- dbt run --select covariate_encoding
- ```
-3. In this code, we are using a ton of functions from libraries! This is really cool, because we can utilize code other people have developed and bring it into our project simply by using the `import` statement. [Scikit-learn](https://scikit-learn.org/stable/), “sklearn” for short, is an extremely popular data science library. Sklearn contains a wide range of machine learning techniques, including supervised and unsupervised learning algorithms, feature scaling and imputation, as well as tools for model evaluation and selection. We’ll be using Sklearn for both preparing our covariates and creating models (our next section).
-4. Our dataset is pretty small, so we are good to use pandas and `sklearn`. If you have larger data in mind for your own project, consider `dask` or `category_encoders`.
-5. Breaking it down a bit more:
- - We’re selecting a subset of variables that will be used as predictors for a driver’s position.
- - Filter the dataset to only include rows using the active driver and constructor flags we created in the last step.
-    - The next step is to use the `LabelEncoder` from scikit-learn to convert the categorical variables `CIRCUIT_NAME`, `CONSTRUCTOR_NAME`, `DRIVER`, and `TOTAL_PIT_STOPS_PER_RACE` into numerical values (a toy example of label encoding follows this list).
-    - Create a new variable called `POSITION_LABEL`, which is derived from our position variable.
- - 💭 Why are we changing our position variable? There are 20 total positions in Formula 1 and we are grouping them together to simplify the classification and improve performance. We also want to demonstrate you can create a new function within your dbt model!
- - Our new `position_label` variable has meaning:
- - In Formula1 if you are in:
- - Top 3 you get a “podium” position
- - Top 10 you gain points that add to your overall season total
- - Below top 10 you get no points!
-      - We map our original `position` variable to `position_label` values of 1, 2, and 3 for the corresponding groups above.
- - Drop the active driver and constructor flags since they were filter criteria and additionally drop our original position variable.
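-
-Here's a tiny toy example of what `LabelEncoder` does under the hood (the constructor names are just for illustration):
-
-```python
-from sklearn.preprocessing import LabelEncoder
-
-le = LabelEncoder()
-# each unique label gets an integer, assigned in sorted (alphabetical) order
-encoded = le.fit_transform(["Ferrari", "McLaren", "Ferrari", "Red Bull"])
-
-print(encoded)       # [0 1 0 2]
-print(le.classes_)   # ['Ferrari' 'McLaren' 'Red Bull']
-```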
-
-## Splitting into training and testing datasets
-
-Now that we’ve cleaned and encoded our data, we are going to further split it by time. In this step, we will create dataframes to use for training and prediction. We’ll be creating two dataframes: 1) using data from 2010-2019 for training, and 2) data from 2020 for new prediction inferences. We’ll create variables called `start_year` and `end_year` so we aren’t filtering on hardcoded values (and can more easily swap them out in the future if we want to retrain our model on different timeframes).
-
-1. Create a file called `train_test_dataset`, then copy and save the following code:
- ```python
- import pandas as pd
-
- def model(dbt, session):
-
- # dbt configuration
- dbt.config(packages=["pandas"], tags="train")
-
- # get upstream data
- encoding = dbt.ref("covariate_encoding").to_pandas()
-
- # provide years so we do not hardcode dates in filter command
- start_year=2010
- end_year=2019
-
- # describe the data for a full decade
- train_test_dataset = encoding.loc[encoding['RACE_YEAR'].between(start_year, end_year)]
-
- return train_test_dataset
- ```
-
-2. Create a file called `hold_out_dataset_for_prediction`, then copy and save the following code. Now we’ll have a dataset with only the year 2020 that we’ll keep as a hold-out set, which we'll use much like we would in a deployment scenario.
- ```python
- import pandas as pd
-
- def model(dbt, session):
- # dbt configuration
- dbt.config(packages=["pandas"], tags="predict")
-
- # get upstream data
- encoding = dbt.ref("covariate_encoding").to_pandas()
-
- # variable for year instead of hardcoding it
- year=2020
-
- # filter the data based on the specified year
- hold_out_dataset = encoding.loc[encoding['RACE_YEAR'] == year]
-
- return hold_out_dataset
- ```
-3. Execute the following in the command bar:
- ```bash
- dbt run --select train_test_dataset hold_out_dataset_for_prediction
- ```
- To run our temporal data split models, we can use this syntax in the command line to run them both at once. Make sure you use a *space* [syntax](/reference/node-selection/syntax) between the model names to indicate you want to run both!
-4. **Commit and push** our changes with a commit message like `ml data prep and splits` to keep saving our work as we go before moving on.
-
-👏 Now that we’ve finished our machine learning prep work we can move onto the fun part — training and prediction!
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md
deleted file mode 100644
index 8b353a85fa3..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md
+++ /dev/null
@@ -1,251 +0,0 @@
----
-title: "Machine Learning: training and prediction "
-id: "12-machine-learning-training-prediction"
-description: "Machine Learning: training and prediction"
----
-
-We’re ready to start training a model to predict the driver’s position. Now is a good time to pause and take a step back: usually in ML projects you’ll try multiple algorithms during development and use an evaluation method such as cross validation to determine which algorithm to use. You can definitely do this in your dbt project, but for this lab we’ve decided to use logistic regression to predict position (we actually tried some other algorithms using cross validation outside of this lab, such as k-nearest neighbors and a support vector classifier, but they didn’t perform as well as the logistic regression, and a decision tree overfit).
-
-There are 3 areas to break down as we go, since we are working at the intersection of all three within one model file:
-1. Machine Learning
-2. Snowflake and Snowpark
-3. dbt Python models
-
-If you haven’t seen code like this before or used joblib files to save machine learning models, we’ll be going over them at a high level, and you can explore the links for a more technical deep dive along the way! Because Snowflake and dbt have abstracted away a lot of the nitty gritty about serializing and storing our model object to be called again, we won’t go into too much detail here. There’s *a lot* going on here, so take it at your own pace!
-
-## Training and saving a machine learning model
-
-1. Project organization remains key, so let’s make a new subfolder called `train_predict` under the `ml` folder.
-2. Now create a new file called `train_test_position` and copy and save the following code:
-
- ```python
- import snowflake.snowpark.functions as F
- from sklearn.model_selection import train_test_split
- import pandas as pd
- from sklearn.metrics import confusion_matrix, balanced_accuracy_score
- import io
- from sklearn.linear_model import LogisticRegression
- from joblib import dump, load
- import joblib
- import logging
- import sys
-
- logger = logging.getLogger("mylog")
-
- def save_file(session, model, path, dest_filename):
- input_stream = io.BytesIO()
- joblib.dump(model, input_stream)
- session._conn.upload_stream(input_stream, path, dest_filename)
- return "successfully created file: " + path
-
- def model(dbt, session):
- dbt.config(
-            packages = ['numpy','scikit-learn','pandas','joblib','cachetools'],
- materialized = "table",
- tags = "train"
- )
- # Create a stage in Snowflake to save our model file
- session.sql('create or replace stage MODELSTAGE').collect()
-
- #session._use_scoped_temp_objects = False
- version = "1.0"
- logger.info('Model training version: ' + version)
-
- # read in our training and testing upstream dataset
- test_train_df = dbt.ref("train_test_dataset")
-
- # cast snowpark df to pandas df
- test_train_pd_df = test_train_df.to_pandas()
- target_col = "POSITION_LABEL"
-
- # split out covariate predictors, x, from our target column position_label, y.
- split_X = test_train_pd_df.drop([target_col], axis=1)
- split_y = test_train_pd_df[target_col]
-
- # Split out our training and test data into proportions
- X_train, X_test, y_train, y_test = train_test_split(split_X, split_y, train_size=0.7, random_state=42)
- train = [X_train, y_train]
- test = [X_test, y_test]
- # now we are only training our one model to deploy
- # we are keeping the focus on the workflows and not algorithms for this lab!
- model = LogisticRegression()
-
- # fit the preprocessing pipeline and the model together
- model.fit(X_train, y_train)
- y_pred = model.predict_proba(X_test)[:,1]
- predictions = [round(value) for value in y_pred]
- balanced_accuracy = balanced_accuracy_score(y_test, predictions)
-
- # Save the model to a stage
- save_file(session, model, "@MODELSTAGE/driver_position_"+version, "driver_position_"+version+".joblib" )
- logger.info('Model artifact:' + "@MODELSTAGE/driver_position_"+version+".joblib")
-
- # Take our pandas training and testing dataframes and put them back into snowpark dataframes
- snowpark_train_df = session.write_pandas(pd.concat(train, axis=1, join='inner'), "train_table", auto_create_table=True, create_temp_table=True)
- snowpark_test_df = session.write_pandas(pd.concat(test, axis=1, join='inner'), "test_table", auto_create_table=True, create_temp_table=True)
-
- # Union our training and testing data together and add a column indicating train vs test rows
- return snowpark_train_df.with_column("DATASET_TYPE", F.lit("train")).union(snowpark_test_df.with_column("DATASET_TYPE", F.lit("test")))
- ```
-
-3. Execute the following in the command bar:
- ```bash
- dbt run --select train_test_position
- ```
-4. Breaking down our Python script here:
- - We’re importing some helpful libraries.
-    - Defining a function called `save_file()` that takes four parameters (`session`, `model`, `path`, and `dest_filename`) and saves our logistic regression model file.
- - `session` — an object representing a connection to Snowflake.
-      - `model` — an object that needs to be saved. In this case, it's a scikit-learn model object that can be serialized with joblib.
- - `path` — a string representing the directory or bucket location where the file should be saved.
- - `dest_filename` — a string representing the desired name of the file.
- - Creating our dbt model
-      - Within this model we are creating a stage called `MODELSTAGE` to place our logistic regression `joblib` model file. This is really important since we need a place to keep our model so we can reuse it, and we want to ensure it's there. When using Snowpark commands, it's common to see the `.collect()` method used to ensure the action is performed. Think of the session as our “start” and collect as our “end” when [working with Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) (you can use ending methods other than collect).
- - Using `.ref()` to connect into our `train_test_dataset` model.
- - Now we see the machine learning part of our analysis:
-        - Split our covariate predictors (X) apart from our target variable `position_label` (y).
-        - Split our dataset into 70% training and 30% testing (`train_size=0.7`), with a `random_state` specified so we have repeatable results.
- - Specify our model is a logistic regression.
- - Fit our model. In a logistic regression this means finding the coefficients that will give the least classification error.
-        - Round our predictions to the nearest integer, since logistic regression produces a probability between 0 and 1 for each class, and calculate a balanced accuracy to account for imbalances in the target variable.
-      - Right now our model is only in memory, so we need to use our nifty function `save_file` to save our model file to our Snowflake stage. We save our model as a joblib file so Snowpark can easily call this model object back to create predictions. We really don’t need to know much else as data practitioners unless we want to. It’s worth noting that joblib files can’t be queried directly with SQL. To do that, we would need to transform the joblib file into a SQL-queryable format such as JSON or CSV (out of scope for this workshop). If the joblib serialization step is new to you, see the short sketch after this list.
-      - Finally, we return our dataframe with a new column indicating which rows were used for training and which for testing.
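-
-    If the joblib serialization step is new to you, here is a small, hypothetical sketch (independent of Snowflake) of what `save_file` is doing under the hood: serializing the fitted model into bytes that can be written somewhere and loaded back later. Because the result is a binary artifact rather than rows and columns, it can't be queried with SQL directly.
-
-    ```python
-    import io
-
-    import joblib
-    from sklearn.linear_model import LogisticRegression
-
-    # fit a tiny toy model (a stand-in for the real training above)
-    model = LogisticRegression().fit([[0], [1], [2], [3]], [0, 0, 1, 1])
-
-    # serialize the fitted model into an in-memory byte stream, like save_file does before uploading to the stage
-    buffer = io.BytesIO()
-    joblib.dump(model, buffer)
-
-    # ...later, deserialize it back into a usable model object
-    buffer.seek(0)
-    restored_model = joblib.load(buffer)
-    print(restored_model.predict([[1.5]]))  # predicts the class for a new value
-    ```
-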
-5. Viewing our output of this model:
-
-
-6. Let’s pop back over to Snowflake and check that our logistic regression model has been stored in our `MODELSTAGE` using the command:
- ```sql
- list @modelstage
- ```
-
-
-7. To investigate the commands run as part of the `train_test_position` script, navigate to the Snowflake query history to view them: **Activity > Query History**. We can view the portions of the query that we wrote, such as `create or replace stage MODELSTAGE`, but we also see additional queries that Snowflake uses to interpret the Python code.
-
-
-## Predicting on new data
-
-1. Create a new file called `predict_position` and copy and save the following code:
- ```python
- import logging
- import joblib
- import pandas as pd
- import os
- from snowflake.snowpark import types as T
-
- DB_STAGE = 'MODELSTAGE'
- version = '1.0'
- # The name of the model file
- model_file_path = 'driver_position_'+version
- model_file_packaged = 'driver_position_'+version+'.joblib'
-
- # This is a local directory, used for storing the various artifacts locally
- LOCAL_TEMP_DIR = f'/tmp/driver_position'
- DOWNLOAD_DIR = os.path.join(LOCAL_TEMP_DIR, 'download')
- TARGET_MODEL_DIR_PATH = os.path.join(LOCAL_TEMP_DIR, 'ml_model')
- TARGET_LIB_PATH = os.path.join(LOCAL_TEMP_DIR, 'lib')
-
- # The feature columns that were used during model training
- # and that will be used during prediction
- FEATURE_COLS = [
- "RACE_YEAR"
- ,"CIRCUIT_NAME"
- ,"GRID"
- ,"CONSTRUCTOR_NAME"
- ,"DRIVER"
- ,"DRIVERS_AGE_YEARS"
- ,"DRIVER_CONFIDENCE"
- ,"CONSTRUCTOR_RELAIBLITY"
- ,"TOTAL_PIT_STOPS_PER_RACE"]
-
- def register_udf_for_prediction(p_predictor ,p_session ,p_dbt):
-
- # The prediction udf
-
- def predict_position(p_df: T.PandasDataFrame[int, int, int, int,
- int, int, int, int, int]) -> T.PandasSeries[int]:
- # Snowpark currently does not set the column name in the input dataframe
- # The default col names are like 0,1,2,... Hence we need to reset the column
- # names to the features that we initially used for training.
- p_df.columns = [*FEATURE_COLS]
-
- # Perform prediction. this returns an array object
- pred_array = p_predictor.predict(p_df)
- # Convert to series
- df_predicted = pd.Series(pred_array)
- return df_predicted
-
- # The list of packages that will be used by UDF
- udf_packages = p_dbt.config.get('packages')
-
- predict_position_udf = p_session.udf.register(
- predict_position
- ,name=f'predict_position'
- ,packages = udf_packages
- )
- return predict_position_udf
-
- def download_models_and_libs_from_stage(p_session):
- p_session.file.get(f'@{DB_STAGE}/{model_file_path}/{model_file_packaged}', DOWNLOAD_DIR)
-
- def load_model(p_session):
- # Load the model and initialize the predictor
- model_fl_path = os.path.join(DOWNLOAD_DIR, model_file_packaged)
- predictor = joblib.load(model_fl_path)
- return predictor
-
- # -------------------------------
- def model(dbt, session):
- dbt.config(
- packages = ['snowflake-snowpark-python' ,'scipy','scikit-learn' ,'pandas' ,'numpy'],
- materialized = "table",
- tags = "predict"
- )
- session._use_scoped_temp_objects = False
- download_models_and_libs_from_stage(session)
- predictor = load_model(session)
- predict_position_udf = register_udf_for_prediction(predictor, session ,dbt)
-
- # Retrieve the data, and perform the prediction
- hold_out_df = (dbt.ref("hold_out_dataset_for_prediction")
- .select(*FEATURE_COLS)
- )
-
- # Perform prediction.
- new_predictions_df = hold_out_df.withColumn("position_predicted"
- ,predict_position_udf(*FEATURE_COLS)
- )
-
- return new_predictions_df
- ```
-2. Execute the following in the command bar:
- ```bash
- dbt run --select predict_position
- ```
-3. **Commit and push** our changes to keep saving our work as we go using the commit message `logistic regression model training and application` before moving on.
-4. At a high level in this script, we are:
- - Retrieving our staged logistic regression model
- - Loading the model in
-    - Placing the model within a user-defined function (UDF) to make inline predictions on our drivers’ positions
-5. At a more detailed level:
- - Import our libraries.
- - Create variables to reference back to the `MODELSTAGE` we just created and stored our model to.
- - The temporary file paths we created might look intimidating, but all we’re doing here is programmatically using an initial file path and adding to it to create the following directories:
- - LOCAL_TEMP_DIR ➡️ /tmp/driver_position
- - DOWNLOAD_DIR ➡️ /tmp/driver_position/download
- - TARGET_MODEL_DIR_PATH ➡️ /tmp/driver_position/ml_model
- - TARGET_LIB_PATH ➡️ /tmp/driver_position/lib
- - Provide a list of our feature columns that we used for model training and will now be used on new data for prediction.
-    - Next, we are creating our main function `register_udf_for_prediction(p_predictor ,p_session ,p_dbt):`. This function is used to register a user-defined function (UDF) that performs the machine learning prediction. It takes three parameters: `p_predictor` is an instance of the machine learning model, `p_session` is an instance of the Snowflake session, and `p_dbt` is the dbt object passed into the model function. The function creates a UDF named `predict_position` which takes a pandas dataframe with the input features and returns a pandas series with the predictions.
-      - ⚠️ Pay close attention to the whitespace here. We are using a function within a function for this script (see the short sketch after this list if nested functions are new to you).
-    - We have 2 simple functions that programmatically retrieve our file paths: `download_models_and_libs_from_stage` gets our stored model out of `MODELSTAGE` and downloads it into the session, and `load_model` loads the contents of our model (its parameters) for use in prediction.
- - Take the model we loaded in and call it `predictor` and wrap it in a UDF.
- - Return our dataframe with both the features used to predict and the new label.
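-
-    If the function-within-a-function pattern is new to you, here is a tiny, hypothetical Python sketch of the same idea (a closure): the inner function uses values from the outer function that builds and returns it, which is what `register_udf_for_prediction` does with the predictor it wraps into a UDF.
-
-    ```python
-    # hypothetical mini example of a nested function (closure)
-    def build_greeter(greeting):
-        def greet(name):
-            # the inner function can use `greeting` from the enclosing scope
-            return f"{greeting}, {name}!"
-        return greet
-
-    say_hello = build_greeter("Hello")
-    print(say_hello("driver"))  # prints "Hello, driver!"
-    ```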
-
-🧠 Another way to read this script is from the bottom up. This can help us progressively see what is going into our final dbt model and work backwards to see how the other functions are being referenced.
-
-6. Let’s take a look at our predicted position alongside our feature variables. Open a new scratchpad and use the following query. I chose to order by the prediction of who would obtain a podium position:
- ```sql
- select * from {{ ref('predict_position') }} order by position_predicted
- ```
-7. We can see that we created predictions in our final dataset, so we are ready to move on to testing!
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md
deleted file mode 100644
index bcda9a775fb..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md
+++ /dev/null
@@ -1,136 +0,0 @@
----
-title: "Testing"
-id: "13-testing"
-description: "Testing"
----
-We have now completed building all the models for today’s lab, but how do we know if they meet our assertions? Put another way, how do we know whether our data models are any good? This brings us to testing!
-
-We test data models for mainly two reasons:
-
-- Ensure that our source data is clean on ingestion before we start data modeling/transformation (aka avoiding the garbage in, garbage out problem).
-- Make sure we don’t introduce bugs in the transformation code we wrote (stop ourselves from creating bad joins/fanouts).
-
-Testing in dbt comes in two flavors: [generic](/docs/build/tests#generic-tests) and [singular](/docs/build/tests#singular-tests).
-
-You define them in a test block (similar to a macro) and once defined, you can reference them by name in your `.yml` files (applying them to models, columns, sources, snapshots, and seeds).
-
-You might be wondering: *what about testing Python models?*
-
-Since the output of our Python models is tables, we can test SQL and Python models the same way! We don’t have to worry about any syntax differences when testing SQL versus Python data models. This means we use `.yml` and `.sql` files to test our entities (tables, views, etc.). Under the hood, dbt runs a SQL query against our tables to see if they meet our assertions. If no rows are returned, dbt will surface a passed test. Conversely, if a test returns rows, it will fail or warn depending on the configuration (more on that later).
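-
-As a toy illustration of that pass/fail mechanic (using Python's built-in SQLite rather than our Snowflake warehouse, and a made-up two-row table), a test is just a query that looks for offending rows and passes when none come back:
-
-```python
-import sqlite3
-
-# hypothetical stand-in table; in dbt, the compiled test SQL runs against the real model in the warehouse
-conn = sqlite3.connect(":memory:")
-conn.execute("create table fastest_pit_stops_by_constructor (constructor_name text, mean real)")
-conn.executemany(
-    "insert into fastest_pit_stops_by_constructor values (?, ?)",
-    [("Red Bull", 21.3), ("Ferrari", 22.1)],
-)
-
-# the "failing rows" query for a uniqueness check: any constructor_name appearing more than once
-failing_rows = conn.execute(
-    "select constructor_name from fastest_pit_stops_by_constructor"
-    " group by constructor_name having count(*) > 1"
-).fetchall()
-
-print("pass" if len(failing_rows) == 0 else "fail")  # prints "pass"
-```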
-
-## Generic tests
-
-1. To implement generic out-of-the-box tests dbt comes with, we can use YAML files to specify information about our models. To add generic tests to our aggregates model, create a file called `aggregates.yml`, copy the code block below into the file, and save.
-
-
- ```yaml
- version: 2
-
- models:
- - name: fastest_pit_stops_by_constructor
- description: Use the python .describe() method to retrieve summary statistics table about pit stops by constructor. Sort by average stop time ascending so the first row returns the fastest constructor.
- columns:
- - name: constructor_name
- description: team that makes the car
- tests:
- - unique
-
- - name: lap_times_moving_avg
- description: Use the python .rolling() method to calculate the 5 year rolling average of pit stop times alongside the average for each year.
- columns:
- - name: race_year
- description: year of the race
- tests:
- - relationships:
- to: ref('int_lap_times_years')
- field: race_year
- ```
-
-2. Let’s unpack the code we have here. For each of our aggregates models, we provide the model name (so we know the object we are referencing) and a description of the model that we’ll populate in our documentation. At the column level (a level below our model), we provide the column name followed by our tests. We want to ensure our `constructor_name` is unique since we used a pandas `groupby` on `constructor_name` in the model `fastest_pit_stops_by_constructor` (see the short pandas sketch after this list). Next, we want to ensure our `race_year` has referential integrity from the model we selected from, `int_lap_times_years`, into our subsequent `lap_times_moving_avg` model.
-3. Finally, if we want to see how tests were deployed on sources and SQL models, we can look at other files in our project such as the `f1_sources.yml` we created in our Sources and staging section.
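-
-To make the reasoning behind that `unique` test concrete, here is a small, hypothetical pandas sketch (toy data, not the real pit stop table): a groupby-aggregate produces exactly one row per group key, which is why `constructor_name` should always be unique in `fastest_pit_stops_by_constructor`.
-
-```python
-import pandas as pd
-
-# toy stand-in for the pit stop data
-pit_stops = pd.DataFrame({
-    "constructor_name": ["Ferrari", "Ferrari", "McLaren", "McLaren"],
-    "pit_stop_milliseconds": [21000, 23000, 22000, 24000],
-})
-
-# a groupby + describe yields one summary row per constructor_name
-summary = (
-    pit_stops.groupby("constructor_name")["pit_stop_milliseconds"]
-    .describe()
-    .reset_index()
-)
-
-print(summary["constructor_name"].is_unique)  # True
-```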
-
-## Using macros for testing
-
-1. Under your `macros` folder, create a new file and name it `test_all_values_gte_zero.sql`. Copy the code block below and save the file. For clarity, “gte” is an abbreviation for greater than or equal to.
-
-
- ```sql
- {% macro test_all_values_gte_zero(table, column) %}
-
- select * from {{ ref(table) }} where {{ column }} < 0
-
- {% endmacro %}
- ```
-
-2. Macros in Jinja are pieces of code that can be reused multiple times in our SQL models — they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models.
-3. We use the `{% macro %}` to indicate the start of the macro and `{% endmacro %}` for the end. The text after the beginning of the macro block is the name we are giving the macro to later call it. In this case, our macro is called `test_all_values_gte_zero`. Macros take in *arguments* to pass through, in this case the `table` and the `column`. In the body of the macro, we see a SQL statement that is using the `ref` function to dynamically select the table and then the column. You can also invoke macros on their own, without attaching them to a model, by using `dbt run-operation`. You can learn more [here](https://docs.getdbt.com/reference/commands/run-operation).
-4. Great, now we want to reference this macro as a test! Let’s create a new test file called `macro_pit_stops_mean_is_positive.sql` in our `tests` folder.
-
-
-
-5. Copy the following code into the file and save:
-
- ```sql
- {{
- config(
- enabled=true,
- severity='warn',
- tags = ['bi']
- )
- }}
-
- {{ test_all_values_gte_zero('fastest_pit_stops_by_constructor', 'mean') }}
- ```
-
-6. In our testing file, we are applying some configurations to the test including `enabled`, which is an optional configuration for disabling models, seeds, snapshots, and tests. Our severity is set to `warn` instead of `error`, which means our pipeline will still continue to run. We have tagged our test with `bi` since we are applying this test to one of our bi models.
-
-Then, in our final line, we are calling the `test_all_values_gte_zero` macro that takes in our table and column arguments and inputting our table `'fastest_pit_stops_by_constructor'` and the column `'mean'`.
-
-## Custom singular tests to validate Python models
-
-The simplest way to define a test is by writing the exact SQL that will return failing records. We call these "singular" tests, because they're one-off assertions usable for a single purpose.
-
-These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your test-paths config). You can use Jinja (including `ref` and `source`) in the test definition, just like you can when creating models. Each `.sql` file contains one select statement, and it defines one test.
-
-Let’s add a custom test that asserts that the moving average of the lap time over the last 5 years is greater than zero (it’s impossible to have a time less than 0!). If that’s not the case, it’s reasonable to assume the data has been corrupted.
-
-1. Create a file `lap_times_moving_avg_assert_positive_or_null.sql` under the `tests` folder.
-
-
-2. Copy the following code and save the file:
-
- ```sql
- {{
- config(
- enabled=true,
- severity='error',
- tags = ['bi']
- )
- }}
-
- with lap_times_moving_avg as ( select * from {{ ref('lap_times_moving_avg') }} )
-
- select *
- from lap_times_moving_avg
- where lap_moving_avg_5_years < 0 and lap_moving_avg_5_years is not null
- ```
-
-## Putting all our tests together
-
-1. Time to run our tests! Altogether, we have created 4 tests for our 2 Python models:
-    - `fastest_pit_stops_by_constructor`
-      - Unique `constructor_name`
-      - Mean pit stop times are greater than or equal to 0 (no negative time values)
-    - `lap_times_moving_avg`
-      - Referential test on `race_year`
-      - Lap time moving averages are greater than 0 or null (to allow for the first leading values in a rolling calculation)
-2. To run the tests on both our models, we can use this syntax in the command line to run them both at once, similar to how we did our data splits earlier.
- Execute the following in the command bar:
- ```bash
- dbt test --select fastest_pit_stops_by_constructor lap_times_moving_avg
- ```
-
-
-3. All 4 of our tests passed (yay for clean data)! To understand the SQL being run against each of our tables, we can click into the details of the test.
-4. Navigating into the **Details** of the `unique_fastest_pit_stops_by_constructor_name` test, we can see the compiled SQL checking that each `constructor_name` has only one row.
-
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md
deleted file mode 100644
index 95ec8ad242f..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md
+++ /dev/null
@@ -1,29 +0,0 @@
----
-title: "Documentation"
-id: "14-documentation"
-description: "Documentation"
----
-When it comes to documentation, dbt brings together the column- and model-level descriptions that you provide, as well as details from your Snowflake information schema, in a static site for consumption by other data team members and stakeholders.
-
-We are going to revisit 2 areas of our project to understand our documentation:
-
-- `intermediate.md` file
-- `dbt_project.yml` file
-
-To start, let’s look back at our `intermediate.md` file. We can see that we provided multi-line descriptions for the models in our intermediate models using [docs blocks](/docs/collaborate/documentation#using-docs-blocks). Then we reference these docs blocks in our `.yml` file. Building descriptions with doc blocks in Markdown files gives you the ability to format your descriptions with Markdown and is particularly helpful when building long descriptions, either at the column or model level. In our `dbt_project.yml`, we added `node_colors` at folder levels.
-
-1. To see all these pieces come together, execute this in the command bar:
- ```bash
- dbt docs generate
- ```
- This will generate the documentation for your project. Click the book button, as shown in the screenshot below to access the docs.
-
-
-2. Go to our project area and view `int_results`. View the description that we created in our doc block.
-
-
-3. View the mini-lineage that looks at the model we are currently selected on (`int_results` in this case).
-
-
-4. In our `dbt_project.yml`, we configured `node_colors` depending on the file directory. Starting in dbt v1.3, these custom colors appear in the lineage graph in our docs. Color coding your project can help you cluster together similar models or steps and troubleshoot more easily.
-
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md
deleted file mode 100644
index d9cedb60861..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md
+++ /dev/null
@@ -1,50 +0,0 @@
----
-title: "Deployment"
-id: "15-deployment"
-description: "Deployment"
----
-
-Before we jump into deploying our code, let's have a quick primer on environments. Up to this point, all of the work we've done in the dbt Cloud IDE has been in our development environment, with code committed to a feature branch and the models we've built created in our development schema in Snowflake, as defined in our Development environment connection. Doing this work on a feature branch allows us to separate our code from what other coworkers are building and from code that is already deemed production-ready. Building models in a development schema in Snowflake allows us to separate the database objects we might still be modifying and testing from the database objects running production dashboards or other downstream dependencies. Together, the combination of a Git branch and Snowflake database objects forms our environment.
-
-Now that we've completed testing and documenting our work, we're ready to deploy our code from our development environment to our production environment and this involves two steps:
-
-- Promoting code from our feature branch to the production branch in our repository.
- - Generally, the production branch is going to be named your main branch and there's a review process to go through before merging code to the main branch of a repository. Here we are going to merge without review for ease of this workshop.
-- Deploying code to our production environment.
- - Once our code is merged to the main branch, we'll need to run dbt in our production environment to build all of our models and run all of our tests. This will allow us to build production-ready objects into our production environment in Snowflake. Luckily for us, the Partner Connect flow has already created our deployment environment and job to facilitate this step.
-
-1. Before getting started, let's make sure that we've committed all of our work to our feature branch. If you still have work to commit, you'll be able to select **Commit and push**, provide a message, and then select **Commit** again.
-2. Once all of your work is committed, the git workflow button will now appear as **Merge to main**. Select **Merge to main** and the merge process will automatically run in the background.
-
-
-3. When it's completed, you should see the git button read **Create branch** and the branch you're currently looking at will become **main**.
-4. Now that all of our development work has been merged to the main branch, we can build our deployment job. Given that our production environment and production job were created automatically for us through Partner Connect, all we need to do here is update some default configurations to meet our needs.
-5. In the menu, select **Deploy** **> Environments**
-
-
-6. You should see two environments listed and you'll want to select the **Deployment** environment then **Settings** to modify it.
-7. Before making any changes, let's touch on what is defined within this environment. The Snowflake connection shows the credentials that dbt Cloud is using for this environment and in our case they are the same as what was created for us through Partner Connect. Our deployment job will build in our `PC_DBT_DB` database and use the default Partner Connect role and warehouse to do so. The deployment credentials section also uses the info that was created in our Partner Connect job to create the credential connection. However, it is using the same default schema that we've been using as the schema for our development environment.
-8. Let's update the schema to create a new schema specifically for our production environment. Click **Edit** to allow you to modify the existing field values. Navigate to **Deployment Credentials > schema**.
-9. Update the schema name to **production**. Remember to select **Save** after you've made the change.
-
-10. By updating the schema for our production environment to **production**, it ensures that our deployment job for this environment will build our dbt models in the **production** schema within the `PC_DBT_DB` database as defined in the Snowflake Connection section.
-11. Now let's switch over to our production job. Click on the deploy tab again and then select **Jobs**. You should see an existing and preconfigured **Partner Connect Trial Job**. Similar to the environment, click on the job, then select **Settings** to modify it. Let's take a look at the job to understand it before making changes.
-
- - The Environment section is what connects this job with the environment we want it to run in. This job is already defaulted to use the Deployment environment that we just updated and the rest of the settings we can keep as is.
- - The Execution settings section gives us the option to generate docs, run source freshness, and defer to a previous run state. For the purposes of our lab, we're going to keep these settings as is as well and stick with just generating docs.
- - The Commands section is where we specify exactly which commands we want to run during this job, and we also want to keep this as is. We want our seed to be uploaded first, then run our models, and finally test them. The order of this is important as well, considering that we need our seed to be created before we can run our incremental model, and we need our models to be created before we can test them.
- - Finally, we have the Triggers section, where we have a number of different options for scheduling our job. Given that our data isn't updating regularly here and we're running this job manually for now, we're also going to leave this section alone.
-
- So, what are we changing then? Just the name! Click **Edit** to allow you to make changes. Then update the name of the job to **Production Job** to denote this as our production deployment job. After that's done, click **Save**.
-12. Now let's go run our job. Clicking on the job name in the path at the top of the screen will take you back to the job run history page, where you'll be able to click **Run now** to kick off the job. If you encounter any job failures, try running the job again before further troubleshooting.
-
-
-
-13. Let's go over to Snowflake to confirm that everything built as expected in our production schema. Refresh the database objects in your Snowflake account and you should see the production schema now within our default Partner Connect database. If you click into the schema and everything ran successfully, you should be able to see all of the models we developed.
-
-
-## Conclusion
-
-Fantastic! You’ve finished the workshop! We hope you feel empowered in using both SQL and Python in your dbt Cloud workflows with Snowflake. Having a reliable pipeline to surface both analytics and machine learning is crucial to creating tangible business value from your data.
-
-For more help and information, join our [dbt community Slack](https://www.getdbt.com/community/), which contains more than 50,000 data practitioners today. We have a dedicated Slack channel, #db-snowflake, for Snowflake-related content. Happy dbt'ing!
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md
deleted file mode 100644
index e864c363a44..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md
+++ /dev/null
@@ -1,27 +0,0 @@
----
-title: "Configure Snowflake"
-id: "2-snowflake-configuration"
-description: "Configure Snowflake"
----
-
-
-1. Log in to your trial Snowflake account. You can [sign up for a Snowflake Trial Account using this form](https://signup.snowflake.com/) if you don’t have one.
-2. Ensure that your account is set up using **AWS** in the **US East (N. Virginia)**. We will be copying the data from a public AWS S3 bucket hosted by dbt Labs in the us-east-1 region. By ensuring our Snowflake environment setup matches our bucket region, we avoid any multi-region data copy and retrieval latency issues.
-
-
-
-3. After creating your account and verifying it from your sign-up email, Snowflake will direct you back to the UI called Snowsight.
-
-4. When Snowsight first opens, your window should look like the following, with you logged in as the ACCOUNTADMIN with demo worksheets open:
-
-
-
-
-5. Navigate to **Admin > Billing & Terms**. Click **Enable > Acknowledge & Continue** to enable Anaconda Python Packages to run in Snowflake.
-
-
-
-
-
-6. Finally, create a new Worksheet by selecting **+ Worksheet** in the upper right corner.
-
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md
deleted file mode 100644
index 9a41e7f45c5..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md
+++ /dev/null
@@ -1,192 +0,0 @@
----
-title: "Connect to data source"
-id: "3-connect-to-data-source"
-description: "Connect to data source"
----
-
-We need to obtain our data source by copying our Formula 1 data into Snowflake tables from a public S3 bucket that dbt Labs hosts.
-
-1. When a new Snowflake account is created, there should be a preconfigured warehouse in your account named `COMPUTE_WH`.
-2. If for any reason your account doesn’t have this warehouse, we can create a warehouse using the following script:
-
- ```sql
- create or replace warehouse COMPUTE_WH with warehouse_size=XSMALL
- ```
-3. Rename the worksheet to `data setup script` since we will be placing code in this worksheet to ingest the Formula 1 data. Make sure you are still logged in as the **ACCOUNTADMIN** and select the **COMPUTE_WH** warehouse.
-
-
-
-4. Copy the following code into the main body of the Snowflake worksheet. You can also find this setup script under the `setup` folder in the [Git repository](https://github.com/dbt-labs/python-snowpark-formula1/blob/main/setup/setup_script_s3_to_snowflake.sql). The script is long since it's bringing in all of the data we'll need today!
-
- ```sql
- -- create and define our formula1 database
- create or replace database formula1;
- use database formula1;
- create or replace schema raw;
- use schema raw;
-
- -- define our file format for reading in the csvs
- create or replace file format csvformat
- type = csv
- field_delimiter =','
- field_optionally_enclosed_by = '"',
- skip_header=1;
-
-    -- create a stage to reference the public s3 bucket containing our formula 1 csv files
- create or replace stage formula1_stage
- file_format = csvformat
- url = 's3://formula1-dbt-cloud-python-demo/formula1-kaggle-data/';
-
- -- load in the 8 tables we need for our demo
- -- we are first creating the table then copying our data in from s3
- -- think of this as an empty container or shell that we are then filling
- create or replace table formula1.raw.circuits (
- CIRCUITID NUMBER(38,0),
- CIRCUITREF VARCHAR(16777216),
- NAME VARCHAR(16777216),
- LOCATION VARCHAR(16777216),
- COUNTRY VARCHAR(16777216),
- LAT FLOAT,
- LNG FLOAT,
- ALT NUMBER(38,0),
- URL VARCHAR(16777216)
- );
- -- copy our data from public s3 bucket into our tables
- copy into circuits
- from @formula1_stage/circuits.csv
- on_error='continue';
-
- create or replace table formula1.raw.constructors (
- CONSTRUCTORID NUMBER(38,0),
- CONSTRUCTORREF VARCHAR(16777216),
- NAME VARCHAR(16777216),
- NATIONALITY VARCHAR(16777216),
- URL VARCHAR(16777216)
- );
- copy into constructors
- from @formula1_stage/constructors.csv
- on_error='continue';
-
- create or replace table formula1.raw.drivers (
- DRIVERID NUMBER(38,0),
- DRIVERREF VARCHAR(16777216),
- NUMBER VARCHAR(16777216),
- CODE VARCHAR(16777216),
- FORENAME VARCHAR(16777216),
- SURNAME VARCHAR(16777216),
- DOB DATE,
- NATIONALITY VARCHAR(16777216),
- URL VARCHAR(16777216)
- );
- copy into drivers
- from @formula1_stage/drivers.csv
- on_error='continue';
-
- create or replace table formula1.raw.lap_times (
- RACEID NUMBER(38,0),
- DRIVERID NUMBER(38,0),
- LAP NUMBER(38,0),
- POSITION FLOAT,
- TIME VARCHAR(16777216),
- MILLISECONDS NUMBER(38,0)
- );
- copy into lap_times
- from @formula1_stage/lap_times.csv
- on_error='continue';
-
- create or replace table formula1.raw.pit_stops (
- RACEID NUMBER(38,0),
- DRIVERID NUMBER(38,0),
- STOP NUMBER(38,0),
- LAP NUMBER(38,0),
- TIME VARCHAR(16777216),
- DURATION VARCHAR(16777216),
- MILLISECONDS NUMBER(38,0)
- );
- copy into pit_stops
- from @formula1_stage/pit_stops.csv
- on_error='continue';
-
- create or replace table formula1.raw.races (
- RACEID NUMBER(38,0),
- YEAR NUMBER(38,0),
- ROUND NUMBER(38,0),
- CIRCUITID NUMBER(38,0),
- NAME VARCHAR(16777216),
- DATE DATE,
- TIME VARCHAR(16777216),
- URL VARCHAR(16777216),
- FP1_DATE VARCHAR(16777216),
- FP1_TIME VARCHAR(16777216),
- FP2_DATE VARCHAR(16777216),
- FP2_TIME VARCHAR(16777216),
- FP3_DATE VARCHAR(16777216),
- FP3_TIME VARCHAR(16777216),
- QUALI_DATE VARCHAR(16777216),
- QUALI_TIME VARCHAR(16777216),
- SPRINT_DATE VARCHAR(16777216),
- SPRINT_TIME VARCHAR(16777216)
- );
- copy into races
- from @formula1_stage/races.csv
- on_error='continue';
-
- create or replace table formula1.raw.results (
- RESULTID NUMBER(38,0),
- RACEID NUMBER(38,0),
- DRIVERID NUMBER(38,0),
- CONSTRUCTORID NUMBER(38,0),
- NUMBER NUMBER(38,0),
- GRID NUMBER(38,0),
- POSITION FLOAT,
- POSITIONTEXT VARCHAR(16777216),
- POSITIONORDER NUMBER(38,0),
- POINTS NUMBER(38,0),
- LAPS NUMBER(38,0),
- TIME VARCHAR(16777216),
- MILLISECONDS NUMBER(38,0),
- FASTESTLAP NUMBER(38,0),
- RANK NUMBER(38,0),
- FASTESTLAPTIME VARCHAR(16777216),
- FASTESTLAPSPEED FLOAT,
- STATUSID NUMBER(38,0)
- );
- copy into results
- from @formula1_stage/results.csv
- on_error='continue';
-
- create or replace table formula1.raw.status (
- STATUSID NUMBER(38,0),
- STATUS VARCHAR(16777216)
- );
- copy into status
- from @formula1_stage/status.csv
- on_error='continue';
-
- ```
-5. Ensure all the commands are selected before running the query — an easy way to do this is to use Ctrl-a to highlight all of the code in the worksheet. Select **run** (blue triangle icon). Notice how the dot next to your **COMPUTE_WH** turns from gray to green as you run the query. The **status** table is the final table of all 8 tables loaded in.
-
-
-
-6. Let’s unpack that pretty long query we ran into component parts. We ran this query to load in our 8 Formula 1 tables from a public S3 bucket. To do this, we:
- - Created a new database called `formula1` and a schema called `raw` to place our raw (untransformed) data into.
-    - Defined our file format for our CSV files. Importantly, here we use a parameter called `field_optionally_enclosed_by =` since the string columns in our Formula 1 CSV files use quotes. Quotes are used around string values to avoid parsing issues where commas `,` and new lines `\n` in data values could cause data loading errors (see the short Python sketch after this list for an illustration).
- - Created a stage to locate our data we are going to load in. Snowflake Stages are locations where data files are stored. Stages are used to both load and unload data to and from Snowflake locations. Here we are using an external stage, by referencing an S3 bucket.
- - Created our tables for our data to be copied into. These are empty tables with the column name and data type. Think of this as creating an empty container that the data will then fill into.
-    - Used the `copy into` statement for each of our tables. We reference the staged location we created, and on loading errors we continue to load in the rest of the data. You should not have data loading errors, but if you do, those rows will be skipped and Snowflake will tell you which rows caused errors.
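-
-    As a quick, hypothetical illustration of why that quoting matters (using Python's built-in `csv` module rather than Snowflake), notice how a quoted value can safely contain a comma without being split into two columns:
-
-    ```python
-    import csv
-    import io
-
-    # a quoted string value that contains a comma
-    raw = 'circuit_name,location\n"Autodromo Nazionale di Monza, Italy",Monza\n'
-
-    rows = list(csv.reader(io.StringIO(raw)))
-    print(rows[1])  # ['Autodromo Nazionale di Monza, Italy', 'Monza']: the comma stays inside the quoted field
-    ```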
-
-7. Now let's take a look at some of our cool Formula 1 data we just loaded up!
- 1. Create a new worksheet by selecting the **+** then **New Worksheet**.
-
- 2. Navigate to **Database > Formula1 > RAW > Tables**.
- 3. Query the data using the following code. There are only 76 rows in the circuits table, so we don’t need to worry about limiting the amount of data we query.
- ```sql
- select * from formula1.raw.circuits
- ```
- 4. Run the query. From here on out, we’ll use the keyboard shortcuts Command-Enter or Control-Enter to run queries and won’t explicitly call out this step.
-    5. Review the query results; you should see information about Formula 1 circuits, starting with Albert Park in Australia!
- 6. Finally, ensure you have all 8 tables starting with `CIRCUITS` and ending with `STATUS`. Now we are ready to connect into dbt Cloud!
-
-
-
-
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md
deleted file mode 100644
index 21eaa7e8d7f..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md
+++ /dev/null
@@ -1,27 +0,0 @@
----
-title: "Configure dbt"
-id: "4-configure-dbt"
-description: "Configure dbt"
----
-
-1. We are going to be using [Snowflake Partner Connect](https://docs.snowflake.com/en/user-guide/ecosystem-partner-connect.html) to set up a dbt Cloud account. Using this method will allow you to spin up a fully fledged dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [managed repository](/docs/collaborate/git/managed-repository), environments, and credentials already established.
-2. Navigate out of your worksheet back by selecting **home**.
-3. In Snowsight, confirm that you are using the **ACCOUNTADMIN** role.
-4. Navigate to **Admin > Partner Connect**. Find **dbt** either by using the search bar or by navigating to **Data Integration**. Select the **dbt** tile.
-
-5. You should now see a new window that says **Connect to dbt**. Select **Optional Grant** and add the `FORMULA1` database. This will grant access for your new dbt user role to the FORMULA1 database.
-
-
-6. Ensure `FORMULA1` is present in your optional grant before clicking **Connect**. This will create a dedicated dbt user, database, warehouse, and role for your dbt Cloud trial.
-
-
-
-7. When you see the **Your partner account has been created** window, click **Activate**.
-
-8. You should be redirected to a dbt Cloud registration page. Fill out the form. Make sure to save the password somewhere for login in the future.
-
-
-
-9. Select **Complete Registration**. You should now be redirected to your dbt Cloud account, complete with a connection to your Snowflake account, a deployment and a development environment, and a sample job.
-
-10. To help you version control your dbt project, we have connected it to a [managed repository](/docs/collaborate/git/managed-repository), which means that dbt Labs will be hosting your repository for you. This will give you access to a Git workflow without you having to create and host the repository yourself. You will not need to know Git for this workshop; dbt Cloud will help guide you through the workflow. In the future, when you’re developing your own project, [feel free to use your own repository](/docs/cloud/git/connect-github). This will allow you to learn more about features like [Slim CI](/docs/deploy/continuous-integration) builds after this workshop.
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md
deleted file mode 100644
index f098c47bdad..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md
+++ /dev/null
@@ -1,46 +0,0 @@
----
-title: "Development schema name and IDE walkthrough"
-id: "5-development-schema-name"
-description: "Development schema name and IDE walkthrough"
----
-
-1. First we are going to change the name of our default schema, which is where our dbt models will build. By default, the name is `dbt_`. We will change this to `dbt_YOUR_NAME` to create your own personal development schema. To do this, select **Profile Settings** from the gear icon in the upper right.
-
-
-
-2. Navigate to the **Credentials** menu and select **Partner Connect Trial**, which will expand the credentials menu.
-
-
-
-3. Click **Edit** and change the name of your schema from `dbt_` to `dbt_YOUR_NAME` replacing `YOUR_NAME` with your initials and name (`hwatson` is used in the lab screenshots). Be sure to click **Save** for your changes!
-
-
-4. We now have our own personal development schema, amazing! When we run our first dbt models they will build into this schema.
-5. Let’s open up dbt Cloud’s Integrated Development Environment (IDE) and familiarize ourselves. Choose **Develop** at the top of the UI.
-
-6. When the IDE is done loading, click **Initialize dbt project**. The initialization process creates a collection of files and folders necessary to run your dbt project.
-
-
-7. After the initialization is finished, you can view the files and folders in the file tree menu. As we move through the workshop we'll be sure to touch on a few key files and folders that we'll work with to build out our project.
-8. Next click **Commit and push** to commit the new files and folders from the initialize step. We always want our commit messages to be relevant to the work we're committing, so be sure to provide a message like `initialize project` and select **Commit Changes**.
-
-
-
-
-
-9. [Committing](https://www.atlassian.com/git/tutorials/saving-changes/git-commit) your work here will save it to the managed git repository that was created during the Partner Connect signup. This initial commit is the only commit that will be made directly to our `main` branch and from *here on out we'll be doing all of our work on a development branch*. This allows us to keep our development work separate from our production code.
-10. There are a couple of key features to point out about the IDE before we get to work. It is a text editor, a SQL and Python runner, and a CLI with Git version control, all baked into one package! This allows you to focus on editing your SQL and Python files, previewing the results with the SQL runner (it even runs Jinja!), and building models at the command line without having to move between different applications. The Git workflow in dbt Cloud allows Git beginners and experts alike to easily version control all of their work with a couple of clicks.
-
-
-
-11. Let's run our first dbt models! Two example models are included in your dbt project in the `models/examples` folder that we can use to illustrate how to run dbt at the command line. Type `dbt run` into the command line and press **Enter** on your keyboard. When the run bar expands you'll be able to see the results of the run, where you should see the run complete successfully.
-
-
-
-12. The run results allow you to see the code that dbt compiles and sends to Snowflake for execution. To view the logs for this run, select one of the model tabs using the **>** icon and then **Details**. If you scroll down a bit you'll be able to see the compiled code and how dbt interacts with Snowflake. Given that this run took place in our development environment, the models were created in your development schema.
-
-
-
-
-13. Now let's switch over to Snowflake to confirm that the objects were actually created. Click on the three dots **…** above your database objects and then **Refresh**. Expand the **PC_DBT_DB** database and you should see your development schema. Select the schema, then **Tables** and **Views**. Now you should be able to see `MY_FIRST_DBT_MODEL` as a table and `MY_SECOND_DBT_MODEL` as a view.
-
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md
deleted file mode 100644
index e387b208dd1..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md
+++ /dev/null
@@ -1,80 +0,0 @@
----
-title: "Foundational structure"
-id: "6-foundational-structure"
-description: "Foundational structure"
----
-
-In this step, we’ll need to create a development branch and set up project level configurations.
-
-1. To get started with development for our project, we'll need to create a new Git branch for our work. Select **create branch** and name your development branch. We'll call our branch `snowpark_python_workshop` then click **Submit**.
-2. The first piece of development we'll do on the project is to update the `dbt_project.yml` file. Every dbt project requires a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. The [dbt_project.yml](/reference/dbt_project.yml) file also contains important information that tells dbt how to operate on your project.
-3. Select the `dbt_project.yml` file from the file tree to open it and replace all of the existing contents with the following code below. When you're done, save the file by clicking **save**. You can also use the Command-S or Control-S shortcut from here on out.
-
- ```yaml
- # Name your project! Project names should contain only lowercase characters
- # and underscores. A good package name should reflect your organization's
- # name or the intended use of these models
- name: 'snowflake_dbt_python_formula1'
- version: '1.3.0'
- require-dbt-version: '>=1.3.0'
- config-version: 2
-
- # This setting configures which "profile" dbt uses for this project.
- profile: 'default'
-
- # These configurations specify where dbt should look for different types of files.
- # The `model-paths` config, for example, states that models in this project can be
- # found in the "models/" directory. You probably won't need to change these!
- model-paths: ["models"]
- analysis-paths: ["analyses"]
- test-paths: ["tests"]
- seed-paths: ["seeds"]
- macro-paths: ["macros"]
- snapshot-paths: ["snapshots"]
-
- target-path: "target" # directory which will store compiled SQL files
- clean-targets: # directories to be removed by `dbt clean`
- - "target"
- - "dbt_packages"
-
- models:
- snowflake_dbt_python_formula1:
- staging:
-
- +docs:
- node_color: "CadetBlue"
- marts:
- +materialized: table
- aggregates:
- +docs:
- node_color: "Maroon"
- +tags: "bi"
-
- core:
- +docs:
- node_color: "#800080"
- intermediate:
- +docs:
- node_color: "MediumSlateBlue"
- ml:
- prep:
- +docs:
- node_color: "Indigo"
- train_predict:
- +docs:
- node_color: "#36454f"
-
- ```
-
-4. The key configurations to point out in the file with relation to the work that we're going to do are in the `models` section.
-    - `require-dbt-version` — Tells dbt which version of dbt to use for your project. We are requiring 1.3.0 or any newer version so we can run Python models and use node colors.
- - `materialized` — Tells dbt how to materialize models when compiling the code before it pushes it down to Snowflake. All models in the `marts` folder will be built as tables.
- - `tags` — Applies tags at a directory level to all models. All models in the `aggregates` folder will be tagged as `bi` (abbreviation for business intelligence).
- - `docs` — Specifies the `node_color` either by the plain color name or a hex value.
-5. [Materializations](/docs/build/materializations) are strategies for persisting dbt models in a warehouse, with `tables` and `views` being the most commonly utilized types. By default, all dbt models are materialized as views and other materialization types can be configured in the `dbt_project.yml` file or in a model itself. It’s very important to note *Python models can only be materialized as tables or incremental models.* Since all our Python models exist under `marts`, the following portion of our `dbt_project.yml` ensures no errors will occur when we run our Python models. Starting with [dbt version 1.4](/guides/migration/versions/upgrading-to-v1.4#updates-to-python-models), Python files will automatically get materialized as tables even if not explicitly specified.
-
- ```yaml
- marts:
- +materialized: table
- ```
-
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md
deleted file mode 100644
index a47a3b54d48..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md
+++ /dev/null
@@ -1,27 +0,0 @@
----
-title: "Folder structure"
-id: "7-folder-structure"
-description: "Folder structure"
----
-dbt Labs has developed a [project structure guide](/guides/best-practices/how-we-structure/1-guide-overview/) that contains a number of recommendations for how to build the folder structure for your project. Do check out that guide if you want to learn more. Right now we are going to create some folders to organize our files:
-
-- Sources — This is our Formula 1 dataset and it will be defined in a source YAML file.
-- Staging models — These models have a 1:1 with their source table.
-- Intermediate — This is where we will be joining some Formula 1 staging models.
-- Marts models — Here is where we perform our major transformations. It contains these subfolders:
- - aggregates
- - core
- - ml
-1. In your file tree, use your cursor and hover over the `models` subdirectory, click the three dots **…** that appear to the right of the folder name, then select **Create Folder**. We're going to add two new folders to the file path, `staging` and `formula1` (in that order) by typing `staging/formula1` into the file path.
-
-
-
-
- - If you click into your `models` directory now, you should see the new `staging` folder nested within `models` and the `formula1` folder nested within `staging`.
-2. Create two additional folders the same as the last step. Within the `models` subdirectory, create new directories `marts/core`.
-
-3. We will need to create a few more folders and subfolders using the UI. After you create all the necessary folders, your folder tree should look like this when it's all done:
-
-
-
-Remember, you can always reference the entire project in [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1) to view the complete folder and file structure.
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md
deleted file mode 100644
index 22e49c8a30b..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md
+++ /dev/null
@@ -1,334 +0,0 @@
----
-title: "Sources and staging"
-id: "8-sources-and-staging"
-description: "Sources and staging"
----
-
-In this section, we are going to create our source and staging models.
-
-Sources allow us to create a dependency between our source database object and our staging models, which will help us later when we look at data lineage. Also, if your source changes database or schema, you only have to update it in your `f1_sources.yml` file rather than updating all of the models it might be used in.
-
-Staging models are the base of our project, where we bring all the individual components we're going to use to build our more complex and useful models into the project.
-
-Since we want to focus on dbt and Python in this workshop, check out our [sources](/docs/build/sources) and [staging](/guides/best-practices/how-we-structure/2-staging) docs if you want to learn more (or take our [dbt Fundamentals](https://courses.getdbt.com/collections) course which covers all of our core functionality).
-
-## Create sources
-
-We're going to be using each of our 8 Formula 1 tables from our `formula1` database under the `raw` schema for our transformations and we want to create those tables as sources in our project.
-
-1. Create a new file called `f1_sources.yml` with the following file path: `models/staging/formula1/f1_sources.yml`.
-2. Then, paste the following code into the file before saving it:
-
-```yaml
-version: 2
-
-sources:
- - name: formula1
- description: formula 1 datasets with normalized tables
- database: formula1
- schema: raw
- tables:
- - name: circuits
- description: One record per circuit, which is the specific race course.
- columns:
- - name: circuitid
- tests:
- - unique
- - not_null
- - name: constructors
- description: One record per constructor. Constructors are the teams that build their formula 1 cars.
- columns:
- - name: constructorid
- tests:
- - unique
- - not_null
- - name: drivers
- description: One record per driver. This table gives details about the driver.
- columns:
- - name: driverid
- tests:
- - unique
- - not_null
- - name: lap_times
- description: One row per lap in each race. Lap times started being recorded in this dataset in 1984 and joined through driver_id.
- - name: pit_stops
- description: One row per pit stop. Pit stops do not have their own id column, the combination of the race_id and driver_id identify the pit stop.
- columns:
- - name: stop
- tests:
- - accepted_values:
- values: [1,2,3,4,5,6,7,8]
- quote: false
- - name: races
- description: One race per row. Importantly this table contains the race year to understand trends.
- columns:
- - name: raceid
- tests:
- - unique
- - not_null
- - name: results
- columns:
- - name: resultid
- tests:
- - unique
- - not_null
- description: One row per result. The main table that we join out for grid and position variables.
- - name: status
- description: One status per row. The status contextualizes whether the race was finished or what issues arose e.g. collisions, engine, etc.
- columns:
- - name: statusid
- tests:
- - unique
- - not_null
-```
-
-## Create staging models
-
-The next step is to set up the staging models for each of the 8 source tables. Given the one-to-one relationship between staging models and their corresponding source tables, we'll build 8 staging models here. We know it’s a lot and in the future, we will seek to update the workshop to make this step less repetitive and more efficient. This step is also a good representation of the real world of data, where you have multiple hierarchical tables that you will need to join together!
-
-1. Let's go in alphabetical order to easily keep track of all our staging models! Create a new file called `stg_f1_circuits.sql` with this file path `models/staging/formula1/stg_f1_circuits.sql`. Then, paste the following code into the file before saving it:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','circuits') }}
-
- ),
-
- renamed as (
- select
- circuitid as circuit_id,
- circuitref as circuit_ref,
- name as circuit_name,
- location,
- country,
- lat as latitude,
- lng as longitude,
- alt as altitude
- -- omit the url
- from source
- )
- select * from renamed
- ```
-
- All we're doing here is pulling the source data into the model using the `source` function, renaming some columns, and omitting the column `url` with a commented note since we don’t need it for our analysis.
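-
-   For reference, all the `source` function does at compile time is swap in the fully qualified object we declared in `f1_sources.yml`. As a rough sketch (exact quoting can vary with your configuration), the compiled SQL looks like:
-
-    ```sql
-    -- approximate compiled output of the source('formula1','circuits') call above
-    select * from formula1.raw.circuits
-    ```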
-
-1. Create `stg_f1_constructors.sql` with this file path `models/staging/formula1/stg_f1_constructors.sql`. Paste the following code into it before saving the file:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','constructors') }}
-
- ),
-
- renamed as (
- select
- constructorid as constructor_id,
- constructorref as constructor_ref,
- name as constructor_name,
- nationality as constructor_nationality
- -- omit the url
- from source
- )
-
- select * from renamed
- ```
-
-   We have 6 other staging models to create. We can do this by creating new files, then copying and pasting the code into our `staging` folder.
-
-1. Create `stg_f1_drivers.sql` with this file path `models/staging/formula1/stg_f1_drivers.sql`:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','drivers') }}
-
- ),
-
- renamed as (
- select
- driverid as driver_id,
- driverref as driver_ref,
- number as driver_number,
- code as driver_code,
- forename,
- surname,
- dob as date_of_birth,
- nationality as driver_nationality
- -- omit the url
- from source
- )
-
- select * from renamed
- ```
-1. Create `stg_f1_lap_times.sql` with this file path `models/staging/formula1/stg_f1_lap_times.sql`:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','lap_times') }}
-
- ),
-
- renamed as (
- select
- raceid as race_id,
- driverid as driver_id,
- lap,
- position,
- time as lap_time_formatted,
- milliseconds as lap_time_milliseconds
- from source
- )
-
- select * from renamed
- ```
-1. Create `stg_f1_pit_stops.sql` with this file path `models/staging/formula1/stg_f1_pit_stops.sql`:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','pit_stops') }}
-
- ),
-
- renamed as (
- select
- raceid as race_id,
- driverid as driver_id,
- stop as stop_number,
- lap,
- time as lap_time_formatted,
- duration as pit_stop_duration_seconds,
- milliseconds as pit_stop_milliseconds
- from source
- )
-
- select * from renamed
- order by pit_stop_duration_seconds desc
- ```
-
-1. Create `stg_f1_races.sql` with this file path `models/staging/formula1/stg_f1_races.sql`:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','races') }}
-
- ),
-
- renamed as (
- select
- raceid as race_id,
- year as race_year,
- round as race_round,
- circuitid as circuit_id,
- name as circuit_name,
- date as race_date,
- to_time(time) as race_time,
- -- omit the url
- fp1_date as free_practice_1_date,
- fp1_time as free_practice_1_time,
- fp2_date as free_practice_2_date,
- fp2_time as free_practice_2_time,
- fp3_date as free_practice_3_date,
- fp3_time as free_practice_3_time,
- quali_date as qualifying_date,
- quali_time as qualifying_time,
- sprint_date,
- sprint_time
- from source
- )
-
- select * from renamed
- ```
-1. Create `stg_f1_results.sql` with this file path `models/staging/formula1/stg_f1_results.sql`:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','results') }}
-
- ),
-
- renamed as (
- select
- resultid as result_id,
- raceid as race_id,
- driverid as driver_id,
- constructorid as constructor_id,
- number as driver_number,
- grid,
- position::int as position,
- positiontext as position_text,
- positionorder as position_order,
- points,
- laps,
- time as results_time_formatted,
- milliseconds as results_milliseconds,
- fastestlap as fastest_lap,
- rank as results_rank,
- fastestlaptime as fastest_lap_time_formatted,
- fastestlapspeed::decimal(6,3) as fastest_lap_speed,
- statusid as status_id
- from source
- )
-
- select * from renamed
- ```
-1. Last one! Create `stg_f1_status.sql` with this file path: `models/staging/formula1/stg_f1_status.sql`:
-
- ```sql
- with
-
- source as (
-
- select * from {{ source('formula1','status') }}
-
- ),
-
- renamed as (
- select
- statusid as status_id,
- status
- from source
- )
-
- select * from renamed
- ```
- After the source and all the staging models are complete for each of the 8 tables, your staging folder should look like this:
-
-
-
-1. It’s a good time to delete our example folder since these two models are extraneous to our formula1 pipeline and `my_first_model` fails a `not_null` test that we won’t spend time investigating. dbt Cloud will warn us that this folder will be permanently deleted, and we are okay with that so select **Delete**.
-
-
-
-1. Now that the staging models are built and saved, it's time to create the models in our development schema in Snowflake. To do this, we're going to enter `dbt build` into the command line to run all of the models in our project, which now includes our 8 new staging models.
-
- Your run should complete successfully and you should see green checkmarks next to all of your models in the run results. We built our 8 staging models as views and ran 13 source tests that we configured in the `f1_sources.yml` file with not that much code, pretty cool!
-
-
-
- Let's take a quick look in Snowflake, refresh database objects, open our development schema, and confirm that the new models are there. If you can see them, then we're good to go!
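-
-   If you prefer to confirm this from a worksheet instead of the object explorer, a quick metadata query works too. This is just a sketch — swap in the development database and schema from your dbt Cloud credentials, which appear below as placeholders:
-
-    ```sql
-    -- <dev_database> and <dev_schema> are placeholders for your own development location
-    show views like 'stg_f1_%' in schema <dev_database>.<dev_schema>;
-    ```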
-
-
-
- Before we move onto the next section, be sure to commit your new models to your Git branch. Click **Commit and push** and give your commit a message like `profile, sources, and staging setup` before moving on.
-
-
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md
deleted file mode 100644
index 262bf0e5e52..00000000000
--- a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md
+++ /dev/null
@@ -1,299 +0,0 @@
----
-title: "SQL transformations"
-id: "9-sql-transformations"
-description: "SQL transformations"
----
-
-Now that we have all our sources and staging models done, it's time to move into where dbt shines — transformation!
-
-We need to:
-
-- Create some intermediate tables to join tables that aren’t hierarchical
-- Create core tables for business intelligence (BI) tool ingestion
-- Answer two questions about our Formula 1 data by creating aggregate models using Python:
-  - Which constructor has the fastest pit stops?
-  - How have lap times trended over the years?
-
-## Intermediate models
-
-We need to join lots of reference tables to our results table to create a human-readable dataframe. What does this mean? For example, we don’t want only the numeric `status_id` in our table; we want to be able to read from a row of data that a driver could not finish a race due to engine failure (`status_id=5`).
-
-By now, we are pretty good at creating new files in the correct directories so we won’t cover this in detail. All intermediate models should be created in the path `models/intermediate`.
-
-1. Create a new file called `int_lap_times_years.sql`. In this model, we are joining our lap time and race information so we can look at lap times over years. In earlier Formula 1 eras, lap times were not recorded (only final results), so we filter out records where lap times are null.
-
- ```sql
- with lap_times as (
-
- select * from {{ ref('stg_f1_lap_times') }}
-
- ),
-
- races as (
-
- select * from {{ ref('stg_f1_races') }}
-
- ),
-
- expanded_lap_times_by_year as (
- select
- lap_times.race_id,
- driver_id,
- race_year,
- lap,
- lap_time_milliseconds
- from lap_times
- left join races
- on lap_times.race_id = races.race_id
- where lap_time_milliseconds is not null
- )
-
- select * from expanded_lap_times_by_year
- ```
-
-2. Create a file called `int_pit_stops.sql`. Pit stops are a many-to-one (M:1) relationship with our races. We are creating a feature called `total_pit_stops_per_race` by partitioning over our `race_id` and `driver_id`, while preserving individual-level pit stops for the rolling average in our next section.
-
- ```sql
- with stg_f1__pit_stops as
- (
- select * from {{ ref('stg_f1_pit_stops') }}
- ),
-
- pit_stops_per_race as (
- select
- race_id,
- driver_id,
- stop_number,
- lap,
- lap_time_formatted,
- pit_stop_duration_seconds,
- pit_stop_milliseconds,
- max(stop_number) over (partition by race_id,driver_id) as total_pit_stops_per_race
- from stg_f1__pit_stops
- )
-
- select * from pit_stops_per_race
- ```
-
-3. Create a file called `int_results.sql`. Here we are using 4 of our tables — `races`, `drivers`, `constructors`, and `status` — to give context to our `results` table. We are now able to calculate a new feature `drivers_age_years` by bringing the `date_of_birth` and `race_year` into the same table. We are also creating a column called `dnf_flag` to indicate whether the driver did not finish (DNF) the race, based on whether their `position` was null.
-
- ```sql
- with results as (
-
- select * from {{ ref('stg_f1_results') }}
-
- ),
-
- races as (
-
- select * from {{ ref('stg_f1_races') }}
-
- ),
-
- drivers as (
-
- select * from {{ ref('stg_f1_drivers') }}
-
- ),
-
- constructors as (
-
- select * from {{ ref('stg_f1_constructors') }}
- ),
-
- status as (
-
- select * from {{ ref('stg_f1_status') }}
- ),
-
- int_results as (
- select
- result_id,
- results.race_id,
- race_year,
- race_round,
- circuit_id,
- circuit_name,
- race_date,
- race_time,
- results.driver_id,
- results.driver_number,
- forename ||' '|| surname as driver,
- cast(datediff('year', date_of_birth, race_date) as int) as drivers_age_years,
- driver_nationality,
- results.constructor_id,
- constructor_name,
- constructor_nationality,
- grid,
- position,
- position_text,
- position_order,
- points,
- laps,
- results_time_formatted,
- results_milliseconds,
- fastest_lap,
- results_rank,
- fastest_lap_time_formatted,
- fastest_lap_speed,
- results.status_id,
- status,
- case when position is null then 1 else 0 end as dnf_flag
- from results
- left join races
- on results.race_id=races.race_id
- left join drivers
- on results.driver_id = drivers.driver_id
- left join constructors
- on results.constructor_id = constructors.constructor_id
- left join status
- on results.status_id = status.status_id
- )
-
- select * from int_results
- ```
-1. Create a *Markdown* file `intermediate.md` that we will go over in depth during the [Testing](/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) and [Documentation](/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation) sections.
-
- ```markdown
- # the intent of this .md is to allow for multi-line long form explanations for our intermediate transformations
-
- # below are descriptions
- {% docs int_results %} In this query we want to join out other important information about the race results to have a human readable table about results, races, drivers, constructors, and status.
- We will have 4 left joins onto our results table. {% enddocs %}
-
- {% docs int_pit_stops %} There are many pit stops within one race, aka a M:1 relationship.
- We want to aggregate this so we can properly join pit stop information without creating a fanout. {% enddocs %}
-
- {% docs int_lap_times_years %} Lap times are done per lap. We need to join them out to the race year to understand yearly lap time trends. {% enddocs %}
- ```
-1. Create a *YAML* file `intermediate.yml` that we will go over in depth during the [Testing](/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) and [Documentation](/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation) sections.
-
- ```yaml
- version: 2
-
- models:
- - name: int_results
- description: '{{ doc("int_results") }}'
- - name: int_pit_stops
- description: '{{ doc("int_pit_stops") }}'
- - name: int_lap_times_years
- description: '{{ doc("int_lap_times_years") }}'
- ```
- That wraps up the intermediate models we need to create our core models!
-
-## Core models
-
-1. Create a file `fct_results.sql`. This is what I like to refer to as the “mega table” — a really large denormalized table with all our context added in at row level for human readability. Importantly, we have a table `circuits` that is linked through the table `races`. When we joined `races` to `results` in `int_results.sql` we allowed our tables to make the connection from `circuits` to `results` in `fct_results.sql`. We are only taking information about pit stops at the result level so our join would not cause a [fanout](https://community.looker.com/technical-tips-tricks-1021/what-is-a-fanout-23327).
-
- ```sql
- with int_results as (
-
- select * from {{ ref('int_results') }}
-
- ),
-
- int_pit_stops as (
- select
- race_id,
- driver_id,
- max(total_pit_stops_per_race) as total_pit_stops_per_race
- from {{ ref('int_pit_stops') }}
- group by 1,2
- ),
-
- circuits as (
-
- select * from {{ ref('stg_f1_circuits') }}
- ),
- base_results as (
- select
- result_id,
- int_results.race_id,
- race_year,
- race_round,
- int_results.circuit_id,
- int_results.circuit_name,
- circuit_ref,
- location,
- country,
- latitude,
- longitude,
- altitude,
- total_pit_stops_per_race,
- race_date,
- race_time,
- int_results.driver_id,
- driver,
- driver_number,
- drivers_age_years,
- driver_nationality,
- constructor_id,
- constructor_name,
- constructor_nationality,
- grid,
- position,
- position_text,
- position_order,
- points,
- laps,
- results_time_formatted,
- results_milliseconds,
- fastest_lap,
- results_rank,
- fastest_lap_time_formatted,
- fastest_lap_speed,
- status_id,
- status,
- dnf_flag
- from int_results
- left join circuits
- on int_results.circuit_id=circuits.circuit_id
- left join int_pit_stops
- on int_results.driver_id=int_pit_stops.driver_id and int_results.race_id=int_pit_stops.race_id
- )
-
- select * from base_results
- ```
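-
-   If you'd like to see why we pre-aggregate the pit stops, here's a scratch query you can preview in the IDE (a sketch for exploration, not a model to save). Joining the raw pit stop grain directly onto results returns more rows than results alone — that's the fanout we're avoiding:
-
-    ```sql
-    -- compare row counts with and without the pre-aggregated pit stops
-    select
-        (select count(*) from {{ ref('int_results') }}) as result_rows,
-        (
-            select count(*)
-            from {{ ref('int_results') }} as r
-            left join {{ ref('int_pit_stops') }} as p
-                on r.race_id = p.race_id
-                and r.driver_id = p.driver_id
-        ) as fanned_out_rows
-    ```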
-
-1. Create the file `pit_stops_joined.sql`. Our results and pit stops are at different levels of dimensionality (also called grain). Simply put, we have multiple pit stops per result. Since we are interested in understanding information at the pit stop level, along with race year and constructor, we will create a new table `pit_stops_joined.sql` where each row is one pit stop. Our new table tees up our aggregation in Python.
-
- ```sql
- with base_results as (
-
- select * from {{ ref('fct_results') }}
-
- ),
-
- pit_stops as (
-
- select * from {{ ref('int_pit_stops') }}
-
- ),
-
- pit_stops_joined as (
-
- select
- base_results.race_id,
- race_year,
- base_results.driver_id,
- constructor_id,
- constructor_name,
- stop_number,
- lap,
- lap_time_formatted,
- pit_stop_duration_seconds,
- pit_stop_milliseconds
- from base_results
- left join pit_stops
- on base_results.race_id=pit_stops.race_id and base_results.driver_id=pit_stops.driver_id
- )
- select * from pit_stops_joined
- ```
-
-1. In the command line, execute `dbt build` to build out our entire pipeline up to this point. Don’t worry about “overriding” your previous models – dbt workflows are designed to be idempotent so we can run them again and expect the same results.
-
-1. Let’s talk about our lineage so far. It’s looking good 😎. We’ve shown how SQL can be used to make data type and column name changes and to handle hierarchical joins really well, all while building out our automated lineage!
-
-
-
-1. Time to **Commit and push** our changes and give your commit a message like `intermediate and fact models` before moving on.
diff --git a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md b/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md
deleted file mode 100644
index f2fffd43994..00000000000
--- a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md
+++ /dev/null
@@ -1,660 +0,0 @@
----
-title: "dbt Semantic Layer integration"
-id: "sl-partner-integration-guide"
-description: Learn about partner integration guidelines, roadmap, and connectivity.
----
-
-# dbt Semantic Layer partner integration
-
-:::info Coming soon
-The dbt Semantic Layer is undergoing some sophisticated changes, enabling more complex metric definitions and efficient querying. As part of these changes, the dbt_metrics package will be deprecated and replaced with MetricFlow. For more info, check out the [The dbt Semantic Layer: what's next?](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/) and [dbt_metrics deprecation](https://docs.getdbt.com/blog/deprecating-dbt-metrics) blog.
-:::
-
-This guide is for dbt Semantic Layer integration partners and explains integration guidelines and connectivity.
-
-To become a formal partner, integrate with the API, or have questions/feedback — **[contact us](mailto:semantic-layer@dbtlabs.com)** for more info.
-
-The dbt Semantic Layer allows users to dynamically generate and query datasets in downstream tools based on their dbt governed assets, such as metrics, models, and entities. It helps organizations manage complexities such as data, tools, and teams to make more efficient and trustworthy decisions.
-
-The rapid growth of different tools in the modern data stack has helped data professionals address the diverse needs of different teams. The downside of this growth is the fragmentation of business logic across teams, tools, and workloads.
-
-To solve this, the dbt Semantic Layer provides a platform where users can confidently leverage their data from within their tools. dbt Cloud's change management capabilities ensure that any user modifications made to core business constructs, like metrics or entities, are distributed into all the tools connected to the data platform.
-
-The dbt Semantic Layer can be used for a variety of tools and applications of data. Here are some common use cases:
-
-* Business intelligence (BI), reporting, and analytics,
-* Data quality and monitoring,
-* Governance and privacy,
-* Data discovery and cataloging,
-* Machine learning and data science.
-
-:::info Share your use case
-
-If you'd like to share other use cases for the dbt Semantic Layer, contact the [dbt Labs team](mailto:semantic-layer@dbtlabs.com).
-
-:::
-
-
-## Product overview
-
-The dbt Semantic Layer product architecture includes four primary components:
-
-| Components | Information | Developer plans | Team plans | Enterprise plans | License |
-| --- | --- | :---: | :---: | :---: | --- |
-| **[dbt Project](/docs/build/metrics)** | Define models and metrics in dbt Core. | ✅ | ✅ | ✅ | Open source in dbt Core |
-| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt Core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL |
-| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja) to query models and metrics and use macros, compiles the query into pure SQL, executes the query in the data platform, and returns the data. | ✅ _* Available during Public Preview only_ | ✅ | ✅ | Proprietary in dbt Cloud |
-| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, and metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary in dbt Cloud |
-
-Review the following current architecture to understand how the components work together:
-
-
-
-
-## Integration guidelines
-
-In collaboration with dbt Labs, partners and users can build dbt Semantic Layer integrations that can import model metadata and metric definitions, query metrics, use macros, and more.
-
-For more details, refer to the [Integration roadmap](#integration) and [Integration best practices](#best-practices) guidance.
-
-**Integration roadmap**
-
-Integration partners generally approach their roadmap in the following stages:
-
-| Feature | Info | Availability |
-|----------|-------|:------------:|
-| **Model metadata** | Import/sync model metadata (descriptions, dimensions, tests, freshness, and more) via the [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api). | ✅ |
-| **Metric definitions** | Import/sync metric definitions (metric calculation, dimensions, description, and more) via the [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api). | ✅ |
-| **dbt Semantic Layer as a data source** | Connect to the dbt Semantic Layer as a data source (for example, the Snowflake Proxy Server). Users can execute dbt-SQL to query metrics or models and use macros.* | ✅ |
-| **Query metrics** | Query the imported metrics via a metric-centric UI (for example, a user can select a metric, time grain, and dimensions of interest). | ✅ |
-| **Entity definitions** | Import/sync entity definitions (descriptions, dimensions, data types, relationships, metrics, and more) and query entities via the dbt Semantic Layer. | _*Coming soon |
-| **dbt Semantic Layer Connector** | A dedicated connector with the ability to query any data platform supported in dbt Cloud. (This will replace the dbt Semantic Layer as a data source approach above.) | _*Coming soon |
-
-_*The coming soon features are expected to launch in 2023.
-
-**Integration best practices**
-
-A successful and seamless dbt Semantic Layer integration should be:
-
-- **Consistent**: Have a consistent user experience (UX) incorporated into existing core user workflows.
-- **Trustworthy**: Treat dbt assets (metrics, models, and entities) as first-class objects and indicate that their definitions and resulting datasets come from dbt Cloud.
-- **Efficient**: Provide a clear advantage over the current approach to setting up metrics and analyses, and finding dimensions/datasets in the tool.
-- **Accessible**: Include a self-serve component so a data consumer can ask questions via the user interface (UI), if applicable.
-
-
-## Use the Discovery API
-
-This section will explain how to connect to and query the [Discovery API](/docs/dbt-cloud-apis/discovery-api) for model and metric definitions.
-
-To use the dbt Semantic Layer, you must meet the [prerequisites](/docs/use-dbt-semantic-layer/dbt-semantic-layer#prerequisites).
-
-
- Discovery API authorization
-
- Refer to our Authorization documentation to learn how to authorize requests to the Discovery API.
-
-  Metrics-specific queries work identically to existing Discovery API queries. This means existing integrations that query model metadata will work perfectly in the context of metrics.
-
-
-
-
-
- Query the Discovery API
-
- Test out the Discovery API by using the GraphQL sandbox and use this Python client as a starting point to develop.
-
-
-
-
-
-
-
-### Query models for a project
-
-You can query model definitions or details about a specific model for a project from a given job.
-
-
-
-
-
-
-
-This is an example of querying all models that use the `analytics` schema from a given job.
-
-```
-{
- models(jobId: 181329, schema: "analytics") {
- name
- status
- compileCompletedAt
- database
- dbtVersion
- runGeneratedAt
- }
-}
-```
-
-
-
-
-```
-{
- "data": {
- "models": [
- {
- "name": "customers",
- "status": "success",
- "compileCompletedAt": "2022-12-15T06:37:24.186Z",
- "database": "analytics",
- "dbtVersion": "1.3.1",
- "runGeneratedAt": "2022-12-15T06:37:25.187Z"
- },
- {
- "name": "stg_customers",
- "status": "success",
- "compileCompletedAt": "2022-12-15T06:37:22.509Z",
- "database": "analytics",
- "dbtVersion": "1.3.1",
- "runGeneratedAt": "2022-12-15T06:37:25.187Z"
- },
- {
- "name": "stg_orders",
- "status": "success",
- "compileCompletedAt": "2022-12-15T06:37:22.509Z",
- "database": "analytics",
- "dbtVersion": "1.3.1",
- "runGeneratedAt": "2022-12-15T06:37:25.187Z"
- }
- ]
- }
-}
-```
-
-
-
-
-This is an example of querying details about a specific model, `model.jaffle_shop.customers`, from a given job.
-
-```
-{
- model(jobId: 181329, uniqueId: "model.jaffle_shop.customers") {
- parentsModels {
- runId
- uniqueId
- executionTime
- }
- }
-}
-{
- "data": {
- "model": {
- "parentsModels": [
- {
- "runId": 105297555,
- "uniqueId": "model.jaffle_shop.stg_customers",
- "executionTime": 1.676571846008301
- },
- {
- "runId": 105297555,
- "uniqueId": "model.jaffle_shop.stg_orders",
- "executionTime": 1.631831407546997
- }
- ]
- }
- }
-}
-```
-
-
-
-
-
-
-### Query metrics for a project
-
-Query metric definitions or details for a project from a given job, and refer to the following resources:
-
-- [Metrics query](/docs/dbt-cloud-apis/discovery-schema-metrics) — Information on how to query the full list of metrics defined in a user’s project with the dbt Cloud Discovery API.
-- [dbt Metrics docs](https://docs.getdbt.com/docs/build/metrics#available-properties) — Information on the available metric properties.
-- [GraphQL sandbox](https://studio.apollographql.com/sandbox/explorer?endpoint=https%3A%2F%2Fmetadata.cloud.getdbt.com%2Fgraphql) — Access to test the dbt Cloud Discovery API testing environment.
-
-
-
-
-
-
-This is an example listing metrics from a given job:
-
-```
-{
- metrics(jobId: 123) {
- name
- label
- description
- model
- dependsOn
- calculation_method
- expression
- timestamp
- timeGrains
- dimensions
- window
- filters
- tags
- meta
- }
-}
-```
-
-
-
-
-The `metric` query supports all metric properties listed in **Listing metrics**.
-The following abbreviated example queries details about the metric `new_customers` from job `123`:
-
-```
-{
- metric(jobId: 123) {
- label
- calculation_method
- timestamp
- timeGrains
- dimensions
- }
-}
-```
-
-
-
-
-
-```
-{
- "data": {
- "metrics": [
- {
- "uniqueId": "metric.claim_to_fame.total_claim_charges",
- "name": "total_claim_charges",
- "tags": [],
- "label": "Total Claim Charges",
- "calculation_method": "sum",
- "expression": "total_charge_amount",
- "timestamp": "created_at",
- "timeGrains":[
- "day",
- "week",
- "month"
- ],
- "meta": {},
- "resourceType": "metric",
- "model": {
- "name": "fct_billed_patient_claims"
- }
- },
- {
- "uniqueId": "metric.claim_to_fame.total_billed_diagnoses",
- "name": "total_billed_diagnoses",
- "tags": [],
- "label": "Total Billed Diagnoses",
- "calculation_method": "count_distinct",
- "expression": "diagnosis_id",
- "timestamp": "created_at",
- "timeGrains":[
- "week",
- "month",
- "year"
- ],
- "meta": {},
- "resourceType": "metric",
- "model": {
- "name": "fct_billed_patient_claims"
- },
- }
- ]
- }
-}
-```
-
-
-
-
-
-```
-metrics:
- - name: total_claim_charges
- label: Total Claim Charges
- model: ref('fct_billed_patient_claims')
- calculation_method: sum
- expression: total_charge_amount
- timestamp: created_at
- time_grains: [day, week, month, all_time]
-
-
- - name: total_billed_diagnoses
- label: Total Billed Diagnoses
- model: ref('fct_billed_patient_claims')
- calculation_method: count_distinct
- expression: diagnosis_id
- timestamp: created_at
- time_grains: [day, week, month]
-```
-
-
-
-
-
-
-
-
-## Query the dbt Semantic Layer
-
-This section explains how to connect to or query the dbt Semantic Layer Proxy Server to return model data, metric data, and so on.
-
-When you configure the dbt Semantic Layer, dbt Cloud provides a Proxy Server endpoint that users can connect to as though it's a Snowflake-hosted endpoint. Once the queries are submitted, dbt Cloud will:
-
-1. Compile dbt-sql queries into valid Snowflake SQL,
-2. Execute the compiled SQL against the Snowflake data platform,
-3. Return the results to the client.
-
-Replace the hostname in your existing data platform connection with the relevant dbt Cloud Proxy Server URL (for example, `abc123.proxy.cloud.getdbt.com`). All queries you submit through the endpoint will be compiled en route to the data platform.*
-
-*_Note: This approach will change with the new Semantic Layer connection in mid-2023, which will be able to query all data platforms supported in dbt Cloud through dedicated JDBC/ODBC drivers, and eventually an API._
-
-
-
-
-
-
-
-Users can compile and execute metric queries using macros defined in the [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics). This package:
-
-- Generates the SQL required to accurately calculate the metric definition,
-- Supplies helper macros for derived calculations (like month over month, year to date, and so on) and time series operations
-
-
-```
-select *
-from {{ metrics.calculate(
-    metric_list=[metric('customers'), metric('revenue')],
- grain='week',
- dimensions=['plan', 'country'],
- secondary_calculations=[
- metrics.period_to_date(aggregate="sum", period="year"),
- metrics.rolling(aggregate="average", interval=4, alias="avg_past_4wks")
- ],
- start_date='2020-01-01',
- end_date="date_trunc('day', getdate())"
-) }}
-```
-
-
-
-
-
-Model queries allow users to query models and use macros from their dbt project.
-
-```
-select cents_to_dollars('amount_cents') as amount_dollars
-from {{ ref('orders') }}
-```
-
-
-
-### Entities
-
-
-dbt Labs will introduce a new node type, **[entity](https://github.com/dbt-labs/dbt-core/issues/6379)**, when dbt Core version 1.5 launches. It introduces a new and efficient way to define metrics by reusing logic (for example, `time_grains`).
-
-Entities are semantic objects made up of curated dimensions from models with more metadata defined. Over time, users can standardize metric and entity definitions with packages to speed up development.
-
-For integrations, entities will provide information like:
-
-- a way to organize metrics based on the entity they reference, and
-- a new consumable and dynamically generated dataset (versus finding a table in the data platform).
-
-This information will be available alongside the Discovery API, and entities can be directly queried through the dbt Semantic Layer.
-
-
-
-:::caution 🚧
-
-Entities are a work in progress — expect continuous changes and improvements. To stay up-to-date, refer to the [entity discussions](https://github.com/dbt-labs/dbt-core/issues/6379) page.
-
-:::
-
-
-
-
-
-
-
-Define entities in your dbt project.
-
-```
-entities: ## The top-level path of the new node
- - name: [Required] ## The name of the entity
- model: [Required] ## The name of the model that the entity is dependent on
- description: [Optional] ## The description of the entity
-
- dimensions: [Optional] ## The list of dimensions & properties associated with the entity.
- - include: [Optional] *
- - exclude: [Optional]
- - name: [Required] ## The name of the dimension
- column_name: [Optional] ## The name of the column in the model if not 1:1. Serves as mapping
- data_type: [Optional] ## The data type of the dimension
- description: [Optional] ## Description of the dimension
- default_timestamp: [Optional] ## Setting datetime dimension as default for metrics
- time_grains: [Optional] ## Acceptable time grains for the datetime dimension
- primary_key: [Optional] ## Whether this dimension is part of the primary key
-```
-
-
-
-
-Query entities via the Discovery API.
-
-```
-"entity.project_name.entity_name": {
- "unique_id": "entity.project_name.entity_name",
- "package_name": "project_name",
- "original_file_path": "models/metric_definitions/ratio_metric.yml",
- "name": "entity_name",
- "model": "ref('model_name')",
- "description": "some description",
- "dimensions": {
- "dimension_name": {
- "name": "dimension_name",
- "column_name": "column_name",
- "default_timestamp": "true",
-      "time_grains": "[day, week, month, year]",
- "primary_key": true,
- "data_type": null,
- "description": "TBD",
- "meta": {},
- }
- },
- "resource_type": "entity",
- "meta": {},
- "tags": [],
- "config": {
- "enabled": true,
- },
- "depends_on": {
- "macros": [],
- "nodes": [
- "model.project_name.model_name",
- ]
- },
- "docs": {
- "show": true,
- "node_color": null
- },
- "refs": [
- [
- "model_name",
- ]
- ],
- "created_at": 1669653016.522599
- },
- ```
-
-
-
-
-This is how to define new [metrics](/docs/build/metrics) in your dbt project. The metric definition and metadata response will change accordingly once entities are introduced, notably with metrics referencing entities instead of models and inheriting entity dimensions.
-
- ```
- metrics:
- ## Always required
- - name: [Required] ## The name of the metric
- label: [Required] ## The human-readable name of the metric
- calculation_method: [Required] ## The calculation/aggregation used for the metric
- expression: [Required] ## The SQL expression being aggregated/calculated
- entity: [Required] ## The entity being used as the source of the metric
-
- ## Always optional
- description: [Optional] ## Any description about the metric
- timestamp: [Optional] ## The name of the timestamp field to use
- time_grains: [Optional] ## The list of time grains that are permitted
- filters: [Optional] ## The filters of the metric
- window: [Optional] ## The ability to make a metric cumulative over a time period
- config: [Optional] ## Additional information for configuring the output
-
- ## Either or dimensions:
- include: [Optional] ## The list of dimensions to be included. Either * or list
- exclude: [Optional] ## The list of dimensions to be excluded from the inherited list
- ```
-
-
-
-
-
-```
-"metric.project_name.metric_name": {
- "fqn": [
- "project_name",
- "folder_name",
- "metric_name"
- ],
- "unique_id": "metric.project_name.metric_name",
- "package_name": "project_name",
- "root_path": "file_path",
- "path": "file_path",
- "original_file_path": "file_path",
- "name": "metric_name",
- "description": "description",
- "entity": "entity_name",
- "label": "Human readable version",
- "calculation_method": "the calc method",
- "timestamp": "the timestamp field",
- "time_grains": [
- "day",
- "week"
- ],
- "expression": "a field name or sql expression",
- "dimensions": [
- {
- "entity_name": [
- "had_discount",
- "order_country"
- ]
- }
- ],
- "window": null,
- "resource_type": "metric",
- "filters": [],
- "meta": {},
- "tags": [],
- "config": {
- "enabled": true
- },
- "unrendered_config": {},
- "sources": [],
- "depends_on": {
- "macros": [],
- "nodes": [
-      "entity.project_name.entity_name",
- ]
- },
- "entities": [
- [
- "entity_name"
- ]
- ],
- "metrics": ["used for derived metrics"],
- "created_at": 1669653027.290001
- },
- ```
-
-
-
-
-Query an entity using dbt-SQL. Eventually, users will be able to query entities and dynamically generate datasets using a macro (like with metrics), without having to find specific tables or columns.
-
-```
-select *
-from {{ entities.calculate(
- entity_list=[...], [Required, one to start]
- dimensions: [...], [Optional, default is all]
- metrics: [...], [Optional, default is all at finest grain]
- filters: ...
- )}}
- ```
-
-
-
-### dbt Semantic Layer Connector
-
-In order to support more data platforms and enhance the user experience, users will be able to connect to a [dbt Cloud-supported data platform](/docs/cloud/connect-data-platform/about-connections) with the dbt Semantic Layer.
-
-Integration partners need to install the [Arrow FlightSQL](https://arrow.apache.org/docs/format/FlightSql.html) JDBC/ODBC driver, which will authenticate with dbt Cloud and the data platform that it queries.
-
-
-
-
-
-### dbt Semantic Layer API
-
-dbt Cloud will provide a web API that supports:
-
-- Compiling dbt-SQL queries to return their compiled SQL.
-- Executing dbt-SQL queries and returning the queried results from the data platform.
-
-The API will be a viable integration point with the dbt Semantic Layer. It will be authorized by a [dbt Cloud service token](/docs/dbt-cloud-apis/service-tokens) and will eventually support the invocation of dbt commands (for example, `dbt run` or `dbt test`).
-
-
-## Contact us
-
-### For dbt Semantic Layer support
-
-For partner and customer support, email the [Support team](mailto:support@getdbt.com) and ensure the message includes:
-
-- "Semantic Layer"
-- The name of the partner software
-- The dbt Cloud account ID of the customer, if you are a partner making the inquiry
-
-### For product and partnerships
-
-If you'd like to become a formal partner, have product feedback/questions, or are interested in integrating, email the [Product and Partnership team](mailto:semantic-layer@dbtlabs.com).
-
-
-
-## Related docs
-
-- [dbt Semantic Layer docs](https://docs.getdbt.com/docs/use-dbt-semantic-layer/dbt-semantic-layer) to learn about the product.
-- [dbt Metrics docs](https://docs.getdbt.com/docs/building-a-dbt-project/metrics) for more information about its components.
-- [dbt Semantic Layer intro blog](https://www.getdbt.com/blog/dbt-semantic-layer/) and [launch blog](https://www.getdbt.com/blog/frontiers-of-the-dbt-semantic-layer/) to learn more about the product vision and purpose.
-- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations.
-
-
diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md b/website/docs/guides/dbt-models-on-databricks.md
similarity index 93%
rename from website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md
rename to website/docs/guides/dbt-models-on-databricks.md
index b5389645258..489a3c28467 100644
--- a/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md
+++ b/website/docs/guides/dbt-models-on-databricks.md
@@ -1,17 +1,26 @@
---
-title: How to optimize and troubleshoot dbt models on Databricks
-sidebar_label: "How to optimize and troubleshoot dbt models on Databricks"
+title: Optimize and troubleshoot dbt models on Databricks
+id: optimize-dbt-models-on-databricks
description: "Learn more about optimizing and troubleshooting your dbt models on Databricks"
+displayText: Optimizing and troubleshooting your dbt models on Databricks
+hoverSnippet: Learn how to optimize and troubleshoot your dbt models on Databricks.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'databricks'
+hide_table_of_contents: true
+tags: ['Databricks', 'dbt Core','dbt Cloud']
+level: 'Intermediate'
+recently_updated: true
---
+## Introduction
-Continuing our Databricks and dbt guide series from the last [guide](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project), it’s time to talk about performance optimization. In this follow-up post, we outline simple strategies to optimize for cost, performance, and simplicity when architecting your data pipelines. We’ve encapsulated these strategies in this acronym-framework:
+Building on the [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project) guide, we'd like to discuss performance optimization. In this follow-up post, we outline simple strategies to optimize for cost, performance, and simplicity when you architect data pipelines. We’ve encapsulated these strategies in this acronym-framework:
- Platform Components
- Patterns & Best Practices
- Performance Troubleshooting
-## 1. Platform Components
+## Platform Components
As you start to develop your dbt projects, one of the first decisions you will make is what kind of backend infrastructure to run your models against. Databricks offers SQL warehouses, All-Purpose Compute, and Jobs Compute, each optimized to workloads they are catered to. Our recommendation is to use Databricks SQL warehouses for all your SQL workloads. SQL warehouses are optimized for SQL workloads when compared to other compute options, additionally, they can scale both vertically to support larger workloads and horizontally to support concurrency. Also, SQL warehouses are easier to manage and provide out-of-the-box features such as query history to help audit and optimize your SQL workloads. Between Serverless, Pro, and Classic SQL Warehouse types that Databricks offers, our standard recommendation for you is to leverage Databricks serverless warehouses. You can explore features of these warehouse types in the [Compare features section](https://www.databricks.com/product/pricing/databricks-sql?_gl=1*2rsmlo*_ga*ZmExYzgzZDAtMWU0Ny00N2YyLWFhYzEtM2RhZTQzNTAyZjZi*_ga_PQSEQ3RZQC*MTY3OTYwMDg0Ni4zNTAuMS4xNjc5NjAyMDMzLjUzLjAuMA..&_ga=2.104593536.1471430337.1679342371-fa1c83d0-1e47-47f2-aac1-3dae43502f6b) on the Databricks pricing page.
@@ -31,11 +40,11 @@ Another technique worth implementing is to provision separate SQL warehouses for
Because of the ability of serverless warehouses to spin up in a matter of seconds, setting your auto-stop configuration to a lower threshold will not impact SLAs and end-user experience. From the SQL Workspace UI, the default value is 10 minutes and you can set it to 5 minutes for a lower threshold with the UI. If you would like more custom settings, you can set the threshold to as low as 1 minute with the [API](https://docs.databricks.com/sql/api/sql-endpoints.html#).
-## 2. Patterns & Best Practices
+## Patterns & Best Practices
Now that we have a solid sense of the infrastructure components, we can shift our focus to best practices and design patterns on pipeline development. We recommend the staging/intermediate/mart approach which is analogous to the medallion architecture bronze/silver/gold approach that’s recommended by Databricks. Let’s dissect each stage further.
-dbt has guidelines on how you can [structure your dbt project](/guides/best-practices/how-we-structure/1-guide-overview) which you can learn more about.
+dbt has guidelines on how you can [structure your dbt project](/best-practices/how-we-structure/1-guide-overview) which you can learn more about.
### Bronze / Staging Layer:
@@ -49,7 +58,7 @@ The main benefit of leveraging `COPY INTO` is that it's an incremental operation
Now that we have our bronze table taken care of, we can proceed with the silver layer.
-For cost and performance reasons, many customers opt to implement an incremental pipeline approach. The main benefit with this approach is that you process a lot less data when you insert new records into the silver layer, rather than re-create the table each time with all the data from the bronze layer. However it should be noted that by default, [dbt recommends using views and tables](/guides/best-practices/materializations/1-guide-overview) to start out with and then moving to incremental as you require more performance optimization.
+For cost and performance reasons, many customers opt to implement an incremental pipeline approach. The main benefit with this approach is that you process a lot less data when you insert new records into the silver layer, rather than re-create the table each time with all the data from the bronze layer. However it should be noted that by default, [dbt recommends using views and tables](/best-practices/materializations/1-guide-overview) to start out with and then moving to incremental as you require more performance optimization.
dbt has an [incremental model materialization](/reference/resource-configs/spark-configs#the-merge-strategy) to facilitate this framework. How this works at a high level is that Databricks will create a temp view with a snapshot of data and then merge that snapshot into the silver table. You can customize the time range of the snapshot to suit your specific use case by configuring the `where` conditional in your `is_incremental` logic. The most straightforward implementation is to merge data using a timestamp that’s later than the current max timestamp in the silver table, but there are certainly valid use cases for increasing the temporal range of the source snapshot.
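
As a rough sketch of that pattern (the model and column names below are placeholders rather than part of any example project), an incremental dbt model using the merge strategy and a max-timestamp filter might look like this:

```sql
-- placeholder names: stg_events, event_id, event_timestamp
{{
    config(
        materialized='incremental',
        incremental_strategy='merge',
        unique_key='event_id'
    )
}}

select *
from {{ ref('stg_events') }}

{% if is_incremental() %}
-- only process rows newer than what's already in this table
where event_timestamp > (select max(event_timestamp) from {{ this }})
{% endif %}
```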
@@ -121,7 +130,7 @@ incremental_predicates = [
}}
```
-## 3. Performance Troubleshooting
+## Performance Troubleshooting
Performance troubleshooting refers to the process of identifying and resolving issues that impact the performance of your dbt models and overall data pipelines. By improving the speed and performance of your Lakehouse platform, you will be able to process data faster, process large and complex queries more effectively, and provide faster time to market. Let’s go into detail the three effective strategies that you can implement.
@@ -166,8 +175,8 @@ Now you might be wondering, how do you identify opportunities for performance im
With the [dbt Cloud Admin API](/docs/dbt-cloud-apis/admin-cloud-api), you can pull the dbt artifacts from your dbt Cloud run, put the generated `manifest.json` into an S3 bucket, stage it, and model the data using the [dbt artifacts package](https://hub.getdbt.com/brooklyn-data/dbt_artifacts/latest/). That package can help you identify inefficiencies in your dbt models and pinpoint where opportunities for improvement are.
-## Conclusion
+### Conclusion
-This concludes the second guide in our series on “Working with Databricks and dbt”, following [How to set up your Databricks and dbt Project](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project).
+This builds on the content in [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project).
We welcome you to try these strategies on our example open source TPC-H implementation and to provide us with thoughts/feedback as you start to incorporate these features into production. Looking forward to your feedback on [#db-databricks-and-spark](https://getdbt.slack.com/archives/CNGCW8HKL) Slack channel!
diff --git a/website/docs/guides/dbt-python-snowpark.md b/website/docs/guides/dbt-python-snowpark.md
new file mode 100644
index 00000000000..55e6b68c172
--- /dev/null
+++ b/website/docs/guides/dbt-python-snowpark.md
@@ -0,0 +1,1925 @@
+---
+title: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake"
+id: "dbt-python-snowpark"
+description: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake"
+hoverSnippet: Learn how to leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Snowflake']
+level: 'Intermediate'
+recently_updated: true
+---
+
+## Introduction
+
+The focus of this workshop will be to demonstrate how we can use both *SQL and Python together* in the same workflow to run *both analytics and machine learning models* on dbt Cloud.
+
+All code in today’s workshop can be found on [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1).
+
+### What you'll use during the lab
+
+- A [Snowflake account](https://trial.snowflake.com/) with ACCOUNTADMIN access
+- A [dbt Cloud account](https://www.getdbt.com/signup/)
+
+### What you'll learn
+
+- How to build scalable data transformation pipelines using dbt and Snowflake with SQL and Python
+- How to copy data into Snowflake from a public S3 bucket
+
+### What you need to know
+
+- Basic to intermediate SQL and Python.
+- Basic understanding of dbt fundamentals. We recommend the [dbt Fundamentals course](https://courses.getdbt.com/collections) if you're interested.
+- High level machine learning process (encoding, training, testing)
+- Simple ML algorithms — we will use logistic regression to keep the focus on the *workflow*, not algorithms!
+
+### What you'll build
+
+- A set of data analytics and prediction pipelines using Formula 1 data leveraging dbt and Snowflake, making use of best practices like data quality tests and code promotion between environments
+- We will create insights for:
+ 1. Finding the lap time average and rolling average through the years (is it generally trending up or down)?
+ 2. Which constructor has the fastest pit stops in 2021?
+  3. Predicting the position of each driver using a decade of data (2010 - 2020)
+
+As inputs, we are going to leverage Formula 1 datasets hosted on a dbt Labs public S3 bucket. We will create a Snowflake Stage for our CSV files and then use Snowflake’s `COPY INTO` function to copy the data from those CSV files into tables. The Formula 1 data is available on [Kaggle](https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020). The data is originally compiled from the [Ergast Developer API](http://ergast.com/mrd/).
+
+Overall we are going to set up the environments, build scalable pipelines in dbt, establish data tests, and promote code to production.
+
+## Configure Snowflake
+
+1. Log in to your trial Snowflake account. You can [sign up for a Snowflake Trial Account using this form](https://signup.snowflake.com/) if you don’t have one.
+2. Ensure that your account is set up using **AWS** in the **US East (N. Virginia)**. We will be copying the data from a public AWS S3 bucket hosted by dbt Labs in the us-east-1 region. By ensuring our Snowflake environment setup matches our bucket region, we avoid any multi-region data copy and retrieval latency issues.
+
+
+
+3. After creating your account and verifying it from your sign-up email, Snowflake will direct you back to the UI called Snowsight.
+
+4. When Snowsight first opens, your window should look like the following, with you logged in as the ACCOUNTADMIN with demo worksheets open:
+
+
+
+5. Navigate to **Admin > Billing & Terms**. Click **Enable > Acknowledge & Continue** to enable Anaconda Python Packages to run in Snowflake.
+
+
+
+
+
+6. Finally, create a new Worksheet by selecting **+ Worksheet** in the upper right corner.
+
+## Connect to data source
+
+We need to obtain our data source by copying our Formula 1 data into Snowflake tables from a public S3 bucket that dbt Labs hosts.
+
+1. When a new Snowflake account is created, there should be a preconfigured warehouse in your account named `COMPUTE_WH`.
+2. If for any reason your account doesn’t have this warehouse, we can create a warehouse using the following script:
+
+ ```sql
+ create or replace warehouse COMPUTE_WH with warehouse_size=XSMALL
+ ```
+
+3. Rename the worksheet to `data setup script` since we will be placing code in this worksheet to ingest the Formula 1 data. Make sure you are still logged in as the **ACCOUNTADMIN** and select the **COMPUTE_WH** warehouse.
+
+
+
+4. Copy the following code into the main body of the Snowflake worksheet. You can also find this setup script under the `setup` folder in the [Git repository](https://github.com/dbt-labs/python-snowpark-formula1/blob/main/setup/setup_script_s3_to_snowflake.sql). The script is long since it brings in all of the data we'll need today!
+
+ ```sql
+ -- create and define our formula1 database
+ create or replace database formula1;
+ use database formula1;
+ create or replace schema raw;
+ use schema raw;
+
+ -- define our file format for reading in the csvs
+ create or replace file format csvformat
+ type = csv
+ field_delimiter =','
+ field_optionally_enclosed_by = '"',
+ skip_header=1;
+
+    -- create an external stage that points to the public S3 bucket holding the csv files
+ create or replace stage formula1_stage
+ file_format = csvformat
+ url = 's3://formula1-dbt-cloud-python-demo/formula1-kaggle-data/';
+
+ -- load in the 8 tables we need for our demo
+ -- we are first creating the table then copying our data in from s3
+ -- think of this as an empty container or shell that we are then filling
+ create or replace table formula1.raw.circuits (
+ CIRCUITID NUMBER(38,0),
+ CIRCUITREF VARCHAR(16777216),
+ NAME VARCHAR(16777216),
+ LOCATION VARCHAR(16777216),
+ COUNTRY VARCHAR(16777216),
+ LAT FLOAT,
+ LNG FLOAT,
+ ALT NUMBER(38,0),
+ URL VARCHAR(16777216)
+ );
+ -- copy our data from public s3 bucket into our tables
+ copy into circuits
+ from @formula1_stage/circuits.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.constructors (
+ CONSTRUCTORID NUMBER(38,0),
+ CONSTRUCTORREF VARCHAR(16777216),
+ NAME VARCHAR(16777216),
+ NATIONALITY VARCHAR(16777216),
+ URL VARCHAR(16777216)
+ );
+ copy into constructors
+ from @formula1_stage/constructors.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.drivers (
+ DRIVERID NUMBER(38,0),
+ DRIVERREF VARCHAR(16777216),
+ NUMBER VARCHAR(16777216),
+ CODE VARCHAR(16777216),
+ FORENAME VARCHAR(16777216),
+ SURNAME VARCHAR(16777216),
+ DOB DATE,
+ NATIONALITY VARCHAR(16777216),
+ URL VARCHAR(16777216)
+ );
+ copy into drivers
+ from @formula1_stage/drivers.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.lap_times (
+ RACEID NUMBER(38,0),
+ DRIVERID NUMBER(38,0),
+ LAP NUMBER(38,0),
+ POSITION FLOAT,
+ TIME VARCHAR(16777216),
+ MILLISECONDS NUMBER(38,0)
+ );
+ copy into lap_times
+ from @formula1_stage/lap_times.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.pit_stops (
+ RACEID NUMBER(38,0),
+ DRIVERID NUMBER(38,0),
+ STOP NUMBER(38,0),
+ LAP NUMBER(38,0),
+ TIME VARCHAR(16777216),
+ DURATION VARCHAR(16777216),
+ MILLISECONDS NUMBER(38,0)
+ );
+ copy into pit_stops
+ from @formula1_stage/pit_stops.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.races (
+ RACEID NUMBER(38,0),
+ YEAR NUMBER(38,0),
+ ROUND NUMBER(38,0),
+ CIRCUITID NUMBER(38,0),
+ NAME VARCHAR(16777216),
+ DATE DATE,
+ TIME VARCHAR(16777216),
+ URL VARCHAR(16777216),
+ FP1_DATE VARCHAR(16777216),
+ FP1_TIME VARCHAR(16777216),
+ FP2_DATE VARCHAR(16777216),
+ FP2_TIME VARCHAR(16777216),
+ FP3_DATE VARCHAR(16777216),
+ FP3_TIME VARCHAR(16777216),
+ QUALI_DATE VARCHAR(16777216),
+ QUALI_TIME VARCHAR(16777216),
+ SPRINT_DATE VARCHAR(16777216),
+ SPRINT_TIME VARCHAR(16777216)
+ );
+ copy into races
+ from @formula1_stage/races.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.results (
+ RESULTID NUMBER(38,0),
+ RACEID NUMBER(38,0),
+ DRIVERID NUMBER(38,0),
+ CONSTRUCTORID NUMBER(38,0),
+ NUMBER NUMBER(38,0),
+ GRID NUMBER(38,0),
+ POSITION FLOAT,
+ POSITIONTEXT VARCHAR(16777216),
+ POSITIONORDER NUMBER(38,0),
+ POINTS NUMBER(38,0),
+ LAPS NUMBER(38,0),
+ TIME VARCHAR(16777216),
+ MILLISECONDS NUMBER(38,0),
+ FASTESTLAP NUMBER(38,0),
+ RANK NUMBER(38,0),
+ FASTESTLAPTIME VARCHAR(16777216),
+ FASTESTLAPSPEED FLOAT,
+ STATUSID NUMBER(38,0)
+ );
+ copy into results
+ from @formula1_stage/results.csv
+ on_error='continue';
+
+ create or replace table formula1.raw.status (
+ STATUSID NUMBER(38,0),
+ STATUS VARCHAR(16777216)
+ );
+ copy into status
+ from @formula1_stage/status.csv
+ on_error='continue';
+
+ ```
+
+5. Ensure all the commands are selected before running the query — an easy way to do this is to use Ctrl-a to highlight all of the code in the worksheet. Select **run** (blue triangle icon). Notice how the dot next to your **COMPUTE_WH** turns from gray to green as you run the query. The **status** table is the final table of all 8 tables loaded in.
+
+
+
+6. Let’s unpack that pretty long query we ran into component parts. We ran this query to load in our 8 Formula 1 tables from a public S3 bucket. To do this, we:
+ - Created a new database called `formula1` and a schema called `raw` to place our raw (untransformed) data into.
+   - Defined our file format for our CSV files. Importantly, here we use a parameter called `field_optionally_enclosed_by =` since the string columns in our Formula 1 csv files use quotes. Quotes are used around string values to avoid parsing issues where commas `,` and new lines `\n` in data values could cause data loading errors.
+ - Created a stage to locate our data we are going to load in. Snowflake Stages are locations where data files are stored. Stages are used to both load and unload data to and from Snowflake locations. Here we are using an external stage, by referencing an S3 bucket.
+ - Created our tables for our data to be copied into. These are empty tables with the column name and data type. Think of this as creating an empty container that the data will then fill into.
+   - Used the `copy into` statement for each of our tables. We reference the staged location we created, and on loading errors we continue to load in the rest of the data. You should not have data loading errors, but if you do, those rows will be skipped and Snowflake will tell you which rows caused errors.
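+
+   If you'd like to double-check that the copies landed before moving on (an optional step, not part of the setup script), a quick row count against a couple of the tables works:
+
+    ```sql
+    -- optional sanity check: both counts should be non-zero after the copy into statements
+    select 'circuits' as table_name, count(*) as row_count from formula1.raw.circuits
+    union all
+    select 'status', count(*) from formula1.raw.status;
+    ```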
+
+7. Now let's take a look at some of our cool Formula 1 data we just loaded up!
+ 1. Create a new worksheet by selecting the **+** then **New Worksheet**.
+
+ 2. Navigate to **Database > Formula1 > RAW > Tables**.
+ 3. Query the data using the following code. There are only 76 rows in the circuits table, so we don’t need to worry about limiting the amount of data we query.
+
+ ```sql
+ select * from formula1.raw.circuits
+ ```
+
+ 4. Run the query. From here on out, we’ll use the keyboard shortcuts Command-Enter or Control-Enter to run queries and won’t explicitly call out this step.
+    5. Review the query results; you should see information about Formula 1 circuits, starting with Albert Park in Australia!
+    6. Finally, ensure you have all 8 tables, starting with `CIRCUITS` and ending with `STATUS`. Now we are ready to connect to dbt Cloud!
+
+
+
+## Configure dbt Cloud
+
+1. We are going to be using [Snowflake Partner Connect](https://docs.snowflake.com/en/user-guide/ecosystem-partner-connect.html) to set up a dbt Cloud account. Using this method will allow you to spin up a fully fledged dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [managed repository](/docs/collaborate/git/managed-repository), environments, and credentials already established.
+2. Navigate back out of your worksheet by selecting **home**.
+3. In Snowsight, confirm that you are using the **ACCOUNTADMIN** role.
+4. Navigate to **Admin > Partner Connect**. Find **dbt** either by using the search bar or by navigating to the **Data Integration** section. Select the **dbt** tile.
+
+5. You should now see a new window that says **Connect to dbt**. Select **Optional Grant** and add the `FORMULA1` database. This will grant access for your new dbt user role to the FORMULA1 database.
+
+
+6. Ensure the `FORMULA1` database is present in your optional grant before clicking **Connect**. This will create a dedicated dbt user, database, warehouse, and role for your dbt Cloud trial.
+
+
+
+7. When you see the **Your partner account has been created** window, click **Activate**.
+
+8. You should be redirected to a dbt Cloud registration page. Fill out the form and make sure to save the password somewhere so you can log in again in the future.
+
+
+
+9. Select **Complete Registration**. You should now be redirected to your dbt Cloud account, complete with a connection to your Snowflake account, a deployment and a development environment, and a sample job.
+
+10. To help you version control your dbt project, we have connected it to a [managed repository](/docs/collaborate/git/managed-repository), which means that dbt Labs will be hosting your repository for you. This will give you access to a Git workflow without you having to create and host the repository yourself. You will not need to know Git for this workshop; dbt Cloud will help guide you through the workflow. In the future, when you’re developing your own project, [feel free to use your own repository](/docs/cloud/git/connect-github). This will allow you to learn more about features like [Slim CI](/docs/deploy/continuous-integration) builds after this workshop.
+
+## Change development schema name and navigate the IDE
+
+1. First we are going to change the name of our default schema, which is where our dbt models will build. By default, the name is `dbt_`. We will change this to `dbt_YOUR_NAME` to create your own personal development schema. To do this, select **Profile Settings** from the gear icon in the upper right.
+
+
+
+2. Navigate to the **Credentials** menu and select **Partner Connect Trial**, which will expand the credentials menu.
+
+
+
+3. Click **Edit** and change the name of your schema from `dbt_` to `dbt_YOUR_NAME` replacing `YOUR_NAME` with your initials and name (`hwatson` is used in the lab screenshots). Be sure to click **Save** for your changes!
+
+
+4. We now have our own personal development schema, amazing! When we run our first dbt models they will build into this schema.
+5. Let’s open up dbt Cloud’s Integrated Development Environment (IDE) and familiarize ourselves. Choose **Develop** at the top of the UI.
+
+6. When the IDE is done loading, click **Initialize dbt project**. The initialization process creates a collection of files and folders necessary to run your dbt project.
+
+
+7. After the initialization is finished, you can view the files and folders in the file tree menu. As we move through the workshop we'll be sure to touch on a few key files and folders that we'll work with to build out our project.
+8. Next click **Commit and push** to commit the new files and folders from the initialize step. We always want our commit messages to be relevant to the work we're committing, so be sure to provide a message like `initialize project` and select **Commit Changes**.
+
+
+
+
+
+9. [Committing](https://www.atlassian.com/git/tutorials/saving-changes/git-commit) your work here will save it to the managed git repository that was created during the Partner Connect signup. This initial commit is the only commit that will be made directly to our `main` branch and from *here on out we'll be doing all of our work on a development branch*. This allows us to keep our development work separate from our production code.
+10. There are a couple of key features to point out about the IDE before we get to work. It is a text editor, an SQL and Python runner, and a CLI with Git version control all baked into one package! This allows you to focus on editing your SQL and Python files, previewing the results with the SQL runner (it even runs Jinja!), and building models at the command line without having to move between different applications. The Git workflow in dbt Cloud allows both Git beginners and experts alike to be able to easily version control all of their work with a couple clicks.
+
+
+
+11. Let's run our first dbt models! Two example models are included in your dbt project in the `models/examples` folder that we can use to illustrate how to run dbt at the command line. Type `dbt run` into the command line and press **Enter** on your keyboard. When the run bar expands you'll be able to see the results of the run, where you should see the run complete successfully.
+
+
+
+12. The run results allow you to see the code that dbt compiles and sends to Snowflake for execution. To view the logs for this run, select one of the model tabs using the **>** icon and then **Details**. If you scroll down a bit you'll be able to see the compiled code and how dbt interacts with Snowflake. Given that this run took place in our development environment, the models were created in your development schema.
+
+
+
+13. Now let's switch over to Snowflake to confirm that the objects were actually created. Click on the three dots **…** above your database objects and then **Refresh**. Expand the **PC_DBT_DB** database and you should see your development schema. Select the schema, then **Tables** and **Views**. Now you should be able to see `MY_FIRST_DBT_MODEL` as a table and `MY_SECOND_DBT_MODEL` as a view.
+
+
+## Create branch and set up project configs
+
+In this step, we’ll need to create a development branch and set up project level configurations.
+
+1. To get started with development for our project, we'll need to create a new Git branch for our work. Select **create branch** and name your development branch. We'll call our branch `snowpark_python_workshop` then click **Submit**.
+2. The first piece of development we'll do on the project is to update the `dbt_project.yml` file. Every dbt project requires a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. The [dbt_project.yml](/reference/dbt_project.yml) file also contains important information that tells dbt how to operate on your project.
+3. Select the `dbt_project.yml` file from the file tree to open it and replace all of the existing contents with the following code below. When you're done, save the file by clicking **save**. You can also use the Command-S or Control-S shortcut from here on out.
+
+ ```yaml
+ # Name your project! Project names should contain only lowercase characters
+ # and underscores. A good package name should reflect your organization's
+ # name or the intended use of these models
+ name: 'snowflake_dbt_python_formula1'
+ version: '1.3.0'
+ require-dbt-version: '>=1.3.0'
+ config-version: 2
+
+ # This setting configures which "profile" dbt uses for this project.
+ profile: 'default'
+
+ # These configurations specify where dbt should look for different types of files.
+ # The `model-paths` config, for example, states that models in this project can be
+ # found in the "models/" directory. You probably won't need to change these!
+ model-paths: ["models"]
+ analysis-paths: ["analyses"]
+ test-paths: ["tests"]
+ seed-paths: ["seeds"]
+ macro-paths: ["macros"]
+ snapshot-paths: ["snapshots"]
+
+ target-path: "target" # directory which will store compiled SQL files
+ clean-targets: # directories to be removed by `dbt clean`
+ - "target"
+ - "dbt_packages"
+
+ models:
+ snowflake_dbt_python_formula1:
+ staging:
+
+ +docs:
+ node_color: "CadetBlue"
+ marts:
+ +materialized: table
+ aggregates:
+ +docs:
+ node_color: "Maroon"
+ +tags: "bi"
+
+ core:
+ +docs:
+ node_color: "#800080"
+ intermediate:
+ +docs:
+ node_color: "MediumSlateBlue"
+ ml:
+ prep:
+ +docs:
+ node_color: "Indigo"
+ train_predict:
+ +docs:
+ node_color: "#36454f"
+
+ ```
+
+4. The key configurations to point out in the file, in relation to the work that we're going to do, are:
+    - `require-dbt-version` — Tells dbt which versions of dbt can be used for the project. We require 1.3.0 or newer so that we can run Python models and use node colors.
+    - `materialized` — Tells dbt how to materialize models when compiling the code before it pushes it down to Snowflake. All models in the `marts` folder will be built as tables.
+    - `tags` — Applies tags at a directory level to all models. All models in the `aggregates` folder will be tagged as `bi` (abbreviation for business intelligence).
+    - `docs` — Specifies the `node_color` either by the plain color name or a hex value.
+5. [Materializations](/docs/build/materializations) are strategies for persisting dbt models in a warehouse, with `tables` and `views` being the most commonly utilized types. By default, all dbt models are materialized as views and other materialization types can be configured in the `dbt_project.yml` file or in a model itself. It’s very important to note *Python models can only be materialized as tables or incremental models.* Since all our Python models exist under `marts`, the following portion of our `dbt_project.yml` ensures no errors will occur when we run our Python models. Starting with [dbt version 1.4](/docs/dbt-versions/core-upgrade/upgrading-to-v1.4#updates-to-python-models), Python files will automatically get materialized as tables even if not explicitly specified.
+
+ ```yaml
+ marts:
+ +materialized: table
+ ```
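+
+    For illustration, the same effect can also be set inside an individual Python model by calling `dbt.config()`. The sketch below is a hypothetical model (not a file you need to create for this workshop), and `some_upstream_model` is a placeholder name:
+
+    ```python
+    def model(dbt, session):
+        # equivalent to the project-level `+materialized: table` config above
+        dbt.config(materialized="table")
+
+        # placeholder upstream reference, cast to a pandas DataFrame
+        df = dbt.ref("some_upstream_model").to_pandas()
+
+        # Python models must return a single DataFrame
+        return df
+    ```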
+
+## Create folders and organize files
+
+dbt Labs has developed a [project structure guide](/best-practices/how-we-structure/1-guide-overview/) that contains a number of recommendations for how to build the folder structure for your project. Do check out that guide if you want to learn more. Right now we are going to create some folders to organize our files:
+
+- Sources — This is our Formula 1 dataset and it will be defined in a source YAML file.
+- Staging models — These models have a 1:1 with their source table.
+- Intermediate — This is where we will be joining some Formula 1 staging models.
+- Marts models — Here is where we perform our major transformations. It contains these subfolders:
+ - aggregates
+ - core
+ - ml
+
+1. In your file tree, use your cursor and hover over the `models` subdirectory, click the three dots **…** that appear to the right of the folder name, then select **Create Folder**. We're going to add two new folders to the file path, `staging` and `formula1` (in that order) by typing `staging/formula1` into the file path.
+
+
+
+
+ - If you click into your `models` directory now, you should see the new `staging` folder nested within `models` and the `formula1` folder nested within `staging`.
+2. Create two additional folders the same way as in the last step. Within the `models` subdirectory, create the new directories `marts/core`.
+
+3. We will need to create a few more folders and subfolders using the UI. After you create all the necessary folders, your folder tree should look like this when it's all done:
+
+
+
+Remember you can always reference the entire project in [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1) to view the complete folder and file structure.
+
+## Create source and staging models
+
+In this section, we are going to create our source and staging models.
+
+Sources allow us to create a dependency between our source database object and our staging models, which will help us when we look at our data lineage later. Also, if your source changes database or schema, you only have to update it in your `f1_sources.yml` file rather than updating all of the models it might be used in.
+
+Staging models are the base of our project, where we bring all the individual components we're going to use to build our more complex and useful models into the project.
+
+Since we want to focus on dbt and Python in this workshop, check out our [sources](/docs/build/sources) and [staging](/best-practices/how-we-structure/2-staging) docs if you want to learn more (or take our [dbt Fundamentals](https://courses.getdbt.com/collections) course which covers all of our core functionality).
+
+### 1. Create sources
+
+We're going to be using each of our 8 Formula 1 tables from our `formula1` database under the `raw` schema for our transformations and we want to create those tables as sources in our project.
+
+1. Create a new file called `f1_sources.yml` with the following file path: `models/staging/formula1/f1_sources.yml`.
+2. Then, paste the following code into the file before saving it:
+
+```yaml
+version: 2
+
+sources:
+ - name: formula1
+ description: formula 1 datasets with normalized tables
+ database: formula1
+ schema: raw
+ tables:
+ - name: circuits
+ description: One record per circuit, which is the specific race course.
+ columns:
+ - name: circuitid
+ tests:
+ - unique
+ - not_null
+ - name: constructors
+ description: One record per constructor. Constructors are the teams that build their formula 1 cars.
+ columns:
+ - name: constructorid
+ tests:
+ - unique
+ - not_null
+ - name: drivers
+ description: One record per driver. This table gives details about the driver.
+ columns:
+ - name: driverid
+ tests:
+ - unique
+ - not_null
+ - name: lap_times
+ description: One row per lap in each race. Lap times started being recorded in this dataset in 1984 and joined through driver_id.
+ - name: pit_stops
+ description: One row per pit stop. Pit stops do not have their own id column, the combination of the race_id and driver_id identify the pit stop.
+ columns:
+ - name: stop
+ tests:
+ - accepted_values:
+ values: [1,2,3,4,5,6,7,8]
+ quote: false
+ - name: races
+ description: One race per row. Importantly this table contains the race year to understand trends.
+ columns:
+ - name: raceid
+ tests:
+ - unique
+ - not_null
+ - name: results
+ columns:
+ - name: resultid
+ tests:
+ - unique
+ - not_null
+ description: One row per result. The main table that we join out for grid and position variables.
+ - name: status
+ description: One status per row. The status contextualizes whether the race was finished or what issues arose e.g. collisions, engine, etc.
+ columns:
+ - name: statusid
+ tests:
+ - unique
+ - not_null
+```
+
+### 2. Create staging models
+
+The next step is to set up the staging models for each of the 8 source tables. Given the one-to-one relationship between staging models and their corresponding source tables, we'll build 8 staging models here. We know it’s a lot and in the future, we will seek to update the workshop to make this step less repetitive and more efficient. This step is also a good representation of the real world of data, where you have multiple hierarchical tables that you will need to join together!
+
+1. Let's go in alphabetical order to easily keep track of all our staging models! Create a new file called `stg_f1_circuits.sql` with this file path `models/staging/formula1/stg_f1_circuits.sql`. Then, paste the following code into the file before saving it:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','circuits') }}
+
+ ),
+
+ renamed as (
+ select
+ circuitid as circuit_id,
+ circuitref as circuit_ref,
+ name as circuit_name,
+ location,
+ country,
+ lat as latitude,
+ lng as longitude,
+ alt as altitude
+ -- omit the url
+ from source
+ )
+ select * from renamed
+ ```
+
+ All we're doing here is pulling the source data into the model using the `source` function, renaming some columns, and omitting the column `url` with a commented note since we don’t need it for our analysis.
+
+1. Create `stg_f1_constructors.sql` with this file path `models/staging/formula1/stg_f1_constructors.sql`. Paste the following code into it before saving the file:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','constructors') }}
+
+ ),
+
+ renamed as (
+ select
+ constructorid as constructor_id,
+ constructorref as constructor_ref,
+ name as constructor_name,
+ nationality as constructor_nationality
+ -- omit the url
+ from source
+ )
+
+ select * from renamed
+ ```
+
+    We have 6 other staging models to create. We can do this by creating new files, then copying and pasting the code into our `staging` folder.
+
+1. Create `stg_f1_drivers.sql` with this file path `models/staging/formula1/stg_f1_drivers.sql`:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','drivers') }}
+
+ ),
+
+ renamed as (
+ select
+ driverid as driver_id,
+ driverref as driver_ref,
+ number as driver_number,
+ code as driver_code,
+ forename,
+ surname,
+ dob as date_of_birth,
+ nationality as driver_nationality
+ -- omit the url
+ from source
+ )
+
+ select * from renamed
+ ```
+
+1. Create `stg_f1_lap_times.sql` with this file path `models/staging/formula1/stg_f1_lap_times.sql`:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','lap_times') }}
+
+ ),
+
+ renamed as (
+ select
+ raceid as race_id,
+ driverid as driver_id,
+ lap,
+ position,
+ time as lap_time_formatted,
+ milliseconds as lap_time_milliseconds
+ from source
+ )
+
+ select * from renamed
+ ```
+
+1. Create `stg_f1_pit_stops.sql` with this file path `models/staging/formula1/stg_f1_pit_stops.sql`:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','pit_stops') }}
+
+ ),
+
+ renamed as (
+ select
+ raceid as race_id,
+ driverid as driver_id,
+ stop as stop_number,
+ lap,
+ time as lap_time_formatted,
+ duration as pit_stop_duration_seconds,
+ milliseconds as pit_stop_milliseconds
+ from source
+ )
+
+ select * from renamed
+ order by pit_stop_duration_seconds desc
+ ```
+
+1. Create `stg_f1_races.sql` with this file path `models/staging/formula1/stg_f1_races.sql`:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','races') }}
+
+ ),
+
+ renamed as (
+ select
+ raceid as race_id,
+ year as race_year,
+ round as race_round,
+ circuitid as circuit_id,
+ name as circuit_name,
+ date as race_date,
+ to_time(time) as race_time,
+ -- omit the url
+ fp1_date as free_practice_1_date,
+ fp1_time as free_practice_1_time,
+ fp2_date as free_practice_2_date,
+ fp2_time as free_practice_2_time,
+ fp3_date as free_practice_3_date,
+ fp3_time as free_practice_3_time,
+ quali_date as qualifying_date,
+ quali_time as qualifying_time,
+ sprint_date,
+ sprint_time
+ from source
+ )
+
+ select * from renamed
+ ```
+
+1. Create `stg_f1_results.sql` with this file path `models/staging/formula1/stg_f1_results.sql`:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','results') }}
+
+ ),
+
+ renamed as (
+ select
+ resultid as result_id,
+ raceid as race_id,
+ driverid as driver_id,
+ constructorid as constructor_id,
+ number as driver_number,
+ grid,
+ position::int as position,
+ positiontext as position_text,
+ positionorder as position_order,
+ points,
+ laps,
+ time as results_time_formatted,
+ milliseconds as results_milliseconds,
+ fastestlap as fastest_lap,
+ rank as results_rank,
+ fastestlaptime as fastest_lap_time_formatted,
+ fastestlapspeed::decimal(6,3) as fastest_lap_speed,
+ statusid as status_id
+ from source
+ )
+
+ select * from renamed
+ ```
+
+1. Last one! Create `stg_f1_status.sql` with this file path: `models/staging/formula1/stg_f1_status.sql`:
+
+ ```sql
+ with
+
+ source as (
+
+ select * from {{ source('formula1','status') }}
+
+ ),
+
+ renamed as (
+ select
+ statusid as status_id,
+ status
+ from source
+ )
+
+ select * from renamed
+ ```
+
+ After the source and all the staging models are complete for each of the 8 tables, your staging folder should look like this:
+
+
+
+1. It’s a good time to delete our example folder since these two models are extraneous to our formula1 pipeline and `my_first_model` fails a `not_null` test that we won’t spend time investigating. dbt Cloud will warn us that this folder will be permanently deleted, and we are okay with that so select **Delete**.
+
+
+
+1. Now that the staging models are built and saved, it's time to create the models in our development schema in Snowflake. To do this we're going to enter `dbt build` into the command line to run all of the models in our project, which includes the 8 new staging models.
+
+ Your run should complete successfully and you should see green checkmarks next to all of your models in the run results. We built our 8 staging models as views and ran 13 source tests that we configured in the `f1_sources.yml` file with not that much code, pretty cool!
+
+
+
+ Let's take a quick look in Snowflake, refresh database objects, open our development schema, and confirm that the new models are there. If you can see them, then we're good to go!
+
+
+
+ Before we move onto the next section, be sure to commit your new models to your Git branch. Click **Commit and push** and give your commit a message like `profile, sources, and staging setup` before moving on.
+
+## Transform SQL
+
+Now that we have all our sources and staging models done, it's time to move into where dbt shines — transformation!
+
+We need to:
+
+- Create some intermediate tables to join tables that aren’t hierarchical
+- Create core tables for business intelligence (BI) tool ingestion
+- Answer two questions about our Formula 1 data by creating aggregate models using Python:
+  - Which constructor had the fastest pit stops?
+  - How have lap times trended over the years?
+
+### Intermediate models
+
+We need to join lots of reference tables to our results table to create a human readable dataframe. What does this mean? For example, we don’t want only the numeric `status_id` in our table; we want to be able to read from a row of data that a driver could not finish a race due to engine failure (`status_id=5`).
+
+By now, we are pretty good at creating new files in the correct directories so we won’t cover this in detail. All intermediate models should be created in the path `models/intermediate`.
+
+1. Create a new file called `int_lap_times_years.sql`. In this model, we are joining our lap time and race information so we can look at lap times over years. In earlier Formula 1 eras, lap times were not recorded (only final results), so we filter out records where lap times are null.
+
+ ```sql
+ with lap_times as (
+
+ select * from {{ ref('stg_f1_lap_times') }}
+
+ ),
+
+ races as (
+
+ select * from {{ ref('stg_f1_races') }}
+
+ ),
+
+ expanded_lap_times_by_year as (
+ select
+ lap_times.race_id,
+ driver_id,
+ race_year,
+ lap,
+ lap_time_milliseconds
+ from lap_times
+ left join races
+ on lap_times.race_id = races.race_id
+ where lap_time_milliseconds is not null
+ )
+
+ select * from expanded_lap_times_by_year
+ ```
+
+2. Create a file called `int_pit_stops.sql`. Pit stops have a many-to-one (M:1) relationship with our races. We are creating a feature called `total_pit_stops_per_race` by partitioning over our `race_id` and `driver_id`, while preserving individual pit stops for the rolling average in our next section.
+
+ ```sql
+ with stg_f1__pit_stops as
+ (
+ select * from {{ ref('stg_f1_pit_stops') }}
+ ),
+
+ pit_stops_per_race as (
+ select
+ race_id,
+ driver_id,
+ stop_number,
+ lap,
+ lap_time_formatted,
+ pit_stop_duration_seconds,
+ pit_stop_milliseconds,
+ max(stop_number) over (partition by race_id,driver_id) as total_pit_stops_per_race
+ from stg_f1__pit_stops
+ )
+
+ select * from pit_stops_per_race
+ ```
+
+3. Create a file called `int_results.sql`. Here we are using 4 of our tables — `races`, `drivers`, `constructors`, and `status` — to give context to our `results` table. We are now able to calculate a new feature `drivers_age_years` by bringing the `date_of_birth` and `race_year` into the same table. We are also creating a column called `dnf_flag` to indicate whether the driver did not finish (DNF) the race, based on whether their `position` was null.
+
+ ```sql
+ with results as (
+
+ select * from {{ ref('stg_f1_results') }}
+
+ ),
+
+ races as (
+
+ select * from {{ ref('stg_f1_races') }}
+
+ ),
+
+ drivers as (
+
+ select * from {{ ref('stg_f1_drivers') }}
+
+ ),
+
+ constructors as (
+
+ select * from {{ ref('stg_f1_constructors') }}
+ ),
+
+ status as (
+
+ select * from {{ ref('stg_f1_status') }}
+ ),
+
+ int_results as (
+ select
+ result_id,
+ results.race_id,
+ race_year,
+ race_round,
+ circuit_id,
+ circuit_name,
+ race_date,
+ race_time,
+ results.driver_id,
+ results.driver_number,
+ forename ||' '|| surname as driver,
+ cast(datediff('year', date_of_birth, race_date) as int) as drivers_age_years,
+ driver_nationality,
+ results.constructor_id,
+ constructor_name,
+ constructor_nationality,
+ grid,
+ position,
+ position_text,
+ position_order,
+ points,
+ laps,
+ results_time_formatted,
+ results_milliseconds,
+ fastest_lap,
+ results_rank,
+ fastest_lap_time_formatted,
+ fastest_lap_speed,
+ results.status_id,
+ status,
+ case when position is null then 1 else 0 end as dnf_flag
+ from results
+ left join races
+ on results.race_id=races.race_id
+ left join drivers
+ on results.driver_id = drivers.driver_id
+ left join constructors
+ on results.constructor_id = constructors.constructor_id
+ left join status
+ on results.status_id = status.status_id
+ )
+
+ select * from int_results
+ ```
+
+1. Create a *Markdown* file `intermediate.md` that we will go over in depth in the Test and Documentation sections of the [Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake](/guides/dbt-python-snowpark) guide.
+
+ ```markdown
+ # the intent of this .md is to allow for multi-line long form explanations for our intermediate transformations
+
+ # below are descriptions
+ {% docs int_results %} In this query we want to join out other important information about the race results to have a human readable table about results, races, drivers, constructors, and status.
+ We will have 4 left joins onto our results table. {% enddocs %}
+
+ {% docs int_pit_stops %} There are many pit stops within one race, aka a M:1 relationship.
+ We want to aggregate this so we can properly join pit stop information without creating a fanout. {% enddocs %}
+
+ {% docs int_lap_times_years %} Lap times are done per lap. We need to join them out to the race year to understand yearly lap time trends. {% enddocs %}
+ ```
+
+1. Create a *YAML* file `intermediate.yml` that we will go over in depth during the Test and Document sections of the [Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake](/guides/dbt-python-snowpark) guide.
+
+ ```yaml
+ version: 2
+
+ models:
+ - name: int_results
+ description: '{{ doc("int_results") }}'
+ - name: int_pit_stops
+ description: '{{ doc("int_pit_stops") }}'
+ - name: int_lap_times_years
+ description: '{{ doc("int_lap_times_years") }}'
+ ```
+
+ That wraps up the intermediate models we need to create our core models!
+
+### Core models
+
+1. Create a file `fct_results.sql`. This is what I like to refer to as the “mega table” — a really large denormalized table with all our context added in at row level for human readability. Importantly, we have a table `circuits` that is linked through the table `races`. When we joined `races` to `results` in `int_results.sql` we allowed our tables to make the connection from `circuits` to `results` in `fct_results.sql`. We are only taking information about pit stops at the result level so our join would not cause a [fanout](https://community.looker.com/technical-tips-tricks-1021/what-is-a-fanout-23327).
+
+ ```sql
+ with int_results as (
+
+ select * from {{ ref('int_results') }}
+
+ ),
+
+ int_pit_stops as (
+ select
+ race_id,
+ driver_id,
+ max(total_pit_stops_per_race) as total_pit_stops_per_race
+ from {{ ref('int_pit_stops') }}
+ group by 1,2
+ ),
+
+ circuits as (
+
+ select * from {{ ref('stg_f1_circuits') }}
+ ),
+ base_results as (
+ select
+ result_id,
+ int_results.race_id,
+ race_year,
+ race_round,
+ int_results.circuit_id,
+ int_results.circuit_name,
+ circuit_ref,
+ location,
+ country,
+ latitude,
+ longitude,
+ altitude,
+ total_pit_stops_per_race,
+ race_date,
+ race_time,
+ int_results.driver_id,
+ driver,
+ driver_number,
+ drivers_age_years,
+ driver_nationality,
+ constructor_id,
+ constructor_name,
+ constructor_nationality,
+ grid,
+ position,
+ position_text,
+ position_order,
+ points,
+ laps,
+ results_time_formatted,
+ results_milliseconds,
+ fastest_lap,
+ results_rank,
+ fastest_lap_time_formatted,
+ fastest_lap_speed,
+ status_id,
+ status,
+ dnf_flag
+ from int_results
+ left join circuits
+ on int_results.circuit_id=circuits.circuit_id
+ left join int_pit_stops
+ on int_results.driver_id=int_pit_stops.driver_id and int_results.race_id=int_pit_stops.race_id
+ )
+
+ select * from base_results
+ ```
+
+1. Create the file `pit_stops_joined.sql`. Our results and pit stops are at different levels of dimensionality (also called grain). Simply put, we have multiple pit stops per result. Since we are interested in understanding information at the pit stop level, with information about race year and constructor, we will create a new table `pit_stops_joined.sql` where each row is one pit stop. Our new table tees up our aggregation in Python.
+
+ ```sql
+ with base_results as (
+
+ select * from {{ ref('fct_results') }}
+
+ ),
+
+ pit_stops as (
+
+ select * from {{ ref('int_pit_stops') }}
+
+ ),
+
+ pit_stops_joined as (
+
+ select
+ base_results.race_id,
+ race_year,
+ base_results.driver_id,
+ constructor_id,
+ constructor_name,
+ stop_number,
+ lap,
+ lap_time_formatted,
+ pit_stop_duration_seconds,
+ pit_stop_milliseconds
+ from base_results
+ left join pit_stops
+ on base_results.race_id=pit_stops.race_id and base_results.driver_id=pit_stops.driver_id
+ )
+ select * from pit_stops_joined
+ ```
+
+1. Enter `dbt build` in the command line and execute it to build out our entire pipeline up to this point. Don’t worry about “overwriting” your previous models – dbt workflows are designed to be idempotent, so we can run them again and expect the same results.
+
+1. Let’s talk about our lineage so far. It’s looking good 😎. We’ve shown how SQL can be used to make data type and column name changes and to handle hierarchical joins really well, all while building out our automated lineage!
+
+
+
+1. Time to **Commit and push** our changes and give your commit a message like `intermediate and fact models` before moving on.
+
+## Running dbt Python models
+
+Up until now, SQL has been driving the project (car pun intended) for data cleaning and hierarchical joining. Now it’s time for Python to take the wheel (car pun still intended) for the rest of our lab! For more information about running Python models in dbt, check out our [docs](/docs/build/python-models). To learn more about how dbt Python models work under the hood, check out [Snowpark for Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html), which makes running dbt Python models possible.
+
+There are quite a few differences between SQL and Python in terms of the dbt syntax and DDL, so we’ll be breaking our code and model runs down further for our Python models.
+
+### Pit stop analysis
+
+First, we want to find out: which constructor had the fastest pit stops in 2021? (A constructor is the Formula 1 team that builds, or “constructs”, the car.)
+
+1. Create a new file called `fastest_pit_stops_by_constructor.py` in our `aggregates` folder (this is the first time we are using the `.py` extension!).
+2. Copy the following code into the file:
+
+ ```python
+ import numpy as np
+ import pandas as pd
+
+ def model(dbt, session):
+ # dbt configuration
+ dbt.config(packages=["pandas","numpy"])
+
+ # get upstream data
+ pit_stops_joined = dbt.ref("pit_stops_joined").to_pandas()
+
+ # provide year so we do not hardcode dates
+ year=2021
+
+ # describe the data
+ pit_stops_joined["PIT_STOP_SECONDS"] = pit_stops_joined["PIT_STOP_MILLISECONDS"]/1000
+ fastest_pit_stops = pit_stops_joined[(pit_stops_joined["RACE_YEAR"]==year)].groupby(by="CONSTRUCTOR_NAME")["PIT_STOP_SECONDS"].describe().sort_values(by='mean')
+ fastest_pit_stops.reset_index(inplace=True)
+ fastest_pit_stops.columns = fastest_pit_stops.columns.str.upper()
+
+ return fastest_pit_stops.round(2)
+ ```
+
+3. Let’s break down what this code is doing step by step:
+    - First, we are importing the Python libraries that we are using. A *library* is a reusable chunk of code that someone else wrote that you may want to include in your programs/projects. We are using `numpy` and `pandas` in this Python model. This is similar to a dbt *package*, but our Python libraries do *not* persist across the entire project.
+    - Defining a function called `model` with the parameters `dbt` and `session`. The parameter `dbt` is a class compiled by dbt, which enables you to run your Python code in the context of your dbt project and DAG. The parameter `session` is a class representing your Snowflake connection to the Python backend. The `model` function *must return a single DataFrame*. You can see that all the data transformation happens within the body of the `model` function that the `return` statement is tied to.
+ - Then, within the context of our dbt model library, we are passing in a configuration of which packages we need using `dbt.config(packages=["pandas","numpy"])`.
+ - Use the `.ref()` function to retrieve the data frame `pit_stops_joined` that we created in our last step using SQL. We cast this to a pandas dataframe (by default it's a Snowpark Dataframe).
+ - Create a variable named `year` so we aren’t passing a hardcoded value.
+ - Generate a new column called `PIT_STOP_SECONDS` by dividing the value of `PIT_STOP_MILLISECONDS` by 1000.
+    - Create our final data frame `fastest_pit_stops` that holds the records where the year is equal to our year variable (2021 in this case), then group the data frame by `CONSTRUCTOR_NAME` and use the `describe()` and `sort_values()` methods to sort by the mean pit stop time in ascending order. This makes the first row of the new aggregated data frame the team with the fastest pit stops over an entire competition year.
+    - Next, reset the index of the `fastest_pit_stops` data frame. The `reset_index()` method allows you to reset the index back to the default 0, 1, 2, etc. indexes. By default, this method will keep the "old" indexes in a column named "index"; to avoid this, use the drop parameter. Think of this as keeping your data “flat and square” as opposed to “tiered”. If you are new to Python, now might be a good time to [learn about indexes for 5 minutes](https://towardsdatascience.com/the-basics-of-indexing-and-slicing-python-lists-2d12c90a94cf) since it's the foundation of how Python retrieves, slices, and dices data. The `inplace` argument means we overwrite the existing data frame permanently. Not to fear! This is what we want to do to avoid dealing with multi-indexed dataframes!
+ - Convert our Python column names to all uppercase using `.upper()`, so Snowflake recognizes them.
+ - Finally we are returning our dataframe with 2 decimal places for all the columns using the `round()` method.
+4. Zooming out a bit, what are we doing differently here in Python from our typical SQL code:
+ - Method chaining is a technique in which multiple methods are called on an object in a single statement, with each method call modifying the result of the previous one. The methods are called in a chain, with the output of one method being used as the input for the next one. The technique is used to simplify the code and make it more readable by eliminating the need for intermediate variables to store the intermediate results.
+    - The way you see method chaining in Python is the syntax `.().()`. For example, `.describe().sort_values(by='mean')`, where the `.describe()` method is chained to `.sort_values()` (see the small sketch after this list).
+ - The `.describe()` method is used to generate various summary statistics of the dataset. It's used on pandas dataframe. It gives a quick and easy way to get the summary statistics of your dataset without writing multiple lines of code.
+ - The `.sort_values()` method is used to sort a pandas dataframe or a series by one or multiple columns. The method sorts the data by the specified column(s) in ascending or descending order. It is the pandas equivalent to `order by` in SQL.
+
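+    As a minimal, standalone sketch of that chaining pattern (toy data only, not part of the project):
+
+    ```python
+    import pandas as pd
+
+    # toy pit stop durations in seconds for two hypothetical teams
+    df = pd.DataFrame({
+        "team": ["A", "A", "B", "B"],
+        "seconds": [2.1, 2.3, 3.0, 2.8],
+    })
+
+    # chain groupby -> describe -> sort_values, just like the model above
+    summary = df.groupby("team")["seconds"].describe().sort_values(by="mean")
+    print(summary)
+    ```
+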
+ We won’t go as in depth for our subsequent scripts, but will continue to explain at a high level what new libraries, functions, and methods are doing.
+
+5. Build the model using the UI, which will **execute**:
+
+ ```bash
+ dbt run --select fastest_pit_stops_by_constructor
+ ```
+
+ in the command bar.
+
+    Let’s look at some details of our first Python model to see what our model executed. There are two major differences we can see when running a Python model compared to a SQL model:
+
+ - Our Python model was executed as a stored procedure. Snowflake needs a way to know that it's meant to execute this code in a Python runtime, instead of interpreting in a SQL runtime. We do this by creating a Python stored proc, called by a SQL command.
+    - The `snowflake-snowpark-python` library has been picked up to execute our Python code. Even though we didn’t explicitly state it, dbt picks this up because we need our Snowpark package to run Python!
+
+    Python models take a bit longer to run than SQL models; however, we could always speed this up by using [Snowpark-optimized Warehouses](https://docs.snowflake.com/en/user-guide/warehouses-snowpark-optimized.html) if we wanted to. Our data is sufficiently small, so we won’t worry about creating a separate warehouse for Python versus SQL files today.
+
+
+ The rest of our **Details** output gives us information about how dbt and Snowpark for Python are working together to define class objects and apply a specific set of methods to run our models.
+
+ So which constructor had the fastest pit stops in 2021? Let’s look at our data to find out!
+
+6. We can't preview Python models directly, so let’s use the **+** button or the Control-n shortcut to create a new scratchpad file.
+7. Reference our Python model:
+
+ ```sql
+ select * from {{ ref('fastest_pit_stops_by_constructor') }}
+ ```
+
+ and preview the output:
+
+
+    Not only did Red Bull have the fastest average pit stops by nearly 40 seconds, they also had the smallest standard deviation, meaning they are both the fastest and the most consistent team in pit stops. By using the `.describe()` method we were able to avoid verbose SQL that would have required a line of code per column and repetitive use of the `PERCENTILE_CONT()` function.
+
+    Now we want to find the average lap time and its rolling average through the years (are lap times generally trending up or down?).
+
+8. Create a new file called `lap_times_moving_avg.py` in our `aggregates` folder.
+9. Copy the following code into the file:
+
+ ```python
+ import pandas as pd
+
+ def model(dbt, session):
+ # dbt configuration
+ dbt.config(packages=["pandas"])
+
+ # get upstream data
+ lap_times = dbt.ref("int_lap_times_years").to_pandas()
+
+ # describe the data
+ lap_times["LAP_TIME_SECONDS"] = lap_times["LAP_TIME_MILLISECONDS"]/1000
+ lap_time_trends = lap_times.groupby(by="RACE_YEAR")["LAP_TIME_SECONDS"].mean().to_frame()
+ lap_time_trends.reset_index(inplace=True)
+ lap_time_trends["LAP_MOVING_AVG_5_YEARS"] = lap_time_trends["LAP_TIME_SECONDS"].rolling(5).mean()
+ lap_time_trends.columns = lap_time_trends.columns.str.upper()
+
+ return lap_time_trends.round(1)
+ ```
+
+10. Breaking down our code a bit (a small rolling-mean sketch follows this list):
+    - We’re only using the `pandas` library for this model, and we cast the referenced data to a pandas data frame with `.to_pandas()`.
+    - Generate a new column called `LAP_TIME_SECONDS` by dividing the value of `LAP_TIME_MILLISECONDS` by 1000.
+    - Create the final dataframe. Get the lap time per year. Calculate the mean series and convert it to a data frame.
+    - Reset the index.
+    - Calculate the rolling 5-year mean.
+    - Round our numeric columns to one decimal place.
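+
+    As a tiny standalone sketch of what `.rolling(5).mean()` does (toy numbers, not our real lap times):
+
+    ```python
+    import pandas as pd
+
+    # one value per "year"; rolling(5) needs 5 values before it produces a mean
+    s = pd.Series([100, 98, 97, 96, 95, 94, 93])
+    print(s.rolling(5).mean())  # first four entries are NaN, then 5-value averages
+    ```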
+11. Now, run this model by using the UI **Run model** button or by entering
+
+ ```bash
+ dbt run --select lap_times_moving_avg
+ ```
+
+ in the command bar.
+
+12. Once again, preview the output of our data using the same steps we used for our `fastest_pit_stops_by_constructor` model.
+
+
+    We can see that lap times were getting consistently faster over time. Then in 2010 we see an increase! Using outside subject matter context, we know that significant rule changes were introduced to Formula 1 in 2010 and 2011, causing slower lap times.
+
+13. Now is a good time to checkpoint and commit our work to Git. Click **Commit and push** and give your commit a message like `aggregate python models` before moving on.
+
+### The dbt model, .source(), .ref() and .config() functions
+
+Let’s take a step back before starting machine learning to both review and go more in depth on the methods that make running dbt Python models possible. If you want to know more beyond this lab’s explanation, read the documentation [here](/docs/build/python-models?version=1.3).
+
+- `model(dbt, session)`. For starters, each Python model lives in a `.py` file in your `models/` folder. It defines a function named `model()`, which takes two parameters:
+  - `dbt` — A class compiled by dbt Core, unique to each model, that enables you to run your Python code in the context of your dbt project and DAG.
+  - `session` — A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames and to write DataFrames back to tables. In PySpark, by convention, the SparkSession is named `spark` and is available globally. For consistency across platforms, we always pass it into the `model` function as an explicit argument called `session`.
+- The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame.
+- `.source()` and `.ref()` functions. Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. If you want to read directly from a raw source table, use `dbt.source()`. We saw this in our earlier section using SQL with the `source` function. These functions behave the same at execution time; only the syntax differs. Use the `dbt.ref()` method within a Python model to read data from other models (SQL or Python). These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. A small sketch at the end of this section shows both side by side.
+- `.config()`. Just like SQL models, there are three ways to configure Python models:
+  - In your `dbt_project.yml` file, under the `models:` key
+  - In a dedicated `.yml` file, within the `models/` directory
+  - Within the model's `.py` file, using the `dbt.config()` method
+    - Calling the `dbt.config()` method will set configurations for your model within your `.py` file, similar to the `{{ config() }}` macro in `.sql` model files:
+
+ ```python
+ def model(dbt, session):
+
+ # setting configuration
+ dbt.config(materialized="table")
+ ```
+ - There's a limit to how complex you can get with the `dbt.config()` method. It accepts only literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `.config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the config property in a [YAML file](/reference/resource-properties/config). Learn more about configurations [here](/reference/model-configs).
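+
+As a minimal sketch that puts `.source()`, `.ref()`, and `.config()` together (this is not a file to create for the workshop; the source and model names come from earlier in this guide, and we read from both only to show the two functions side by side):
+
+```python
+def model(dbt, session):
+    # configure the model inline, similar to the {{ config() }} macro in SQL
+    dbt.config(materialized="table", packages=["pandas"])
+
+    # read directly from a raw source table defined in f1_sources.yml
+    circuits_raw = dbt.source("formula1", "circuits").to_pandas()
+
+    # read from another model in the DAG (SQL or Python)
+    circuits_staged = dbt.ref("stg_f1_circuits").to_pandas()
+
+    # a Python model must return a single DataFrame
+    return circuits_staged
+```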
+
+## Prepare for machine learning: cleaning, encoding, and splits
+
+Now that we’ve gained insights and business intelligence about Formula 1 at a descriptive level, we want to extend our capabilities into prediction. We’re going to take the scenario where we censor the data. This means we will act as though only the earlier data were available when training the model, and then apply that model to later data. In practice, this means we’ll take data from 2010-2019 to train our model and then predict 2020 data.
+
+In this section, we’ll be preparing our data to predict the final race position of a driver.
+
+At a high level we’ll be:
+
+- Creating new prediction features and filtering our dataset to active drivers
+- Encoding our data (algorithms like numbers) and simplifying our target variable called `position`
+- Splitting our dataset into training, testing, and validation
+
+### ML data prep
+
+1. To keep our project organized, we’ll need to create two new subfolders in our `ml` directory. Under the `ml` folder, make the subfolders `prep` and `train_predict`.
+2. Create a new file under `ml/prep` called `ml_data_prep.py`. Copy the following code into the file and **Save**.
+
+ ```python
+ import pandas as pd
+
+ def model(dbt, session):
+ # dbt configuration
+ dbt.config(packages=["pandas"])
+
+ # get upstream data
+ fct_results = dbt.ref("fct_results").to_pandas()
+
+ # provide years so we do not hardcode dates in filter command
+ start_year=2010
+ end_year=2020
+
+ # describe the data for a full decade
+ data = fct_results.loc[fct_results['RACE_YEAR'].between(start_year, end_year)]
+
+        # convert the position column to a numeric (float) type
+ data['POSITION'] = data['POSITION'].astype(float)
+
+ # we cannot have nulls if we want to use total pit stops
+ data['TOTAL_PIT_STOPS_PER_RACE'] = data['TOTAL_PIT_STOPS_PER_RACE'].fillna(0)
+
+ # some of the constructors changed their name over the year so replacing old names with current name
+ mapping = {'Force India': 'Racing Point', 'Sauber': 'Alfa Romeo', 'Lotus F1': 'Renault', 'Toro Rosso': 'AlphaTauri'}
+ data['CONSTRUCTOR_NAME'].replace(mapping, inplace=True)
+
+ # create confidence metrics for drivers and constructors
+ dnf_by_driver = data.groupby('DRIVER').sum()['DNF_FLAG']
+ driver_race_entered = data.groupby('DRIVER').count()['DNF_FLAG']
+ driver_dnf_ratio = (dnf_by_driver/driver_race_entered)
+ driver_confidence = 1-driver_dnf_ratio
+ driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence))
+
+ dnf_by_constructor = data.groupby('CONSTRUCTOR_NAME').sum()['DNF_FLAG']
+ constructor_race_entered = data.groupby('CONSTRUCTOR_NAME').count()['DNF_FLAG']
+ constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered)
+ constructor_relaiblity = 1-constructor_dnf_ratio
+ constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity))
+
+ data['DRIVER_CONFIDENCE'] = data['DRIVER'].apply(lambda x:driver_confidence_dict[x])
+ data['CONSTRUCTOR_RELAIBLITY'] = data['CONSTRUCTOR_NAME'].apply(lambda x:constructor_relaiblity_dict[x])
+
+ #removing retired drivers and constructors
+ active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes',
+ 'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull',
+ 'Haas F1 Team']
+ active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz',
+ 'Valtteri Bottas', 'Lance Stroll', 'George Russell',
+ 'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen',
+ 'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat',
+ 'Max Verstappen', 'Pierre Gasly', 'Alexander Albon',
+ 'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi',
+ 'Romain Grosjean','Nicholas Latifi']
+
+ # create flags for active drivers and constructors so we can filter downstream
+ data['ACTIVE_DRIVER'] = data['DRIVER'].apply(lambda x: int(x in active_drivers))
+ data['ACTIVE_CONSTRUCTOR'] = data['CONSTRUCTOR_NAME'].apply(lambda x: int(x in active_constructors))
+
+ return data
+ ```
+
+3. As usual, let’s break down what we are doing in this Python model:
+ - We’re first referencing our upstream `fct_results` table and casting it to a pandas dataframe.
+ - Filtering on years 2010-2020 since we’ll need to clean all our data we are using for prediction (both training and testing).
+    - Filling in empty data for `TOTAL_PIT_STOPS_PER_RACE` and mapping old constructor names to their current names to avoid erroneous predictions
+ - ⚠️ You might be wondering why we didn’t do this upstream in our `fct_results` table! The reason for this is that we want our machine learning cleanup to reflect the year 2020 for our predictions and give us an up-to-date team name. However, for business intelligence purposes we can keep the historical data at that point in time. Instead of thinking of one table as “one source of truth” we are creating different datasets fit for purpose: one for historical descriptions and reporting and another for relevant predictions.
+ - Create new confidence features for drivers and constructors
+ - Generate flags for the constructors and drivers that were active in 2020
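+
+    As a toy sketch of the groupby ratio pattern used for the confidence features above (synthetic data only):
+
+    ```python
+    import pandas as pd
+
+    df = pd.DataFrame({"DRIVER": ["a", "a", "b"], "DNF_FLAG": [1, 0, 0]})
+
+    # DNFs per driver divided by races entered per driver, flipped into a "confidence" score
+    dnf_ratio = df.groupby("DRIVER")["DNF_FLAG"].sum() / df.groupby("DRIVER")["DNF_FLAG"].count()
+    confidence = 1 - dnf_ratio
+    print(confidence)  # a: 0.5, b: 1.0
+    ```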
+4. Execute the following in the command bar:
+
+ ```bash
+ dbt run --select ml_data_prep
+ ```
+
+5. There are more aspects we could consider for this project, such as normalizing the driver confidence by the number of races entered. Including this would help account for a driver’s history and consider whether they are a new or long-time driver. We’re going to keep it simple for now, but these are some of the ways we can expand and improve our machine learning dbt projects. Breaking down our machine learning prep model:
+    - Lambda functions — We use some lambda functions to transform our data without having to create a fully-fledged function using the `def` notation. So what exactly are lambda functions?
+        - In Python, a lambda function is a small, anonymous function defined using the keyword `lambda`. Lambda functions are used to perform a quick operation, such as a mathematical calculation or a transformation on a list of elements. They are often used in conjunction with higher-order functions, such as `apply`, `map`, `filter`, and `reduce`.
+    - `.apply()` method — We used `.apply()` to apply our lambda expressions to columns, and we do this multiple times in our code. Let’s explain `.apply()` a little more (a small sketch follows this list):
+        - The `.apply()` function in the pandas library is used to apply a function to a DataFrame or a Series. In our case, the function we applied was our lambda function!
+        - When called on a DataFrame, `.apply()` takes the function to apply and an `axis` argument: `axis=0` (the default) applies the function to each column, while `axis=1` applies it to each row. In our code we call `.apply()` on a single column (a pandas Series), so the lambda simply runs on each element of that column.
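+
+    As a small, standalone sketch of the lambda-plus-`.apply()` pattern (toy data only):
+
+    ```python
+    import pandas as pd
+
+    df = pd.DataFrame({"DRIVER": ["alonso", "hamilton"], "DNF_FLAG": [0, 1]})
+
+    # element-wise apply on a single column (a Series), using a lambda
+    df["DRIVER_UPPER"] = df["DRIVER"].apply(lambda x: x.upper())
+
+    # row-wise apply on the whole DataFrame requires axis=1
+    df["LABEL"] = df.apply(lambda row: f"{row['DRIVER']}-{row['DNF_FLAG']}", axis=1)
+    print(df)
+    ```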
+6. Let’s look at the preview of our clean dataframe after running our `ml_data_prep` model:
+
+
+### Covariate encoding
+
+In this next part, we’ll be performing covariate encoding. Breaking down this phrase a bit, a *covariate* is a variable that is relevant to the outcome of a study or experiment, and *encoding* refers to the process of converting data (such as text or categorical variables) into a numerical format that can be used as input for a model. This is necessary because most machine learning algorithms can only work with numerical data. Algorithms don’t speak languages, have eyes to see images, etc. so we encode our data into numbers so algorithms can perform tasks by using calculations they otherwise couldn’t.
+
+🧠 We’ll think about this as: “algorithms like numbers”.
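+
+As a tiny, standalone sketch of what encoding means in practice (using scikit-learn's `LabelEncoder` on a toy list, separate from the model we're about to write):
+
+```python
+from sklearn.preprocessing import LabelEncoder
+
+teams = ["Ferrari", "McLaren", "Ferrari", "Red Bull"]
+
+le = LabelEncoder()
+encoded = le.fit_transform(teams)
+
+print(encoded)      # [0 1 0 2] -- each team name becomes an integer
+print(le.classes_)  # the mapping back to the original labels
+```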
+
+1. Create a new file under `ml/prep` called `covariate_encoding.py`, copy the code below into it, and save the file.
+
+ ```python
+ import pandas as pd
+ import numpy as np
+ from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
+ from sklearn.linear_model import LogisticRegression
+
+ def model(dbt, session):
+ # dbt configuration
+ dbt.config(packages=["pandas","numpy","scikit-learn"])
+
+ # get upstream data
+ data = dbt.ref("ml_data_prep").to_pandas()
+
+ # list out covariates we want to use in addition to outcome variable we are modeling - position
+ covariates = data[['RACE_YEAR','CIRCUIT_NAME','GRID','CONSTRUCTOR_NAME','DRIVER','DRIVERS_AGE_YEARS','DRIVER_CONFIDENCE','CONSTRUCTOR_RELAIBLITY','TOTAL_PIT_STOPS_PER_RACE','ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR', 'POSITION']]
+
+ # filter covariates on active drivers and constructors
+ # use fil_cov as short for "filtered_covariates"
+ fil_cov = covariates[(covariates['ACTIVE_DRIVER']==1)&(covariates['ACTIVE_CONSTRUCTOR']==1)]
+
+ # Encode categorical variables using LabelEncoder
+        # TODO: we'll update this to use one-hot encoding (OHE) in the future for non-ordinal variables!
+ le = LabelEncoder()
+ fil_cov['CIRCUIT_NAME'] = le.fit_transform(fil_cov['CIRCUIT_NAME'])
+ fil_cov['CONSTRUCTOR_NAME'] = le.fit_transform(fil_cov['CONSTRUCTOR_NAME'])
+ fil_cov['DRIVER'] = le.fit_transform(fil_cov['DRIVER'])
+ fil_cov['TOTAL_PIT_STOPS_PER_RACE'] = le.fit_transform(fil_cov['TOTAL_PIT_STOPS_PER_RACE'])
+
+        # Simplify target variable "position" to represent 3 meaningful categories in Formula 1
+ # 1. Podium position 2. Points for team 3. Nothing - no podium or points!
+ def position_index(x):
+ if x<4:
+ return 1
+ if x>10:
+ return 3
+ else :
+ return 2
+
+ # we are dropping the columns that we filtered on in addition to our training variable
+        encoded_data = fil_cov.drop(['ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR'], axis=1)
+ encoded_data['POSITION_LABEL']= encoded_data['POSITION'].apply(lambda x: position_index(x))
+        encoded_data_grouped_target = encoded_data.drop(['POSITION'], axis=1)
+
+ return encoded_data_grouped_target
+ ```
+
+2. Execute the following in the command bar:
+
+ ```bash
+ dbt run --select covariate_encoding
+ ```
+
+3. In this code, we are using a ton of functions from libraries! This is really cool, because we can utilize code other people have developed and bring it into our project simply by using the `import` statement. [Scikit-learn](https://scikit-learn.org/stable/), “sklearn” for short, is an extremely popular data science library. Sklearn contains a wide range of machine learning techniques, including supervised and unsupervised learning algorithms, feature scaling and imputation, as well as tools for model evaluation and selection. We’ll be using sklearn both for preparing our covariates and for creating models (our next section).
+4. Our dataset is pretty small, so we are fine using pandas and `sklearn`. If you have larger data in mind for your own project, consider `dask` or `category_encoders`.
+5. Breaking it down a bit more:
+ - We’re selecting a subset of variables that will be used as predictors for a driver’s position.
+ - Filter the dataset to only include rows using the active driver and constructor flags we created in the last step.
+ - The next step is to use the `LabelEncoder` from scikit-learn to convert the categorical variables `CIRCUIT_NAME`, `CONSTRUCTOR_NAME`, `DRIVER`, and `TOTAL_PIT_STOPS_PER_RACE` into numerical values.
+    - Create a new variable called `POSITION_LABEL`, which is derived from our position variable.
+ - 💭 Why are we changing our position variable? There are 20 total positions in Formula 1 and we are grouping them together to simplify the classification and improve performance. We also want to demonstrate you can create a new function within your dbt model!
+ - Our new `position_label` variable has meaning:
+ - In Formula1 if you are in:
+ - Top 3 you get a “podium” position
+ - Top 10 you gain points that add to your overall season total
+ - Below top 10 you get no points!
+      - We are mapping our original `position` variable to `position_label` values of 1, 2, and 3 for the corresponding groups above.
+ - Drop the active driver and constructor flags since they were filter criteria and additionally drop our original position variable.
+
+### Splitting into training and testing datasets
+
+Now that we’ve cleaned and encoded our data, we are going to further split it by time. In this step, we will create the dataframes to use for training and prediction: 1) one using data from 2010-2019 for training, and 2) one using data from 2020 for new prediction inferences. We’ll create variables called `start_year` and `end_year` so we aren’t filtering on hardcoded values (and can more easily swap them out in the future if we want to retrain our model on different timeframes).
+
+1. Create a file called `train_test_dataset`, then copy and save the following code:
+
+ ```python
+ import pandas as pd
+
+ def model(dbt, session):
+
+ # dbt configuration
+ dbt.config(packages=["pandas"], tags="train")
+
+ # get upstream data
+ encoding = dbt.ref("covariate_encoding").to_pandas()
+
+ # provide years so we do not hardcode dates in filter command
+ start_year=2010
+ end_year=2019
+
+        # filter the data to the full training decade (inclusive of both years)
+ train_test_dataset = encoding.loc[encoding['RACE_YEAR'].between(start_year, end_year)]
+
+ return train_test_dataset
+ ```
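+
+   If you're curious how `.between()` behaves, here's a tiny, self-contained illustration (the years below are made up): the filter is inclusive on both ends, so 2010 and 2019 rows are kept while 2020 falls through to the hold-out set we create next.
+
+   ```python
+   import pandas as pd
+
+   df = pd.DataFrame({"RACE_YEAR": [2009, 2010, 2015, 2019, 2020]})
+
+   # .between() includes both endpoints
+   decade = df.loc[df["RACE_YEAR"].between(2010, 2019)]
+   print(decade["RACE_YEAR"].tolist())  # [2010, 2015, 2019]
+   ```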
+
+2. Create a file called `hold_out_dataset_for_prediction`, then copy and save the following code. This gives us a dataset containing only the year 2020 that we’ll keep as a hold-out set, used much like new data arriving in a deployment scenario.
+
+ ```python
+ import pandas as pd
+
+ def model(dbt, session):
+ # dbt configuration
+ dbt.config(packages=["pandas"], tags="predict")
+
+ # get upstream data
+ encoding = dbt.ref("covariate_encoding").to_pandas()
+
+ # variable for year instead of hardcoding it
+ year=2020
+
+ # filter the data based on the specified year
+ hold_out_dataset = encoding.loc[encoding['RACE_YEAR'] == year]
+
+ return hold_out_dataset
+ ```
+
+3. Execute the following in the command bar:
+
+ ```bash
+ dbt run --select train_test_dataset hold_out_dataset_for_prediction
+ ```
+
+ To run our temporal data split models, we can use this syntax in the command line to run them both at once. Make sure you use a *space* [syntax](/reference/node-selection/syntax) between the model names to indicate you want to run both!
+4. **Commit and push** our changes to keep saving our work as we go, using the commit message `ml data prep and splits` before moving on.
+
+👏 Now that we’ve finished our machine learning prep work we can move onto the fun part — training and prediction!
+
+
+## Training a model to predict in machine learning
+
+We’re ready to start training a model to predict the driver’s position. Now is a good time to pause and take a step back: in most ML projects, you’ll try multiple algorithms during development and use an evaluation method such as cross validation to determine which algorithm to use. You can definitely do this in your dbt project, but for this lab we’ve decided on logistic regression to predict position (outside of this lab, we also tried other algorithms using cross validation, such as k-nearest neighbors and a support vector classifier, but they didn’t perform as well as the logistic regression, and a decision tree overfit).
+
+There are 3 areas to break down as we go, since we are working at the intersection of all three within one model file:
+
+1. Machine Learning
+2. Snowflake and Snowpark
+3. dbt Python models
+
+If you haven’t seen code like this before or used joblib files to save machine learning models, we’ll be going over them at a high level, and you can explore the links for a more technical deep dive along the way! Because Snowflake and dbt have abstracted away a lot of the nitty gritty about serialization and storing our model object to be called again, we won’t go into too much detail here. There’s *a lot* going on here, so take it at your pace!
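+
+If joblib is new to you, the core idea is simple: serialize the fitted model object to a file, then load it back later to make predictions. Here's a minimal, local-only sketch (the file name and toy data are made up for illustration):
+
+```python
+import joblib
+from sklearn.linear_model import LogisticRegression
+
+# train a tiny toy model
+model = LogisticRegression().fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])
+
+# serialize it to disk, then load it back, just like the stage-backed flow below
+joblib.dump(model, "toy_model.joblib")
+restored = joblib.load("toy_model.joblib")
+print(restored.predict([[3.0]]))  # [1], the same answer as the original model
+```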
+
+### Training and saving a machine learning model
+
+1. Project organization remains key, so let’s make a new subfolder called `train_predict` under the `ml` folder.
+2. Now create a new file called `train_test_position` and copy and save the following code:
+
+ ```python
+ import snowflake.snowpark.functions as F
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+ from sklearn.metrics import confusion_matrix, balanced_accuracy_score
+ import io
+ from sklearn.linear_model import LogisticRegression
+ from joblib import dump, load
+ import joblib
+ import logging
+ import sys
+
+ logger = logging.getLogger("mylog")
+
+ def save_file(session, model, path, dest_filename):
+ input_stream = io.BytesIO()
+ joblib.dump(model, input_stream)
+ session._conn.upload_stream(input_stream, path, dest_filename)
+ return "successfully created file: " + path
+
+ def model(dbt, session):
+ dbt.config(
+            packages = ['numpy','scikit-learn','pandas','joblib','cachetools'],
+ materialized = "table",
+ tags = "train"
+ )
+ # Create a stage in Snowflake to save our model file
+ session.sql('create or replace stage MODELSTAGE').collect()
+
+ #session._use_scoped_temp_objects = False
+ version = "1.0"
+ logger.info('Model training version: ' + version)
+
+ # read in our training and testing upstream dataset
+ test_train_df = dbt.ref("train_test_dataset")
+
+ # cast snowpark df to pandas df
+ test_train_pd_df = test_train_df.to_pandas()
+ target_col = "POSITION_LABEL"
+
+ # split out covariate predictors, x, from our target column position_label, y.
+ split_X = test_train_pd_df.drop([target_col], axis=1)
+ split_y = test_train_pd_df[target_col]
+
+ # Split out our training and test data into proportions
+ X_train, X_test, y_train, y_test = train_test_split(split_X, split_y, train_size=0.7, random_state=42)
+ train = [X_train, y_train]
+ test = [X_test, y_test]
+ # now we are only training our one model to deploy
+ # we are keeping the focus on the workflows and not algorithms for this lab!
+ model = LogisticRegression()
+
+ # fit the preprocessing pipeline and the model together
+ model.fit(X_train, y_train)
+        # generate class predictions (1, 2, or 3) for the held-back test rows
+        predictions = model.predict(X_test)
+        balanced_accuracy = balanced_accuracy_score(y_test, predictions)
+
+ # Save the model to a stage
+ save_file(session, model, "@MODELSTAGE/driver_position_"+version, "driver_position_"+version+".joblib" )
+ logger.info('Model artifact:' + "@MODELSTAGE/driver_position_"+version+".joblib")
+
+ # Take our pandas training and testing dataframes and put them back into snowpark dataframes
+ snowpark_train_df = session.write_pandas(pd.concat(train, axis=1, join='inner'), "train_table", auto_create_table=True, create_temp_table=True)
+ snowpark_test_df = session.write_pandas(pd.concat(test, axis=1, join='inner'), "test_table", auto_create_table=True, create_temp_table=True)
+
+ # Union our training and testing data together and add a column indicating train vs test rows
+ return snowpark_train_df.with_column("DATASET_TYPE", F.lit("train")).union(snowpark_test_df.with_column("DATASET_TYPE", F.lit("test")))
+ ```
+
+3. Execute the following in the command bar:
+
+ ```bash
+ dbt run --select train_test_position
+ ```
+
+4. Breaking down our Python script here:
+ - We’re importing some helpful libraries.
+   - Defining a function called `save_file()` that saves our logistic regression model file and takes four parameters: `session`, `model`, `path`, and `dest_filename`.
+ - `session` — an object representing a connection to Snowflake.
+     - `model` — the object that needs to be saved. In this case, it's a fitted scikit-learn model that can be serialized with joblib.
+ - `path` — a string representing the directory or bucket location where the file should be saved.
+ - `dest_filename` — a string representing the desired name of the file.
+ - Creating our dbt model
+     - Within this model we are creating a stage called `MODELSTAGE` to place our logistic regression `joblib` model file. This is really important since we need a place to keep our model so we can reuse it, and we want to ensure it's there. When using Snowpark commands, it's common to see the `.collect()` method used to ensure the action is performed. Think of the session as our “start” and collect as our “end” when [working with Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) (you can use ending methods other than collect).
+ - Using `.ref()` to connect into our `train_test_dataset` model.
+ - Now we see the machine learning part of our analysis:
+     - Create separate dataframes for our prediction features (x) and our target variable `position_label` (y).
+     - Split our dataset into 70% training and 30% testing (`train_size=0.7`), with a `random_state` specified for repeatable results.
+ - Specify our model is a logistic regression.
+ - Fit our model. In a logistic regression this means finding the coefficients that will give the least classification error.
+     - Generate class predictions on the test split (under the hood, logistic regression produces a probability between 0 and 1 for each class) and calculate a balanced accuracy to account for imbalances in the target variable (see the short sketch at the end of this list).
+     - Right now our model is only in memory, so we need to use our nifty `save_file` function to save our model file to our Snowflake stage. We save our model as a joblib file so Snowpark can easily call this model object back to create predictions. We really don’t need to know much else as a data practitioner unless we want to. It’s worth noting that joblib files can’t be queried directly with SQL. To do that, we would need to transform the joblib file into a SQL-queryable format such as JSON or CSV (out of scope for this workshop).
+     - Finally, we return our dataframe with a new column indicating which rows were used for training and which for testing.
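+
+   As a quick illustration of why we use balanced accuracy rather than plain accuracy (the labels below are made up), notice how a model that only ever predicts the majority class can look accurate while learning nothing about the rarer classes:
+
+   ```python
+   from sklearn.metrics import accuracy_score, balanced_accuracy_score
+
+   y_true = [1, 1, 1, 1, 2, 3]   # imbalanced: class 1 dominates
+   y_pred = [1, 1, 1, 1, 1, 1]   # a lazy model that always predicts class 1
+
+   print(accuracy_score(y_true, y_pred))           # 0.67, looks decent
+   print(balanced_accuracy_score(y_true, y_pred))  # 0.33, averages recall per class
+   ```
+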
+5. Viewing our output of this model:
+
+
+6. Let’s pop back over to Snowflake and check that our logistic regression model has been stored in our `MODELSTAGE` using the command:
+
+ ```sql
+ list @modelstage
+ ```
+
+
+
+7. To investigate the commands run as part of the `train_test_position` script, navigate to your Snowflake query history (**Activity > Query History**). We can view the portions of the query that we wrote, such as `create or replace stage MODELSTAGE`, but we also see additional queries that Snowflake runs to interpret the Python code.
+
+
+### Predicting on new data
+
+1. Create a new file called `predict_position` and copy and save the following code:
+
+ ```python
+ import logging
+ import joblib
+ import pandas as pd
+ import os
+ from snowflake.snowpark import types as T
+
+ DB_STAGE = 'MODELSTAGE'
+ version = '1.0'
+ # The name of the model file
+ model_file_path = 'driver_position_'+version
+ model_file_packaged = 'driver_position_'+version+'.joblib'
+
+ # This is a local directory, used for storing the various artifacts locally
+ LOCAL_TEMP_DIR = f'/tmp/driver_position'
+ DOWNLOAD_DIR = os.path.join(LOCAL_TEMP_DIR, 'download')
+ TARGET_MODEL_DIR_PATH = os.path.join(LOCAL_TEMP_DIR, 'ml_model')
+ TARGET_LIB_PATH = os.path.join(LOCAL_TEMP_DIR, 'lib')
+
+ # The feature columns that were used during model training
+ # and that will be used during prediction
+ FEATURE_COLS = [
+ "RACE_YEAR"
+ ,"CIRCUIT_NAME"
+ ,"GRID"
+ ,"CONSTRUCTOR_NAME"
+ ,"DRIVER"
+ ,"DRIVERS_AGE_YEARS"
+ ,"DRIVER_CONFIDENCE"
+ ,"CONSTRUCTOR_RELAIBLITY"
+ ,"TOTAL_PIT_STOPS_PER_RACE"]
+
+ def register_udf_for_prediction(p_predictor ,p_session ,p_dbt):
+
+ # The prediction udf
+
+ def predict_position(p_df: T.PandasDataFrame[int, int, int, int,
+ int, int, int, int, int]) -> T.PandasSeries[int]:
+ # Snowpark currently does not set the column name in the input dataframe
+ # The default col names are like 0,1,2,... Hence we need to reset the column
+ # names to the features that we initially used for training.
+ p_df.columns = [*FEATURE_COLS]
+
+ # Perform prediction. this returns an array object
+ pred_array = p_predictor.predict(p_df)
+ # Convert to series
+ df_predicted = pd.Series(pred_array)
+ return df_predicted
+
+ # The list of packages that will be used by UDF
+ udf_packages = p_dbt.config.get('packages')
+
+ predict_position_udf = p_session.udf.register(
+ predict_position
+ ,name=f'predict_position'
+ ,packages = udf_packages
+ )
+ return predict_position_udf
+
+ def download_models_and_libs_from_stage(p_session):
+ p_session.file.get(f'@{DB_STAGE}/{model_file_path}/{model_file_packaged}', DOWNLOAD_DIR)
+
+ def load_model(p_session):
+ # Load the model and initialize the predictor
+ model_fl_path = os.path.join(DOWNLOAD_DIR, model_file_packaged)
+ predictor = joblib.load(model_fl_path)
+ return predictor
+
+ # -------------------------------
+ def model(dbt, session):
+ dbt.config(
+ packages = ['snowflake-snowpark-python' ,'scipy','scikit-learn' ,'pandas' ,'numpy'],
+ materialized = "table",
+ tags = "predict"
+ )
+ session._use_scoped_temp_objects = False
+ download_models_and_libs_from_stage(session)
+ predictor = load_model(session)
+ predict_position_udf = register_udf_for_prediction(predictor, session ,dbt)
+
+ # Retrieve the data, and perform the prediction
+ hold_out_df = (dbt.ref("hold_out_dataset_for_prediction")
+ .select(*FEATURE_COLS)
+ )
+
+ # Perform prediction.
+        new_predictions_df = hold_out_df.with_column("position_predicted"
+ ,predict_position_udf(*FEATURE_COLS)
+ )
+
+ return new_predictions_df
+ ```
+
+2. Execute the following in the command bar:
+
+ ```bash
+ dbt run --select predict_position
+ ```
+
+3. **Commit and push** our changes to keep saving our work as we go using the commit message `logistic regression model training and application` before moving on.
+4. At a high level in this script, we are:
+ - Retrieving our staged logistic regression model
+ - Loading the model in
+ - Placing the model within a user defined function (UDF) to call in line predictions on our driver’s position
+5. At a more detailed level:
+ - Import our libraries.
+ - Create variables to reference back to the `MODELSTAGE` we just created and stored our model to.
+ - The temporary file paths we created might look intimidating, but all we’re doing here is programmatically using an initial file path and adding to it to create the following directories:
+ - LOCAL_TEMP_DIR ➡️ /tmp/driver_position
+ - DOWNLOAD_DIR ➡️ /tmp/driver_position/download
+ - TARGET_MODEL_DIR_PATH ➡️ /tmp/driver_position/ml_model
+ - TARGET_LIB_PATH ➡️ /tmp/driver_position/lib
+ - Provide a list of our feature columns that we used for model training and will now be used on new data for prediction.
+   - Next, we are creating our main function `register_udf_for_prediction(p_predictor ,p_session ,p_dbt):`. This function is used to register a user-defined function (UDF) that performs the machine learning prediction. It takes three parameters: `p_predictor` is an instance of the machine learning model, `p_session` is an instance of the Snowflake session, and `p_dbt` is the dbt object available inside the model. The function creates a UDF named `predict_position`, which takes a pandas dataframe with the input features and returns a pandas series with the predictions (see the short sketch at the end of this list).
+ - ⚠️ Pay close attention to the whitespace here. We are using a function within a function for this script.
+   - We have 2 simple helper functions that programmatically retrieve our file paths: `download_models_and_libs_from_stage` pulls our stored model out of `MODELSTAGE` and downloads it into the session, and `load_model` then loads the model contents (its parameters) so we can use it for prediction.
+   - Take the model we loaded, call it `predictor`, and wrap it in a UDF.
+ - Return our dataframe with both the features used to predict and the new label.
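+
+   Here's a hedged, local-only sketch of the pattern the UDF follows: rename the positional columns back to the training feature names, then call the loaded model's `predict`. The two feature names and the toy data are made up; in the real model the predictor comes from the joblib file in `MODELSTAGE`.
+
+   ```python
+   import pandas as pd
+   from sklearn.linear_model import LogisticRegression
+
+   FEATURE_COLS = ["GRID", "DRIVER_CONFIDENCE"]  # stand-in for the full feature list
+
+   # stand-in for the predictor loaded from the stage
+   predictor = LogisticRegression().fit(
+       pd.DataFrame({"GRID": [1, 2, 3, 4], "DRIVER_CONFIDENCE": [0.9, 0.8, 0.4, 0.2]}),
+       [1, 1, 2, 3],
+   )
+
+   def predict_position(p_df: pd.DataFrame) -> pd.Series:
+       # Snowpark hands the UDF positional column names (0, 1, ...), so restore the real ones
+       p_df.columns = [*FEATURE_COLS]
+       return pd.Series(predictor.predict(p_df))
+
+   print(predict_position(pd.DataFrame([[1, 0.95], [5, 0.3]])))
+   ```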
+
+🧠 Another way to read this script is from the bottom up. This can help us progressively see what is going into our final dbt model and work backwards to see how the other functions are being referenced.
+
+6. Let’s take a look at our predicted position alongside our feature variables. Open a new scratchpad and use the following query. I chose to order by the prediction of who would obtain a podium position:
+
+ ```sql
+ select * from {{ ref('predict_position') }} order by position_predicted
+ ```
+
+7. Now that we can see the predictions in our final dataset, we are ready to move on to testing!
+
+## Test your data models
+
+We have now completed building all the models for today’s lab, but how do we know if they meet our assertions? Put another way, how do we know if the quality of our data models is any good? This brings us to testing!
+
+We test data models for mainly two reasons:
+
+- Ensure that our source data is clean on ingestion before we start data modeling/transformation (aka avoid garbage in, garbage out problem).
+- Make sure we don’t introduce bugs in the transformation code we wrote (stop ourselves from creating bad joins/fanouts).
+
+Testing in dbt comes in two flavors: [generic](/docs/build/tests#generic-tests) and [singular](/docs/build/tests#singular-tests).
+
+You define them in a test block (similar to a macro) and once defined, you can reference them by name in your `.yml` files (applying them to models, columns, sources, snapshots, and seeds).
+
+You might be wondering: *what about testing Python models?*
+
+Since our Python models produce tables as output, we can test SQL and Python models the same way! We don’t have to worry about any syntax differences when testing SQL versus Python data models. This means we use `.yml` and `.sql` files to test our entities (tables, views, etc.). Under the hood, dbt runs a SQL query against our tables to see if they meet assertions. If no rows are returned, dbt will surface a passed test. Conversely, if a test returns rows, it will fail or warn depending on the configuration (more on that later).
+
+### Generic tests
+
+1. To implement generic out-of-the-box tests dbt comes with, we can use YAML files to specify information about our models. To add generic tests to our aggregates model, create a file called `aggregates.yml`, copy the code block below into the file, and save.
+
+
+ ```yaml
+ version: 2
+
+ models:
+ - name: fastest_pit_stops_by_constructor
+ description: Use the python .describe() method to retrieve summary statistics table about pit stops by constructor. Sort by average stop time ascending so the first row returns the fastest constructor.
+ columns:
+ - name: constructor_name
+ description: team that makes the car
+ tests:
+ - unique
+
+ - name: lap_times_moving_avg
+        description: Use the python .rolling() method to calculate the 5 year rolling average of lap times alongside the average for each year.
+ columns:
+ - name: race_year
+ description: year of the race
+ tests:
+ - relationships:
+ to: ref('int_lap_times_years')
+ field: race_year
+ ```
+
+2. Let’s unpack the code we have here. For both of our aggregate models, we provide the model name (so dbt knows which object we are referencing) and a description that will populate our documentation. At the column level (a level below our model), we provide the column name followed by our tests. We want to ensure our `constructor_name` is unique since we used a pandas `groupby` on `constructor_name` in the model `fastest_pit_stops_by_constructor`. Next, we want to ensure our `race_year` has referential integrity from the model we selected from, `int_lap_times_years`, into our subsequent `lap_times_moving_avg` model.
+3. Finally, if we want to see how tests were deployed on sources and SQL models, we can look at other files in our project such as the `f1_sources.yml` we created in our Sources and staging section.
+
+### Using macros for testing
+
+1. Under your `macros` folder, create a new file and name it `test_all_values_gte_zero.sql`. Copy the code block below and save the file. For clarity, “gte” is an abbreviation for greater than or equal to.
+
+
+ ```sql
+ {% macro test_all_values_gte_zero(table, column) %}
+
+ select * from {{ ref(table) }} where {{ column }} < 0
+
+ {% endmacro %}
+ ```
+
+2. Macros in Jinja are pieces of code that can be reused multiple times in our SQL models — they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models.
+3. We use `{% macro %}` to indicate the start of the macro and `{% endmacro %}` for the end. The text after the beginning of the macro block is the name we are giving the macro so we can call it later. In this case, our macro is called `test_all_values_gte_zero`. Macros take in *arguments* to pass through, in this case the `table` and the `column`. In the body of the macro, we see a SQL statement that is using the `ref` function to dynamically select the table and then the column. You can also invoke macros on their own, without building a model, by using `dbt run-operation`. You can learn more [here](https://docs.getdbt.com/reference/commands/run-operation).
+4. Great, now we want to reference this macro as a test! Let’s create a new test file called `macro_pit_stops_mean_is_positive.sql` in our `tests` folder.
+
+
+
+5. Copy the following code into the file and save:
+
+ ```sql
+ {{
+ config(
+ enabled=true,
+ severity='warn',
+ tags = ['bi']
+ )
+ }}
+
+ {{ test_all_values_gte_zero('fastest_pit_stops_by_constructor', 'mean') }}
+ ```
+
+6. In our testing file, we are applying some configurations to the test including `enabled`, which is an optional configuration for disabling models, seeds, snapshots, and tests. Our severity is set to `warn` instead of `error`, which means our pipeline will still continue to run. We have tagged our test with `bi` since we are applying this test to one of our bi models.
+
+Then, in our final line, we are calling the `test_all_values_gte_zero` macro that takes in our table and column arguments and inputting our table `'fastest_pit_stops_by_constructor'` and the column `'mean'`.
+
+### Custom singular tests to validate Python models
+
+The simplest way to define a test is by writing the exact SQL that will return failing records. We call these "singular" tests, because they're one-off assertions usable for a single purpose.
+
+These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your test-paths config). You can use Jinja in SQL models (including ref and source) in the test definition, just like you can when creating models. Each `.sql` file contains one select statement, and it defines one test.
+
+Let’s add a custom test that asserts that the moving average of the lap time over the last 5 years is greater than zero (it’s impossible for a lap time to be less than 0!). If this is not the case, it's a strong signal that the data has been corrupted.
+
+1. Create a file `lap_times_moving_avg_assert_positive_or_null.sql` under the `tests` folder.
+
+
+2. Copy the following code and save the file:
+
+ ```sql
+ {{
+ config(
+ enabled=true,
+ severity='error',
+ tags = ['bi']
+ )
+ }}
+
+ with lap_times_moving_avg as ( select * from {{ ref('lap_times_moving_avg') }} )
+
+ select *
+ from lap_times_moving_avg
+ where lap_moving_avg_5_years < 0 and lap_moving_avg_5_years is not null
+ ```
+
+### Putting all our tests together
+
+1. Time to run our tests! Altogether, we have created 4 tests for our 2 Python models:
+ - `fastest_pit_stops_by_constructor`
+ - Unique `constructor_name`
+     - Mean pit stop times are greater than or equal to 0 (no negative time values)
+ - `lap_times_moving_avg`
+ - Referential test on `race_year`
+     - Lap time moving averages are greater than 0 or null (to allow for the first leading values in a rolling calculation)
+2. To run the tests on both our models, we can use this syntax in the command line to run them both at once, similar to how we did our data splits earlier.
+ Execute the following in the command bar:
+
+ ```bash
+ dbt test --select fastest_pit_stops_by_constructor lap_times_moving_avg
+ ```
+
+
+
+3. All 4 of our tests passed (yay for clean data)! To understand the SQL being run against each of our tables, we can click into the details of the test.
+4. Navigating into the **Details** of the `unique_fastest_pit_stops_by_constructor_name` test, we can see the compiled SQL asserting that each `constructor_name` should only have one row.
+
+
+## Document your dbt project
+
+When it comes to documentation, dbt brings together the column- and model-level descriptions that you provide with details from your Snowflake information schema, and presents them in a static site for consumption by other data team members and stakeholders.
+
+We are going to revisit 2 areas of our project to understand our documentation:
+
+- `intermediate.md` file
+- `dbt_project.yml` file
+
+To start, let’s look back at our `intermediate.md` file. We can see that we provided multi-line descriptions for the models in our intermediate folder using [docs blocks](/docs/collaborate/documentation#using-docs-blocks). Then we reference these docs blocks in our `.yml` file. Building descriptions with docs blocks in Markdown files gives you the ability to format your descriptions with Markdown, and it is particularly helpful when building long descriptions, either at the column or model level. In our `dbt_project.yml`, we added `node_colors` at folder levels.
+
+1. To see all these pieces come together, execute this in the command bar:
+
+ ```bash
+ dbt docs generate
+ ```
+
+ This will generate the documentation for your project. Click the book button, as shown in the screenshot below to access the docs.
+
+
+2. Go to our project area and view `int_results`. View the description that we created in our doc block.
+
+
+3. View the mini-lineage that looks at the model we are currently selected on (`int_results` in this case).
+
+
+4. In our `dbt_project.yml`, we configured `node_colors` depending on the file directory. Starting in dbt v1.3, this color coding shows up in the lineage in our docs. By color coding your project, you can cluster together similar models or steps and more easily troubleshoot.
+
+
+
+## Deploy your code
+
+Before we jump into deploying our code, let's have a quick primer on environments. Up to this point, all of the work we've done in the dbt Cloud IDE has been in our development environment, with code committed to a feature branch and the models we've built created in our development schema in Snowflake, as defined in our Development environment connection. Doing this work on a feature branch allows us to separate our code from what other coworkers are building and from code that is already deemed production ready. Building models in a development schema in Snowflake allows us to separate the database objects we might still be modifying and testing from the database objects running production dashboards or other downstream dependencies. Together, the combination of a Git branch and Snowflake database objects forms our environment.
+
+Now that we've completed testing and documenting our work, we're ready to deploy our code from our development environment to our production environment. This involves two steps:
+
+- Promoting code from our feature branch to the production branch in our repository.
+ - Generally, the production branch is going to be named your main branch and there's a review process to go through before merging code to the main branch of a repository. Here we are going to merge without review for ease of this workshop.
+- Deploying code to our production environment.
+ - Once our code is merged to the main branch, we'll need to run dbt in our production environment to build all of our models and run all of our tests. This will allow us to build production-ready objects into our production environment in Snowflake. Luckily for us, the Partner Connect flow has already created our deployment environment and job to facilitate this step.
+
+1. Before getting started, let's make sure that we've committed all of our work to our feature branch. If you still have work to commit, you'll be able to select **Commit and push**, provide a message, and then select **Commit** again.
+2. Once all of your work is committed, the git workflow button will now appear as **Merge to main**. Select **Merge to main** and the merge process will automatically run in the background.
+
+
+3. When it's completed, you should see the git button read **Create branch** and the branch you're currently looking at will become **main**.
+4. Now that all of our development work has been merged to the main branch, we can build our deployment job. Given that our production environment and production job were created automatically for us through Partner Connect, all we need to do here is update some default configurations to meet our needs.
+5. In the menu, select **Deploy > Environments**.
+
+
+6. You should see two environments listed and you'll want to select the **Deployment** environment then **Settings** to modify it.
+7. Before making any changes, let's touch on what is defined within this environment. The Snowflake connection shows the credentials that dbt Cloud is using for this environment and in our case they are the same as what was created for us through Partner Connect. Our deployment job will build in our `PC_DBT_DB` database and use the default Partner Connect role and warehouse to do so. The deployment credentials section also uses the info that was created in our Partner Connect job to create the credential connection. However, it is using the same default schema that we've been using as the schema for our development environment.
+8. Let's update the schema to create a new schema specifically for our production environment. Click **Edit** to modify the existing field values, then navigate to **Deployment Credentials > schema**.
+9. Update the schema name to **production**. Remember to select **Save** after you've made the change.
+
+10. By updating the schema for our production environment to **production**, it ensures that our deployment job for this environment will build our dbt models in the **production** schema within the `PC_DBT_DB` database as defined in the Snowflake Connection section.
+11. Now let's switch over to our production job. Click on the deploy tab again and then select **Jobs**. You should see an existing and preconfigured **Partner Connect Trial Job**. Similar to the environment, click on the job, then select **Settings** to modify it. Let's take a look at the job to understand it before making changes.
+
+ - The Environment section is what connects this job with the environment we want it to run in. This job is already defaulted to use the Deployment environment that we just updated and the rest of the settings we can keep as is.
+ - The Execution settings section gives us the option to generate docs, run source freshness, and defer to a previous run state. For the purposes of our lab, we're going to keep these settings as is as well and stick with just generating docs.
+ - The Commands section is where we specify exactly which commands we want to run during this job, and we also want to keep this as is. We want our seed to be uploaded first, then run our models, and finally test them. The order of this is important as well, considering that we need our seed to be created before we can run our incremental model, and we need our models to be created before we can test them.
+ - Finally, we have the Triggers section, where we have a number of different options for scheduling our job. Given that our data isn't updating regularly here and we're running this job manually for now, we're also going to leave this section alone.
+
+ So, what are we changing then? Just the name! Click **Edit** to allow you to make changes. Then update the name of the job to **Production Job** to denote this as our production deployment job. After that's done, click **Save**.
+12. Now let's go run our job. Clicking on the job name in the path at the top of the screen will take you back to the job run history page, where you'll be able to click **Run now** to kick off the job. If you encounter any job failures, try running the job again before further troubleshooting.
+
+
+
+13. Let's go over to Snowflake to confirm that everything built as expected in our production schema. Refresh the database objects in your Snowflake account and you should see the production schema now within our default Partner Connect database. If you click into the schema and everything ran successfully, you should be able to see all of the models we developed.
+
+
+### Conclusion
+
+Fantastic! You’ve finished the workshop! We hope you feel empowered in using both SQL and Python in your dbt Cloud workflows with Snowflake. Having a reliable pipeline to surface both analytics and machine learning is crucial to creating tangible business value from your data.
+
+For more help and information, join our [dbt community Slack](https://www.getdbt.com/community/), which contains more than 50,000 data practitioners today. We have a dedicated Slack channel, #db-snowflake, for Snowflake-related content. Happy dbt'ing!
diff --git a/website/docs/guides/best-practices/debugging-errors.md b/website/docs/guides/debug-errors.md
similarity index 90%
rename from website/docs/guides/best-practices/debugging-errors.md
rename to website/docs/guides/debug-errors.md
index 39670820ddd..febfb6ac422 100644
--- a/website/docs/guides/best-practices/debugging-errors.md
+++ b/website/docs/guides/debug-errors.md
@@ -1,13 +1,18 @@
---
-title: "Debugging errors"
-id: "debugging-errors"
+title: "Debug errors"
+id: "debug-errors"
description: Learn about errors and the art of debugging them.
displayText: Debugging errors
hoverSnippet: Learn about errors and the art of debugging those errors.
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Troubleshooting', 'dbt Core', 'dbt Cloud']
+level: 'Beginner'
+recently_updated: true
---
-
## General process of debugging
+
Learning how to debug is a skill, and one that will make you great at your role!
1. Read the error message — when writing the code behind dbt, we try our best to make error messages as useful as we can. The error message dbt produces will normally contain the type of error (more on these error types below), and the file where the error occurred.
2. Inspect the file that was known to cause the issue, and see if there's an immediate fix.
@@ -17,7 +22,7 @@ Learning how to debug is a skill, and one that will make you great at your role!
- The `target/run` directory contains the SQL dbt executes to build your models.
- The `logs/dbt.log` file contains all the queries that dbt runs, and additional logging. Recent errors will be at the bottom of the file.
- **dbt Cloud users**: Use the above, or the `Details` tab in the command output.
- - **dbt CLI users**: Note that your code editor _may_ be hiding these files from the tree [VSCode help](https://stackoverflow.com/questions/42891463/how-can-i-show-ignored-files-in-visual-studio-code)).
+ - **dbt Core users**: Note that your code editor _may_ be hiding these files from the tree [VSCode help](https://stackoverflow.com/questions/42891463/how-can-i-show-ignored-files-in-visual-studio-code)).
5. If you are really stuck, try [asking for help](/community/resources/getting-help). Before doing so, take the time to write your question well so that others can diagnose the problem quickly.
@@ -184,7 +189,7 @@ hello: world # this is not allowed
## Compilation Errors
-_Note: if you're using the dbt Cloud IDE to work on your dbt project, this error often shows as a red bar in your command prompt as you work on your dbt project. For dbt CLI users, these won't get picked up until you run `dbt run` or `dbt compile`._
+_Note: if you're using the dbt Cloud IDE to work on your dbt project, this error often shows as a red bar in your command prompt as you work on your dbt project. For dbt Core users, these won't get picked up until you run `dbt run` or `dbt compile`._
### Invalid `ref` function
@@ -228,7 +233,7 @@ To fix this:
- Use the error message to find your mistake
To prevent this:
-- _(dbt CLI users only)_ Use snippets to auto-complete pieces of Jinja ([atom-dbt package](https://github.com/dbt-labs/atom-dbt), [vscode-dbt extestion](https://marketplace.visualstudio.com/items?itemName=bastienboutonnet.vscode-dbt))
+- _(dbt Core users only)_ Use snippets to auto-complete pieces of Jinja ([atom-dbt package](https://github.com/dbt-labs/atom-dbt), [vscode-dbt extension](https://marketplace.visualstudio.com/items?itemName=bastienboutonnet.vscode-dbt))
@@ -280,7 +285,7 @@ To fix this:
- Find the mistake and fix it
To prevent this:
-- (dbt CLI users) Turn on indentation guides in your code editor to help you inspect your files
+- (dbt Core users) Turn on indentation guides in your code editor to help you inspect your files
- Use a YAML validator ([example](http://www.yamllint.com/)) to debug any issues
@@ -341,10 +346,10 @@ Database Error in model customers (models/customers.sql)
90% of the time, there's a mistake in the SQL of your model. To fix this:
1. Open the offending file:
- **dbt Cloud:** Open the model (in this case `models/customers.sql` as per the error message)
- - **dbt CLI:** Open the model as above. Also open the compiled SQL (in this case `target/run/jaffle_shop/models/customers.sql` as per the error message) — it can be useful to show these side-by-side in your code editor.
+ - **dbt Core:** Open the model as above. Also open the compiled SQL (in this case `target/run/jaffle_shop/models/customers.sql` as per the error message) — it can be useful to show these side-by-side in your code editor.
2. Try to re-execute the SQL to isolate the error:
- **dbt Cloud:** Use the `Preview` button from the model file
- - **dbt CLI:** Copy and paste the compiled query into a query runner (e.g. the Snowflake UI, or a desktop app like DataGrip / TablePlus) and execute it
+ - **dbt Core:** Copy and paste the compiled query into a query runner (e.g. the Snowflake UI, or a desktop app like DataGrip / TablePlus) and execute it
3. Fix the mistake.
4. Rerun the failed model.
@@ -356,7 +361,7 @@ In some cases, these errors might occur as a result of queries that dbt runs "be
In these cases, you should check out the logs — this contains _all_ the queries dbt has run.
- **dbt Cloud**: Use the `Details` in the command output to see logs, or check the `logs/dbt.log` file
-- **dbt CLI**: Open the `logs/dbt.log` file.
+- **dbt Core**: Open the `logs/dbt.log` file.
:::tip Isolating errors in the logs
If you're hitting a strange `Database Error`, it can be a good idea to clean out your logs by opening the file, and deleting the contents. Then, re-execute `dbt run` for _just_ the problematic model. The logs will _just_ have the output you're looking for.
@@ -379,6 +384,6 @@ Using the `Preview` button is useful when developing models and you want to visu
We’ve all been there. dbt uses the last-saved version of a file when you execute a command. In most code editors, and in the dbt Cloud IDE, a dot next to a filename indicates that a file has unsaved changes. Make sure you hit `cmd + s` (or equivalent) before running any dbt commands — over time it becomes muscle memory.
### Editing compiled files
-_(More likely for dbt CLI users)_
+_(More likely for dbt Core users)_
If you just opened a SQL file in the `target/` directory to help debug an issue, it's not uncommon to accidentally edit that file! To avoid this, try changing your code editor settings to grey out any files in the `target/` directory — the visual cue will help avoid the issue.
diff --git a/website/docs/guides/legacy/debugging-schema-names.md b/website/docs/guides/debug-schema-names.md
similarity index 81%
rename from website/docs/guides/legacy/debugging-schema-names.md
rename to website/docs/guides/debug-schema-names.md
index 6c869b5f8af..c7bf1a195b1 100644
--- a/website/docs/guides/legacy/debugging-schema-names.md
+++ b/website/docs/guides/debug-schema-names.md
@@ -1,7 +1,19 @@
---
-title: Debugging schema names
+title: Debug schema names
+id: debug-schema-names
+description: Learn how to debug schema names when models build under unexpected schemas.
+displayText: Debug schema names
+hoverSnippet: Learn how to debug schema names in dbt.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['dbt Core','Troubleshooting']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
+
If a model uses the [`schema` config](/reference/resource-properties/schema) but builds under an unexpected schema, here are some steps for debugging the issue.
:::info
@@ -12,10 +24,10 @@ You can also follow along via this video:
-### 1. Search for a macro named `generate_schema_name`
+## Search for a macro named `generate_schema_name`
Do a file search to check if you have a macro named `generate_schema_name` in the `macros` directory of your project.
-#### I do not have a macro named `generate_schema_name` in my project
+### You do not have a macro named `generate_schema_name` in your project
This means that you are using dbt's default implementation of the macro, as defined [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47C1-L60)
```sql
@@ -37,15 +49,14 @@ This means that you are using dbt's default implementation of the macro, as defi
Note that this logic is designed so that two dbt users won't accidentally overwrite each other's work by writing to the same schema.
-#### I have a `generate_schema_name` macro in my project that calls another macro
+### You have a `generate_schema_name` macro in your project that calls another macro
If your `generate_schema_name` macro looks like so:
```sql
{% macro generate_schema_name(custom_schema_name, node) -%}
{{ generate_schema_name_for_env(custom_schema_name, node) }}
{%- endmacro %}
```
-Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/include/global_project/macros/etc/get_custom_schema.sql#L43-L56).
-
+Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47-L60).
```sql
{% macro generate_schema_name_for_env(custom_schema_name, node) -%}
@@ -62,22 +73,22 @@ Your project is switching out the `generate_schema_name` macro for another macro
{%- endmacro %}
```
-#### I have a `generate_schema_name` macro with custom logic
+### You have a `generate_schema_name` macro with custom logic
If this is the case — it might be a great idea to reach out to the person who added this macro to your project, as they will have context here — you can use [GitHub's blame feature](https://docs.github.com/en/free-pro-team@latest/github/managing-files-in-a-repository/tracking-changes-in-a-file) to do this.
In all cases take a moment to read through the Jinja to see if you can follow the logic.
-### 2. Confirm your `schema` config
+## Confirm your `schema` config
Check if you are using the [`schema` config](/reference/resource-properties/schema) in your model, either via a `{{ config() }}` block, or from `dbt_project.yml`. In both cases, dbt passes this value as the `custom_schema_name` parameter of the `generate_schema_name` macro.
-### 3. Confirm your target values
+## Confirm your target values
Most `generate_schema_name` macros incorporate logic from the [`target` variable](/reference/dbt-jinja-functions/target), in particular `target.schema` and `target.name`. Use the docs [here](/reference/dbt-jinja-functions/target) to help you find the values of each key in this dictionary.
-### 4. Put the two together
+## Put the two together
Now, re-read through the logic of your `generate_schema_name` macro, and mentally plug in your `customer_schema_name` and `target` values.
@@ -87,7 +98,7 @@ You should find that the schema dbt is constructing for your model matches the o
Note that snapshots do not follow this behavior, check out the docs on [target_schema](/reference/resource-configs/target_schema) instead.
:::
-### 5. Adjust as necessary
+## Adjust as necessary
Now that you understand how a model's schema is being generated, you can adjust as necessary:
- You can adjust the logic in your `generate_schema_name` macro (or add this macro to your project if you don't yet have one and adjust from there)
diff --git a/website/docs/guides/dremio-lakehouse.md b/website/docs/guides/dremio-lakehouse.md
new file mode 100644
index 00000000000..1c59c04d175
--- /dev/null
+++ b/website/docs/guides/dremio-lakehouse.md
@@ -0,0 +1,196 @@
+---
+title: Build a data lakehouse with dbt Core and Dremio Cloud
+id: build-dremio-lakehouse
+description: Learn how to build a data lakehouse with dbt Core and Dremio Cloud.
+displayText: Build a data lakehouse with dbt Core and Dremio Cloud
+hoverSnippet: Learn how to build a data lakehouse with dbt Core and Dremio Cloud
+# time_to_complete: '30 minutes' commenting out until we test
+platform: 'dbt-core'
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Dremio', 'dbt Core']
+level: 'Intermediate'
+recently_updated: true
+---
+## Introduction
+
+This guide will demonstrate how to build a data lakehouse with dbt Core 1.5 or newer and Dremio Cloud. You can simplify and optimize your data infrastructure with dbt's robust transformation framework and Dremio’s open and easy data lakehouse. The integrated solution empowers companies to establish a strong data and analytics foundation, fostering self-service analytics and enhancing business insights, while simplifying operations by eliminating the need to write complex Extract, Transform, and Load (ETL) pipelines.
+
+### Prerequisites
+
+* You must have a [Dremio Cloud](https://docs.dremio.com/cloud/) account.
+* You must have Python 3 installed.
+* You must have dbt Core v1.5 or newer [installed](/docs/core/installation).
+* You must have the Dremio adapter 1.5.0 or newer [installed and configured](/docs/core/connect-data-platform/dremio-setup) for Dremio Cloud.
+* You must have basic working knowledge of Git and the command line interface (CLI).
+
+## Validate your environment
+
+Validate your environment by running the following commands in your CLI and verifying the results:
+
+```shell
+
+$ python3 --version
+Python 3.11.4 # Must be Python 3
+
+```
+
+```shell
+
+$ dbt --version
+Core:
+ - installed: 1.5.0 # Must be 1.5 or newer
+ - latest: 1.6.3 - Update available!
+
+ Your version of dbt-core is out of date!
+ You can find instructions for upgrading here:
+ https://docs.getdbt.com/docs/installation
+
+Plugins:
+ - dremio: 1.5.0 - Up to date! # Must be 1.5 or newer
+
+```
+
+## Getting started
+
+1. Clone the Dremio dbt Core sample project from the [GitHub repo](https://github.com/dremio-brock/DremioDBTSample/tree/master/dremioSamples).
+
+2. In your integrated development environment (IDE), open the relation.py file in the Dremio adapter directory:
+ `$HOME/Library/Python/3.9/lib/python/site-packages/dbt/adapters/dremio/relation.py`
+
+3. Find and update lines 51 and 52 to match the following syntax:
+
+```python
+
+PATTERN = re.compile(r"""((?:[^."']|"[^"]*"|'[^']*')+)""")
+return ".".join(PATTERN.split(identifier)[1::2])
+
+```
+
+The complete selection should look like this:
+
+```python
+def quoted_by_component(self, identifier, componentName):
+ if componentName == ComponentName.Schema:
+ PATTERN = re.compile(r"""((?:[^."']|"[^"]*"|'[^']*')+)""")
+ return ".".join(PATTERN.split(identifier)[1::2])
+ else:
+ return self.quoted(identifier)
+
+```
+
+You need to update this pattern because the plugin doesn’t support schema names in Dremio containing dots and spaces.
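+
+If you want to sanity check the updated pattern before running dbt, here's a small, hedged Python snippet (the identifier is made up) showing that the regex keeps a quoted folder name containing dots together as a single component instead of splitting on every dot:
+
+```python
+import re
+
+PATTERN = re.compile(r"""((?:[^."']|"[^"]*"|'[^']*')+)""")
+
+identifier = 'MySpace."folder.with.dots".my_view'
+parts = PATTERN.split(identifier)[1::2]
+print(parts)            # ['MySpace', '"folder.with.dots"', 'my_view']
+print(".".join(parts))  # MySpace."folder.with.dots".my_view
+```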
+
+## Build your pipeline
+
+1. Create a `profiles.yml` file at the `$HOME/.dbt/profiles.yml` path and add the following configs:
+
+```yaml
+
+dremioSamples:
+ outputs:
+ cloud_dev:
+ dremio_space: dev
+ dremio_space_folder: no_schema
+ object_storage_path: dev
+ object_storage_source: $scratch
+ pat:
+ cloud_host: api.dremio.cloud
+ cloud_project_id:
+ threads: 1
+ type: dremio
+ use_ssl: true
+ user:
+ target: dev
+
+```
+
+2. Execute the transformation pipeline:
+
+```shell
+
+$ dbt run -t cloud_dev
+
+```
+
+If the above configurations have been implemented, the output will look something like this:
+
+```shell
+
+17:24:16 Running with dbt=1.5.0
+17:24:17 Found 5 models, 0 tests, 0 snapshots, 0 analyses, 348 macros, 0 operations, 0 seed files, 2 sources, 0 exposures, 0 metrics, 0 groups
+17:24:17
+17:24:29 Concurrency: 1 threads (target='cloud_dev')
+17:24:29
+17:24:29 1 of 5 START sql view model Preparation.trips .................................. [RUN]
+17:24:31 1 of 5 OK created sql view model Preparation.trips .............................. [OK in 2.61s]
+17:24:31 2 of 5 START sql view model Preparation.weather ................................ [RUN]
+17:24:34 2 of 5 OK created sql view model Preparation.weather ........................... [OK in 2.15s]
+17:24:34 3 of 5 START sql view model Business.Transportation.nyc_trips .................. [RUN]
+17:24:36 3 of 5 OK created sql view model Business.Transportation.nyc_trips ............. [OK in 2.18s]
+17:24:36 4 of 5 START sql view model Business.Weather.nyc_weather ....................... [RUN]
+17:24:38 4 of 5 OK created sql view model Business.Weather.nyc_weather .................. [OK in 2.09s]
+17:24:38 5 of 5 START sql view model Application.nyc_trips_with_weather ................. [RUN]
+17:24:41 5 of 5 OK created sql view model Application.nyc_trips_with_weather ............ [OK in 2.74s]
+17:24:41
+17:24:41 Finished running 5 view models in 0 hours 0 minutes and 24.03 seconds (24.03s).
+17:24:41
+17:24:41 Completed successfully
+17:24:41
+17:24:41 Done. PASS=5 WARN=0 ERROR=0 SKIP=0 TOTAL=5
+
+```
+
+Now that you have a running environment and a completed job, you can view the data in Dremio and expand your code. This is a snapshot of the project structure in an IDE:
+
+
+
+## About the schema.yml
+
+The `schema.yml` file defines the Dremio sources and models to be used and which data models are in scope. In this guide's sample project, there are two data sources:
+
+1. The `NYC-weather.csv` stored in the **Samples** database and
+2. The `sample_data` from the **Samples** database.
+
+The models correspond to both weather and trip data respectively and will be joined for analysis.
+
+The sources can be found by navigating to the **Object Storage** section of the Dremio Cloud UI.
+
+
+
+## About the models
+
+**Preparation** — `preparation_trips.sql` and `preparation_weather.sql` are building views on top of the trips and weather data.
+
+**Business** — `business_transportation_nyc_trips.sql` applies some level of transformation on the `preparation_trips.sql` view. `Business_weather_nyc.sql` applies no transformation to the `preparation_weather.sql` view.
+
+**Application** — `application_nyc_trips_with_weather.sql` joins the output from the Business model. This is what your business users will consume.
+
+## The Job output
+
+When you run the dbt job, it creates a **dev** space folder that holds all the data assets created. This is what you will see in the Dremio Cloud UI. Spaces in Dremio are a way to organize data assets, which typically map to business units or data products.
+
+
+
+Open the **Application folder** and you will see the output of the simple transformation we did using dbt.
+
+
+
+## Query the data
+
+Now that you have run the job and completed the transformation, it's time to query your data. Click on the `nyc_trips_with_weather` view. That will take you to the SQL Runner page. Click **Show SQL Pane** on the upper right corner of the page.
+
+Run the following query:
+
+```sql
+
+SELECT vendor_id,
+ AVG(tip_amount)
+FROM dev.application."nyc_trips_with_weather"
+GROUP BY vendor_id
+
+```
+
+
+
+This completes the integration setup and data is ready for business consumption.
\ No newline at end of file
diff --git a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md b/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md
similarity index 94%
rename from website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md
rename to website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md
index 692106655ac..30221332355 100644
--- a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md
+++ b/website/docs/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md
@@ -4,19 +4,27 @@ id: how-to-use-databricks-workflows-to-run-dbt-cloud-jobs
description: Learn how to use Databricks workflows to run dbt Cloud jobs
displayText: "Use Databricks workflows to run dbt Cloud jobs"
hoverSnippet: Learn how to use Databricks workflows to run dbt Cloud jobs
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'databricks'
+hide_table_of_contents: true
+tags: ['Databricks', 'dbt Core','dbt Cloud','Orchestration']
+level: 'Intermediate'
+recently_updated: true
---
+## Introduction
+
Using Databricks workflows to call the dbt Cloud job API can be useful for several reasons:
1. **Integration with other ETL processes** — If you're already running other ETL processes in Databricks, you can use a Databricks workflow to trigger a dbt Cloud job after those processes are done.
-2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/dbt-cloud-job).
+2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/deploy-jobs).
3. [**Separation of concerns —**](https://en.wikipedia.org/wiki/Separation_of_concerns) Detailed logs for dbt jobs in the dbt Cloud environment can lead to more modularity and efficient debugging. By doing so, it becomes easier to isolate bugs quickly while still being able to see the overall status in Databricks.
4. **Custom job triggering —** Use a Databricks workflow to trigger dbt Cloud jobs based on custom conditions or logic that aren't natively supported by dbt Cloud's scheduling feature. This can give you more flexibility in terms of when and how your dbt Cloud jobs run.
-## Prerequisites
+### Prerequisites
- Active [Teams or Enterprise dbt Cloud account](https://www.getdbt.com/pricing/)
-- You must have a configured and existing [dbt Cloud job](/docs/deploy/dbt-cloud-job)
+- You must have a configured and existing [dbt Cloud deploy job](/docs/deploy/deploy-jobs)
- Active Databricks account with access to [Data Science and Engineering workspace](https://docs.databricks.com/workspace-index.html) and [Manage secrets](https://docs.databricks.com/security/secrets/index.html)
- [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html)
- **Note**: You only need to set up your authentication. Once you have set up your Host and Token and are able to run `databricks workspace ls /Users/`, you can proceed with the rest of this guide.
@@ -29,7 +37,7 @@ To use Databricks workflows for running dbt Cloud jobs, you need to perform the
- [Create a Databricks Python notebook](#create-a-databricks-python-notebook)
- [Configure the workflows to run the dbt Cloud jobs](#configure-the-workflows-to-run-the-dbt-cloud-jobs)
-### Set up a Databricks secret scope
+## Set up a Databricks secret scope
1. Retrieve **[User API Token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens#user-api-tokens) **or **[Service Account Token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#generating-service-account-tokens) **from dbt Cloud
2. Set up a **Databricks secret scope**, which is used to securely store your dbt Cloud API key.
@@ -47,7 +55,7 @@ databricks secrets put --scope --key --s
5. Replace **``** with the actual API key value that you copied from dbt Cloud in step 1.
-### Create a Databricks Python notebook
+## Create a Databricks Python notebook
1. [Create a **Databricks Python notebook**](https://docs.databricks.com/notebooks/notebooks-manage.html), which executes a Python script that calls the dbt Cloud job API.
@@ -165,7 +173,7 @@ DbtJobRunStatus.SUCCESS
You can cancel the job from dbt Cloud if necessary.
:::
-### Configure the workflows to run the dbt Cloud jobs
+## Configure the workflows to run the dbt Cloud jobs
You can set up workflows directly from the notebook OR by adding this notebook to one of your existing workflows:
@@ -206,6 +214,4 @@ You can set up workflows directly from the notebook OR by adding this notebook t
Multiple Workflow tasks can be set up using the same notebook by configuring the `job_id` parameter to point to different dbt Cloud jobs.
-## Closing
-
Using Databricks workflows to access the dbt Cloud job API can improve integration of your data pipeline processes and enable scheduling of more complex workflows.
diff --git a/website/docs/guides/legacy/creating-date-partitioned-tables.md b/website/docs/guides/legacy/creating-date-partitioned-tables.md
deleted file mode 100644
index 8c461dbe4a8..00000000000
--- a/website/docs/guides/legacy/creating-date-partitioned-tables.md
+++ /dev/null
@@ -1,117 +0,0 @@
----
-title: "BigQuery: Creating date-partitioned tables"
-id: "creating-date-partitioned-tables"
----
-
-
-:::caution Deprecated
-
-The functionality described below was introduced in dbt Core v0.10 (March 2018). In v1.0 (December 2021), it was deprecated in favor of [column-based partitioning](/reference/resource-configs/bigquery-configs#partition-clause) and [incremental modeling](/docs/build/incremental-models).
-
-:::
-
-dbt supports the creation of [date partitioned tables](https://cloud.google.com/bigquery/docs/partitioned-tables) in BigQuery.
-
-To configure a dbt model as a date partitioned , use the `materialized='table'` model configuration in conjunction with a list of `partitions`. dbt will execute your model query once for each specified partition. For example:
-
-
-
-```sql
-{{
- config(
- materialized='table',
- partitions=[20180101, 20180102],
- verbose=True
- )
-}}
-
-/*
-
-dbt will interpolate each `partition` wherever it finds [DBT__PARTITION_DATE]
-in your model code. This model will create a single table with two partitions:
- 1. 20180101
- 2. 20180102
-
-These partitions will be created by running the following query against
-each of the following date-sharded tables:
-
- 1. `snowplow`.`events_20180101`
- 2. `snowplow`.`events_20180102`
-
-*/
-
-select *
-from `snowplow`.`events_[DBT__PARTITION_DATE]`
-```
-
-
-
-To make this model more dynamic, we can use the `dbt.partition_range` macro to generate a list of 8-digit dates in a specified range. Further, dbt provides a handy macro, `date_sharded_table`, for getting a date-sharded by its prefix for a given date. Together, this looks like:
-
-
-
-```sql
-{{
- config(
- materialized='table',
- partitions=dbt.partition_range('20180101, 20180201'),
- verbose=True
- )
-}}
-
--- This model creates a date-partitioned table. There will be one
--- partition for each day between 20180101 and 20180201, inclusive.
--- The `date_sharded_table` macro below is sugar around [DBT__PARTITION_DATE]
-
-select *
-from `snowplow`.`{{ date_sharded_table('events_') }}`
-```
-
-
-
-Finally, it's frequently desirable to only update a date partitioned table for the last day of received data. This can be implemented using the above configurations in conjunction with a clever macro and some [command line variables](/docs/build/project-variables).
-
-First, the macro:
-
-
-
-```sql
-{% macro yesterday() %}
-
- {% set today = modules.datetime.date.today() %}
- {% set one_day = modules.datetime.timedelta(days=1) %}
- {% set yesterday = (today - one_day) %}
-
- {{ return(yesterday.strftime("%Y%m%d")) }}
-
-{% endmacro %}
-```
-
-
-
-Next, use it in the model:
-
-
-
-```sql
-{{
- config(
- materialized='table',
- partitions=dbt.partition_range(var('dates', default=yesterday())),
- verbose=True
- )
-}}
-
-select *
-from `snowplow`.`{{ date_sharded_table('events_') }}`
-```
-
-
-
-If a `dates` variable is provided (eg. on the command line with `--vars`), then dbt will create the partitions for that date range. Otherwise, dbt will create a partition for `yesterday`, overwriting it if it already exists.
-
-Here's an example of running this model for the first 3 days of 2018 as a part of a backfill:
-
-```
-dbt run --select partitioned_yesterday --vars 'dates: "20180101, 20180103"'
-```
diff --git a/website/docs/guides/legacy/videos.md b/website/docs/guides/legacy/videos.md
deleted file mode 100644
index 863029ff6d9..00000000000
--- a/website/docs/guides/legacy/videos.md
+++ /dev/null
@@ -1,13 +0,0 @@
----
-title: "Videos 🎥"
-id: "videos"
----
-
-Check out some cool videos about using and deploying dbt!
-
-## dbt tutorial (February, 2017)
-
-
-
-## dbt docs demo with GitLab (September, 2018)
-
diff --git a/website/docs/guides/manual-install-qs.md b/website/docs/guides/manual-install-qs.md
new file mode 100644
index 00000000000..61796fe008a
--- /dev/null
+++ b/website/docs/guides/manual-install-qs.md
@@ -0,0 +1,468 @@
+---
+title: "Quickstart for dbt Core from a manual install"
+id: manual-install
+description: "Connecting your warehouse to dbt Core using the CLI."
+level: 'Beginner'
+platform: 'dbt-core'
+icon: 'fa-light fa-square-terminal'
+tags: ['dbt Core','Quickstart']
+hide_table_of_contents: true
+---
+## Introduction
+
+When you use dbt Core to work with dbt, you will be editing files locally using a code editor, and running projects using a command line interface (CLI). If you'd rather edit files and run projects using the web-based Integrated Development Environment (IDE), you should refer to the [dbt Cloud quickstarts](/guides). You can also develop and run dbt commands using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) — a dbt Cloud powered command line.
+
+### Prerequisites
+
+* To use dbt Core, it's important that you know some basics of the Terminal. In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily.
+* Install dbt Core using the [installation instructions](/docs/core/installation) for your operating system.
+* Complete [Setting up (in BigQuery)](/guides/bigquery?step=2) and [Loading data (BigQuery)](/guides/bigquery?step=3).
+* [Create a GitHub account](https://github.com/join) if you don't already have one.
+
+### Create a starter project
+
+After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models.
+
+## Create a repository
+
+The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join).
+
+1. [Create a new GitHub repository](https://github.com/new) named `dbt-tutorial`.
+2. Select **Public** so the repository can be shared with others. You can always make it private later.
+3. Leave the default values for all other settings.
+4. Click **Create repository**.
+5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes).
+
+## Create a project
+
+Learn how to use a series of Terminal commands to create your project. dbt Core includes an `init` command that helps scaffold a dbt project.
+
+To create your dbt project:
+
+1. Make sure you have dbt Core installed and check the version using the `dbt --version` command:
+
+```shell
+dbt --version
+```
+
+2. Initiate the `jaffle_shop` project using the `init` command:
+
+```shell
+dbt init jaffle_shop
+```
+
+3. Navigate into your project's directory:
+
+```shell
+cd jaffle_shop
+```
+
+4. Use `pwd` to confirm that you are in the right spot:
+
+```shell
+$ pwd
+> /Users/BBaggins/dbt-tutorial/jaffle_shop
+```
+
+5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command.
+
+
+
+
+
+6. Update the following values in the `dbt_project.yml` file:
+
+
+
+```yaml
+name: jaffle_shop # Change from the default, `my_new_project`
+
+...
+
+profile: jaffle_shop # Change from the default profile name, `default`
+
+...
+
+models:
+ jaffle_shop: # Change from `my_new_project` to match the previous value for `name:`
+ ...
+```
+
+
+
+## Connect to BigQuery
+
+When developing locally, dbt connects to your data warehouse using a [profile](/docs/core/connect-data-platform/connection-profiles), which is a YAML file with all the connection details to your warehouse.
+
+1. Create a file in the `~/.dbt/` directory named `profiles.yml`.
+2. Move your BigQuery keyfile into this directory.
+3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted.
+
+
+
+```yaml
+jaffle_shop: # this needs to match the profile in your dbt_project.yml file
+ target: dev
+ outputs:
+ dev:
+ type: bigquery
+ method: service-account
+ keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile
+ project: grand-highway-265418 # Replace this with your project id
+ dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo
+ threads: 1
+ timeout_seconds: 300
+ location: US
+ priority: interactive
+```
+
+
+
+4. Run the `debug` command from your project to confirm that you can successfully connect:
+
+```shell
+$ dbt debug
+> Connection test: OK connection ok
+```
+
+
+
+
+
+### FAQs
+
+
+
+
+
+
+
+## Perform your first dbt run
+
+Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order.
+
+1. Enter the `run` command to build example models:
+
+```shell
+dbt run
+```
+
+You should have an output that looks like this:
+
+
+
+
+
+## Commit your changes
+
+Commit your changes so that the repository contains the latest code.
+
+1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository).
+
+```shell
+git init
+git branch -M main
+git add .
+git commit -m "Create a dbt project"
+git remote add origin https://github.com/USERNAME/dbt-tutorial.git
+git push -u origin main
+```
+
+2. Return to your GitHub repository to verify your new files have been added.
+
+### Build your first models
+
+Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)!
+In the next steps, you will take a sample query and turn it into a model in your dbt project.
+
+## Checkout a new git branch
+
+Check out a new git branch to work on new code:
+
+1. Create a new branch by using the `checkout` command and passing the `-b` flag:
+
+```shell
+$ git checkout -b add-customers-model
+> Switched to a new branch `add-customers-model`
+```
+
+## Build your first model
+
+
+1. Open your project in your favorite code editor.
+2. Create a new SQL file in the `models` directory, named `models/customers.sql`.
+3. Paste the following query into the `models/customers.sql` file.
+
+
+
+4. From the command line, enter `dbt run`.
+
+
+
+
+When you return to the BigQuery console, you can `select` from this model.
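+
+For example, using the example project ID and dataset from the `profiles.yml` shown earlier (substitute your own values before running), a quick check could look like this:
+
+```sql
+-- the project ID and dataset below come from the example profile above;
+-- replace them with your own before running
+select *
+from `grand-highway-265418`.dbt_bbagins.customers
+limit 10
+```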
+
+### FAQs
+
+
+
+
+
+
+
+## Change the way your model is materialized
+
+
+
+
+
+## Delete the example models
+
+
+
+## Build models on top of other models
+
+
+
+1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query.
+2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query.
+
+
+
+
+
+
+
+```sql
+select
+ id as customer_id,
+ first_name,
+ last_name
+
+from `dbt-tutorial`.jaffle_shop.customers
+```
+
+
+
+
+
+```sql
+select
+ id as order_id,
+ user_id as customer_id,
+ order_date,
+ status
+
+from `dbt-tutorial`.jaffle_shop.orders
+```
+
+
+
+
+
+
+
+
+
+```sql
+select
+ id as customer_id,
+ first_name,
+ last_name
+
+from jaffle_shop_customers
+```
+
+
+
+
+
+```sql
+select
+ id as order_id,
+ user_id as customer_id,
+ order_date,
+ status
+
+from jaffle_shop_orders
+```
+
+
+
+
+
+
+
+
+
+```sql
+select
+ id as customer_id,
+ first_name,
+ last_name
+
+from jaffle_shop.customers
+```
+
+
+
+
+
+```sql
+select
+ id as order_id,
+ user_id as customer_id,
+ order_date,
+ status
+
+from jaffle_shop.orders
+```
+
+
+
+
+
+
+
+
+
+```sql
+select
+ id as customer_id,
+ first_name,
+ last_name
+
+from raw.jaffle_shop.customers
+```
+
+
+
+
+
+```sql
+select
+ id as order_id,
+ user_id as customer_id,
+ order_date,
+ status
+
+from raw.jaffle_shop.orders
+```
+
+
+
+
+
+
+
+3. Edit the SQL in your `models/customers.sql` file as follows:
+
+
+
+```sql
+with customers as (
+
+ select * from {{ ref('stg_customers') }}
+
+),
+
+orders as (
+
+ select * from {{ ref('stg_orders') }}
+
+),
+
+customer_orders as (
+
+ select
+ customer_id,
+
+ min(order_date) as first_order_date,
+ max(order_date) as most_recent_order_date,
+ count(order_id) as number_of_orders
+
+ from orders
+
+ group by 1
+
+),
+
+final as (
+
+ select
+ customers.customer_id,
+ customers.first_name,
+ customers.last_name,
+ customer_orders.first_order_date,
+ customer_orders.most_recent_order_date,
+ coalesce(customer_orders.number_of_orders, 0) as number_of_orders
+
+ from customers
+
+ left join customer_orders using (customer_id)
+
+)
+
+select * from final
+
+```
+
+
+
+4. Execute `dbt run`.
+
+This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies.
+
+### FAQs {#faq-2}
+
+
+
+
+
+### Next steps
+
+
+
+You can also explore:
+
+* The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL.
+* The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the select statements that are running and the python logging happening when dbt runs.
+
+## Add tests to your models
+
+
+
+## Document your models
+
+
+
+3. Run the `dbt docs serve` command to launch the documentation in a local website.
+
+#### FAQs
+
+
+
+
+
+#### Next steps
+
+
+
+## Commit updated changes
+
+You need to commit the changes you made to the project so that the repository has your latest code.
+
+1. Add all your changes to git: `git add -A`
+2. Commit your changes: `git commit -m "Add customers model, tests, docs"`
+3. Push your changes to your repository: `git push`
+4. Navigate to your repository, and open a pull request to merge the code into your main branch.
+
+## Schedule a job
+
+We recommend using dbt Cloud as the easiest and most reliable way to [deploy jobs](/docs/deploy/deployments) and automate your dbt project in production.
+
+For more info on how to get started, refer to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs).
+
+
+
+For more information about using dbt Core to schedule a job, refer to the [dbt and Airflow](/blog/dbt-airflow-spiritual-alignment) blog post.
diff --git a/website/docs/guides/microsoft-fabric-qs.md b/website/docs/guides/microsoft-fabric-qs.md
new file mode 100644
index 00000000000..c7c53a2aac7
--- /dev/null
+++ b/website/docs/guides/microsoft-fabric-qs.md
@@ -0,0 +1,314 @@
+---
+title: "Quickstart for dbt Cloud and Microsoft Fabric"
+id: "microsoft-fabric"
+level: 'Beginner'
+icon: 'fabric'
+hide_table_of_contents: true
+tags: ['dbt Cloud','Quickstart']
+recently_updated: true
+---
+## Introduction
+
+In this quickstart guide, you'll learn how to use dbt Cloud with Microsoft Fabric. It will show you how to:
+
+- Load the Jaffle Shop sample data (provided by dbt Labs) into your Microsoft Fabric warehouse.
+- Connect dbt Cloud to Microsoft Fabric.
+- Turn a sample query into a model in your dbt project. A model in dbt is a SELECT statement.
+- Add tests to your models.
+- Document your models.
+- Schedule a job to run.
+
+:::tip Public preview
+
+A public preview of Microsoft Fabric in dbt Cloud is now available!
+
+:::
+
+### Prerequisites
+- You have a [dbt Cloud](https://www.getdbt.com/signup/) account.
+- You have started the Microsoft Fabric (Preview) trial. For details, refer to [Microsoft Fabric (Preview) trial](https://learn.microsoft.com/en-us/fabric/get-started/fabric-trial) in the Microsoft docs.
+- As a Microsoft admin, you’ve enabled service principal authentication. For details, refer to [Enable service principal authentication](https://learn.microsoft.com/en-us/fabric/admin/metadata-scanning-enable-read-only-apis) in the Microsoft docs. dbt Cloud needs these authentication credentials to connect to Microsoft Fabric.
+
+### Related content
+- [dbt Courses](https://courses.getdbt.com/collections)
+- [About continuous integration jobs](/docs/deploy/continuous-integration)
+- [Deploy jobs](/docs/deploy/deploy-jobs)
+- [Job notifications](/docs/deploy/job-notifications)
+- [Source freshness](/docs/deploy/source-freshness)
+
+## Load data into your Microsoft Fabric warehouse
+
+1. Log in to your [Microsoft Fabric](http://app.fabric.microsoft.com) account.
+2. On the home page, select the **Synapse Data Warehouse** tile.
+
+
+
+3. From **Workspaces** on the left sidebar, navigate to your organization’s workspace. Or, you can create a new workspace; refer to [Create a workspace](https://learn.microsoft.com/en-us/fabric/get-started/create-workspaces) in the Microsoft docs for more details.
+4. Choose your warehouse from the table. Or, you can create a new warehouse; refer to [Create a warehouse](https://learn.microsoft.com/en-us/fabric/data-warehouse/tutorial-create-warehouse) in the Microsoft docs for more details.
+5. Open the SQL editor by selecting **New SQL query** from the top bar.
+6. Copy these statements into the SQL editor to load the Jaffle Shop example data:
+
+ ```sql
+ DROP TABLE dbo.customers;
+
+ CREATE TABLE dbo.customers
+ (
+ [ID] [int],
+ [FIRST_NAME] [varchar] (8000),
+ [LAST_NAME] [varchar] (8000)
+ );
+
+ COPY INTO [dbo].[customers]
+ FROM 'https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/jaffle_shop_customers.parquet'
+ WITH (
+ FILE_TYPE = 'PARQUET'
+ );
+
+ DROP TABLE dbo.orders;
+
+ CREATE TABLE dbo.orders
+ (
+ [ID] [int],
+ [USER_ID] [int],
+ -- [ORDER_DATE] [int],
+ [ORDER_DATE] [date],
+ [STATUS] [varchar] (8000)
+ );
+
+ COPY INTO [dbo].[orders]
+ FROM 'https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/jaffle_shop_orders.parquet'
+ WITH (
+ FILE_TYPE = 'PARQUET'
+ );
+
+ DROP TABLE dbo.payments;
+
+ CREATE TABLE dbo.payments
+ (
+ [ID] [int],
+ [ORDERID] [int],
+ [PAYMENTMETHOD] [varchar] (8000),
+ [STATUS] [varchar] (8000),
+ [AMOUNT] [int],
+ [CREATED] [date]
+ );
+
+ COPY INTO [dbo].[payments]
+ FROM 'https://dbtlabsynapsedatalake.blob.core.windows.net/dbt-quickstart-public/stripe_payments.parquet'
+ WITH (
+ FILE_TYPE = 'PARQUET'
+ );
+ ```
+
+
+
+## Connect dbt Cloud to Microsoft Fabric
+
+1. Create a new project in dbt Cloud. From **Account settings** (using the gear menu in the top right corner), click **+ New Project**.
+2. Enter a project name and click **Continue**.
+3. Choose **Fabric** as your connection and click **Next**.
+4. In the **Configure your environment** section, enter the **Settings** for your new project:
+5. Enter the **Development credentials** for your new project:
+ - **Authentication** — Choose **Service Principal** from the dropdown.
+ - **Tenant ID** — Use the service principal’s **Directory (tenant) id** as the value.
+ - **Client ID** — Use the service principal’s **application (client) ID** as the value.
+ - **Client secret** — Use the service principal’s **client secret** (not the **client secret id**) as the value.
+6. Click **Test connection**. This verifies that dbt Cloud can access your Microsoft Fabric account.
+7. Click **Next** when the test succeeds. If it failed, you might need to check your Microsoft service principal.
+
+## Set up a dbt Cloud managed repository
+
+
+## Initialize your dbt project and start developing
+Now that you have a repository configured, you can initialize your project and start development in dbt Cloud:
+
+1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse.
+2. Above the file tree to the left, click **Initialize dbt project**. This builds out your folder structure with example models.
+3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` and click **Commit**. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code.
+4. You can now directly query data from your warehouse and execute `dbt run`. You can try this out now:
+ - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message.
+
+## Build your first model
+1. Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode.
+1. Click the **...** next to the `models` directory, then select **Create file**.
+1. Name the file `customers.sql`, then click **Create**.
+1. Copy the following query into the file and click **Save**.
+
+
+
+ ```sql
+ with customers as (
+
+ select
+ ID as customer_id,
+ FIRST_NAME as first_name,
+ LAST_NAME as last_name
+
+ from dbo.customers
+ ),
+
+ orders as (
+
+ select
+ ID as order_id,
+ USER_ID as customer_id,
+ ORDER_DATE as order_date,
+ STATUS as status
+
+ from dbo.orders
+ ),
+
+ customer_orders as (
+
+ select
+ customer_id,
+
+ min(order_date) as first_order_date,
+ max(order_date) as most_recent_order_date,
+ count(order_id) as number_of_orders
+
+ from orders
+
+ group by customer_id
+ ),
+
+ final as (
+
+ select
+ customers.customer_id,
+ customers.first_name,
+ customers.last_name,
+ customer_orders.first_order_date,
+ customer_orders.most_recent_order_date,
+ coalesce(customer_orders.number_of_orders, 0) as number_of_orders
+
+ from customers
+
+ left join customer_orders on customers.customer_id = customer_orders.customer_id
+ )
+
+ select * from final
+ ```
+
+
+1. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models.
+
+Later, you can connect your business intelligence (BI) tools to these views and tables so they read cleaned-up data rather than raw data.
+
+#### FAQs
+
+
+
+
+
+
+
+## Change the way your model is materialized
+
+
+
+## Delete the example models
+
+
+
+## Build models on top of other models
+
+
+
+1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query.
+2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query.
+
+
+
+ ```sql
+ select
+ ID as customer_id,
+ FIRST_NAME as first_name,
+ LAST_NAME as last_name
+
+ from dbo.customers
+ ```
+
+
+
+
+
+ ```sql
+ select
+ ID as order_id,
+ USER_ID as customer_id,
+ ORDER_DATE as order_date,
+ STATUS as status
+
+ from dbo.orders
+ ```
+
+
+
+3. Edit the SQL in your `models/customers.sql` file as follows:
+
+
+
+ ```sql
+ with customers as (
+
+ select * from {{ ref('stg_customers') }}
+
+ ),
+
+ orders as (
+
+ select * from {{ ref('stg_orders') }}
+
+ ),
+
+ customer_orders as (
+
+ select
+ customer_id,
+
+ min(order_date) as first_order_date,
+ max(order_date) as most_recent_order_date,
+ count(order_id) as number_of_orders
+
+ from orders
+
+ group by customer_id
+
+ ),
+
+ final as (
+
+ select
+ customers.customer_id,
+ customers.first_name,
+ customers.last_name,
+ customer_orders.first_order_date,
+ customer_orders.most_recent_order_date,
+ coalesce(customer_orders.number_of_orders, 0) as number_of_orders
+
+ from customers
+
+ left join customer_orders on customers.customer_id = customer_orders.customer_id
+
+ )
+
+ select * from final
+
+ ```
+
+
+
+4. Execute `dbt run`.
+
+ This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies.
+
+#### FAQs {#faq-2}
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md b/website/docs/guides/migrate-from-spark-to-databricks.md
similarity index 76%
rename from website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md
rename to website/docs/guides/migrate-from-spark-to-databricks.md
index f5549c58416..8fb02ae79d7 100644
--- a/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md
+++ b/website/docs/guides/migrate-from-spark-to-databricks.md
@@ -1,18 +1,34 @@
---
-title: "Migrating from dbt-spark to dbt-databricks"
-id: "migrating-from-spark-to-databricks"
+title: "Migrate from dbt-spark to dbt-databricks"
+id: "migrate-from-spark-to-databricks"
+description: Learn how to migrate from dbt-spark to dbt-databricks.
+displayText: Migrate from Spark to Databricks
+hoverSnippet: Learn how to migrate from dbt-spark to dbt-databricks.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Migration', 'dbt Core','dbt Cloud']
+level: 'Intermediate'
+recently_updated: true
---
-You can [migrate your projects](#migrate-your-dbt-projects) from using the `dbt-spark` adapter to using the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks). In collaboration with dbt Labs, Databricks built this adapter using dbt-spark as the foundation and added some critical improvements. With it, you get an easier set up — requiring only three inputs for authentication — and more features such as support for [Unity Catalog](https://www.databricks.com/product/unity-catalog).
+## Introduction
-## Simpler authentication
+You can migrate your projects from using the `dbt-spark` adapter to using the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks). In collaboration with dbt Labs, Databricks built this adapter using dbt-spark as the foundation and added some critical improvements. With it, you get an easier set up — requiring only three inputs for authentication — and more features such as support for [Unity Catalog](https://www.databricks.com/product/unity-catalog).
+
+### Prerequisites
+
+- Your project must be compatible with dbt 1.0 or greater. Refer to [Upgrading to v1.0](/docs/dbt-versions/core-upgrade/upgrading-to-v1.0) for details. For the latest version of dbt, refer to [Upgrading to v1.7](/docs/dbt-versions/core-upgrade/upgrading-to-v1.7).
+- For dbt Cloud, you need administrative (admin) privileges to migrate dbt projects.
+
+### Simpler authentication
Previously, you had to provide a `cluster` or `endpoint` ID which was hard to parse from the `http_path` that you were given. Now, it doesn't matter if you're using a cluster or an SQL endpoint because the [dbt-databricks setup](/docs/core/connect-data-platform/databricks-setup) requires the _same_ inputs for both. All you need to provide is:
- hostname of the Databricks workspace
- HTTP path of the Databricks SQL warehouse or cluster
- appropriate credentials
-## Better defaults
+### Better defaults
The `dbt-databricks` adapter provides better defaults than `dbt-spark` does. The defaults help optimize your workflow so you can get the fast performance and cost-effectiveness of Databricks. They are:
@@ -24,24 +40,14 @@ With dbt-spark, however, the default for `incremental_strategy` is `append`. If
For more information on defaults, see [Caveats](/docs/core/connect-data-platform/databricks-setup#caveats).
-## Pure Python
+### Pure Python
If you use dbt Core, you no longer have to download an independent driver to interact with Databricks. The connection information is all embedded in a pure-Python library called `databricks-sql-connector`.
-## Migrate your dbt projects
-
-In both dbt Core and dbt Cloud, you can migrate your projects to the Databricks-specific adapter from the generic Apache Spark adapter.
-
-### Prerequisites
-
-- Your project must be compatible with dbt 1.0 or greater. Refer to [Upgrading to v1.0](/guides/migration/versions/upgrading-to-v1.0) for details. For the latest version of dbt, refer to [Upgrading to v1.3](/guides/migration/versions/upgrading-to-v1.3).
-- For dbt Cloud, you need administrative (admin) privileges to migrate dbt projects.
-
-
-
+## Migrate your dbt projects in dbt Cloud
-
+You can migrate your projects to the Databricks-specific adapter from the generic Apache Spark adapter. If you're using dbt Core, then skip to Step 4.
The migration to the `dbt-databricks` adapter from `dbt-spark` shouldn't cause any downtime for production jobs. dbt Labs recommends that you schedule the connection change when usage of the IDE is light to avoid disrupting your team.
@@ -60,7 +66,7 @@ To update your Databricks connection in dbt Cloud:
Everyone in your organization who uses dbt Cloud must refresh the IDE before starting work again. It should refresh in less than a minute.
-#### About your credentials
+## Configure your credentials
When you update the Databricks connection in dbt Cloud, your team will not lose their credentials. This makes migrating easier since it only requires you to delete the Databricks connection and re-add the cluster or endpoint information.
@@ -70,9 +76,7 @@ These credentials will not get lost when there's a successful connection to Data
- The personal access tokens your team added in their dbt Cloud profile so they can develop in the IDE for a given project.
- The access token you added for each deployment environment so dbt Cloud can connect to Databricks during production jobs.
-
-
-
+## Migrate dbt projects in dbt Core
To migrate your dbt Core projects to the `dbt-databricks` adapter from `dbt-spark`, you:
1. Install the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks) in your environment
@@ -80,13 +84,8 @@ To migrate your dbt Core projects to the `dbt-databricks` adapter from `dbt-spar
Anyone who's using your project must also make these changes in their environment.
-
-
-
-
-
-### Examples
+## Try these examples
You can use the following examples of the `profiles.yml` file to see the authentication setup with `dbt-spark` compared to the simpler setup with `dbt-databricks` when connecting to an SQL endpoint. A cluster example would look similar.
diff --git a/website/docs/guides/migrate-from-stored-procedures.md b/website/docs/guides/migrate-from-stored-procedures.md
new file mode 100644
index 00000000000..c894bce9873
--- /dev/null
+++ b/website/docs/guides/migrate-from-stored-procedures.md
@@ -0,0 +1,377 @@
+---
+title: Migrate from DDL, DML, and stored procedures
+id: migrate-from-stored-procedures
+description: Learn how to transform from a historical codebase of mixed DDL and DML statements to dbt models, including tips and patterns for the shift from a procedural to a declarative approach in defining datasets.
+displayText: Migrate from DDL, DML, and stored procedures
+hoverSnippet: Learn how to transform from a historical codebase of mixed DDL and DML statements to dbt models
+# time_to_complete: '30 minutes' commenting out until we test
+platform: 'dbt-core'
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Migration', 'dbt Core']
+level: 'Beginner'
+recently_updated: true
+---
+
+## Introduction
+
+One of the more common situations that new dbt adopters encounter is a historical codebase of transformations written as a hodgepodge of DDL and DML statements, or stored procedures. Going from DML statements to dbt models is often a challenging hump for new users to get over, because the process involves a significant paradigm shift between a procedural flow of building a dataset (e.g. a series of DDL and DML statements) to a declarative approach to defining a dataset (e.g. how dbt uses SELECT statements to express data models). This guide aims to provide tips, tricks, and common patterns for converting DML statements to dbt models.
+
+### Preparing to migrate
+
+Before getting into the meat of conversion, it’s worth noting that DML statements will not always illustrate a comprehensive set of columns and column types that an original table might contain. Without knowing the DDL to create the table, it’s impossible to know precisely if your conversion effort is apples-to-apples, but you can generally get close.
+
+If your data warehouse supports `SHOW CREATE TABLE`, that can be a quick way to get a comprehensive set of columns you’ll want to recreate. If you don’t have the DDL, but are working on a substantial stored procedure, one approach that can work is to pull column lists out of any DML statements that modify the table, and build up a full set of the columns that appear.
+
+As for ensuring that you have the right column types, since models materialized by dbt generally use `CREATE TABLE AS SELECT` or `CREATE VIEW AS SELECT` as the driver for object creation, tables can end up with unintended column types if the queries aren’t explicit. For example, if you care about `INT` versus `DECIMAL` versus `NUMERIC`, it’s generally going to be best to be explicit. The good news is that this is easy with dbt: you just cast the column to the type you intend.
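+
+For example, a minimal sketch of making types explicit in a model (the column names, target types, and the upstream `raw_orders` model here are hypothetical, purely for illustration):
+
+```sql
+select
+    cast(order_id as integer) as order_id,
+    cast(order_total as decimal(18, 2)) as order_total,
+    cast(ordered_at as timestamp) as ordered_at
+
+from {{ ref('raw_orders') }}
+```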
+
+We also generally recommend that column renaming and type casting happen as close to the source tables as possible, typically in a layer of staging transformations, which helps ensure that future dbt modelers will know where to look for those transformations! See [How we structure our dbt projects](/best-practices/how-we-structure/1-guide-overview) for more guidance on overall project structure.
+
+### Operations we need to map
+
+There are four primary DML statements that you are likely to have to convert to dbt operations while migrating a procedure:
+
+- `INSERT`
+- `UPDATE`
+- `DELETE`
+- `MERGE`
+
+Each of these can be addressed using various techniques in dbt. Handling `MERGE`s is a bit more involved than the rest, but can be handled effectively via dbt. The first three, however, are fairly simple to convert.
+
+## Map INSERTs
+
+An `INSERT` statement is functionally the same as using dbt to `SELECT` from an existing source or other dbt model. If you are faced with an `INSERT`-`SELECT` statement, the easiest way to convert the statement is to just create a new dbt model, and pull the `SELECT` portion of the `INSERT` statement out of the procedure and into the model. That’s basically it!
+
+To really break it down, let’s consider a simple example:
+
+```sql
+INSERT INTO returned_orders (order_id, order_date, total_return)
+
+SELECT order_id, order_date, total FROM orders WHERE type = 'return'
+```
+
+Converting this with a first pass to a [dbt model](/guides/bigquery?step=8) (in a file called `returned_orders.sql`) might look something like:
+
+```sql
+SELECT
+ order_id as order_id,
+ order_date as order_date,
+ total as total_return
+
+FROM {{ ref('orders') }}
+
+WHERE type = 'return'
+```
+
+Functionally, this would create a model (which could be materialized as a table or view depending on needs) called `returned_orders` that contains three columns (`order_id`, `order_date`, and `total_return`) predicated on the `type` column. It achieves the same end as the `INSERT`, just in a declarative fashion, using dbt.
+
+### **A note on `FROM` clauses**
+
+In dbt, using a hard-coded table or view name in a `FROM` clause is one of the most serious mistakes new users make. dbt uses the ref and source macros to discover the ordering that transformations need to execute in, and if you don’t use them, you’ll be unable to benefit from dbt’s built-in lineage generation and pipeline execution. In the sample code throughout the remainder of this article, we’ll use ref statements in the dbt-converted versions of SQL statements, but it is an exercise for the reader to ensure that those models exist in their dbt projects.
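+
+As a quick illustration (assuming a `stg_orders` model and a `jaffle_shop` source declared in a `.yml` file, both hypothetical here), the difference looks like this:
+
+```sql
+-- avoid: a hard-coded relation name gives dbt no lineage to work with
+-- select * from analytics.stg_orders
+
+-- prefer: let dbt resolve the relation and order the DAG
+select * from {{ ref('stg_orders') }}
+
+-- or, for raw tables loaded outside of dbt:
+-- select * from {{ source('jaffle_shop', 'orders') }}
+```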
+
+### **Sequential `INSERT`s to an existing table can be `UNION ALL`’ed together**
+
+Since dbt models effectively perform a single `CREATE TABLE AS SELECT` (or if you break it down into steps, `CREATE`, then an `INSERT`), you may run into complexities if there are multiple `INSERT` statements in your transformation that all insert data into the same table. Fortunately, this is a simple thing to handle in dbt. Effectively, the logic is performing a `UNION ALL` between the `INSERT` queries. If I have a transformation flow that looks something like (ignore the contrived nature of the scenario):
+
+```sql
+CREATE TABLE all_customers
+
+INSERT INTO all_customers SELECT * FROM us_customers
+
+INSERT INTO all_customers SELECT * FROM eu_customers
+```
+
+The dbt-ified version of this would end up looking something like:
+
+```sql
+SELECT * FROM {{ ref('us_customers') }}
+
+UNION ALL
+
+SELECT * FROM {{ ref('eu_customers') }}
+```
+
+The logic is functionally equivalent. So if there’s another statement that `INSERT`s into a model that I’ve already created, I can just add that logic into a second `SELECT` statement that is just `UNION ALL`'ed with the first. Easy!
+
+## Map UPDATEs
+
+`UPDATE`s start to increase the complexity of your transformations, but fortunately, they’re pretty darn simple to migrate, as well. The thought process that you go through when translating an `UPDATE` is quite similar to how an `INSERT` works, but the logic for the `SELECT` list in the dbt model is primarily sourced from the content in the `SET` section of the `UPDATE` statement. Let’s look at a simple example:
+
+```sql
+UPDATE orders
+
+SET type = 'return'
+
+WHERE total < 0
+```
+
+The way to look at this is similar to an `INSERT`-`SELECT` statement. The table being updated is the model you want to modify, and since this is an `UPDATE`, that model has likely already been created, and you can either:
+
+- add to it with subsequent transformations
+- create an intermediate model that builds off of the original model – perhaps naming it something like `int_[entity]_[verb].sql`.
+
+The `SELECT` list should contain all of the columns for the table, but for the specific columns being updated by the DML, you’ll use the computation on the right side of the equals sign as the `SELECT`ed value. Then, you can use the target column name on the left of the equals sign as the column alias.
+
+If I were building an intermediate transformation, the above query would translate to something along the lines of:
+
+```sql
+SELECT
+ CASE
+ WHEN total < 0 THEN 'return'
+ ELSE type
+ END AS type,
+
+ order_id,
+ order_date
+
+FROM {{ ref('stg_orders') }}
+```
+
+Since the `UPDATE` statement doesn’t modify every value of the `type` column, we use a `CASE` statement to apply the condition from the `WHERE` clause. We still want to select all of the columns that should end up in the target table. If we left one of the columns out, it wouldn’t be passed through to the target table at all due to dbt’s declarative approach.
+
+Sometimes, you may not be sure what all the columns are in a table, or in the situation as above, you’re only modifying a small number of columns relative to the total number of columns in the table. It can be cumbersome to list out every column in the table, but fortunately dbt contains some useful utility macros that can help list out the full column list of a table.
+
+Another way I could have written the model a bit more dynamically might be:
+
+```sql
+SELECT
+ {{ dbt_utils.star(from=ref('stg_orders'), except=['type']) }},
+ CASE
+ WHEN total < 0 THEN 'return'
+ ELSE type
+ END AS type
+
+FROM {{ ref('stg_orders') }}
+```
+
+The `dbt_utils.star()` macro will print out the full list of columns in the table, but skip the ones I’ve listed in the except list, which allows me to perform the same logic while writing fewer lines of code. This is a simple example of using dbt macros to simplify and shorten your code, and dbt can get a lot more sophisticated as you learn more techniques. Read more about the [dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and the [star macro](https://github.com/dbt-labs/dbt-utils/tree/0.8.6/#star-source).
+
+## Map DELETEs
+
+One of the biggest differences between a procedural transformation and how dbt models data is that dbt, in general, will never destroy data. While there are ways to execute hard `DELETE`s in dbt that are outside of the scope of this article, the general best practice for handling deleted data is to just use soft deletes, and filter out soft-deleted data in a final transformation.
+
+Let’s consider a simple example query:
+
+```sql
+DELETE FROM stg_orders WHERE order_status IS NULL
+```
+
+In a dbt model, you’ll need to first identify the records that should be deleted and then filter them out. There are really two primary ways you might translate this query:
+
+```sql
+SELECT * FROM {{ ref('stg_orders') }} WHERE order_status IS NOT NULL
+```
+
+This first approach just inverts the logic of the DELETE to describe the set of records that should remain, instead of the set of records that should be removed. This ties back to the way dbt declaratively describes datasets. You reference the data that should be in a dataset, and the table or view gets created with that set of data.
+
+Another way you could achieve this is by marking the deleted records, and then filtering them out. For example:
+
+```sql
+WITH
+
+soft_deletes AS (
+
+ SELECT
+ *,
+ CASE
+ WHEN order_status IS NULL THEN true
+ ELSE false
+ END AS to_delete
+
+ FROM {{ ref('stg_orders') }}
+
+)
+
+SELECT * FROM soft_deletes WHERE to_delete = false
+```
+
+This approach flags all of the deleted records, and the final `SELECT` filters out any deleted data, so the resulting table contains only the remaining records. It’s a lot more verbose than just inverting the `DELETE` logic, but for complex `DELETE` logic, this ends up being a very effective way of performing the `DELETE` that retains historical context.
+
+It’s worth calling out that while this doesn’t enable a hard delete, hard deletes can be executed a number of ways, the most common being to execute a dbt [macro](/docs/build/jinja-macros) as a [run-operation](https://docs.getdbt.com/reference/commands/run-operation), or by using a [post-hook](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook/) to perform a `DELETE` statement after the records to-be-deleted have been marked. These are advanced approaches outside the scope of this guide.
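+
+As a rough sketch of the run-operation approach (the macro name and the `stg_orders` target are hypothetical), you could wrap the `DELETE` in a macro and invoke it with `dbt run-operation`:
+
+```sql
+-- macros/hard_delete_null_status.sql (hypothetical example)
+-- invoke with: dbt run-operation hard_delete_null_status
+{% macro hard_delete_null_status() %}
+
+    {% set delete_sql %}
+        delete from {{ ref('stg_orders') }} where order_status is null
+    {% endset %}
+
+    {% do run_query(delete_sql) %}
+
+{% endmacro %}
+```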
+
+
+## Map MERGEs
+dbt has a concept called [materialization](/docs/build/materializations), which determines how a model is physically or logically represented in the warehouse. `INSERT`s, `UPDATE`s, and `DELETE`s will typically be accomplished using table or view materializations. For incremental workloads accomplished via commands like `MERGE` or `UPSERT`, dbt has a particular materialization called [incremental](/docs/build/incremental-models). The incremental materialization is specifically used to handle incremental loads and updates to a table without recreating the entire table from scratch on every run.
+
+### Step 1: Map the MERGE like an INSERT/UPDATE to start
+
+Before we get into the exact details of how to implement an incremental materialization, let’s talk about logic conversion. Extracting the logic of the `MERGE` and handling it as you would an `INSERT` or an `UPDATE` is the easiest way to get started migrating a `MERGE` command.
+
+To see how the logic conversion works, we’ll start with an example `MERGE`. In this scenario, imagine a ride sharing app where rides are loaded into a details table daily, and tips may be updated at some later date, and need to be kept up-to-date:
+
+```sql
+MERGE INTO ride_details USING (
+    SELECT
+        ride_id,
+        subtotal,
+        tip
+
+    FROM rides_to_load
+) AS rtl
+
+ON ride_details.ride_id = rtl.ride_id
+
+WHEN MATCHED THEN UPDATE
+
+    SET ride_details.tip = rtl.tip
+
+WHEN NOT MATCHED THEN INSERT (ride_id, subtotal, tip)
+    VALUES (rtl.ride_id, rtl.subtotal, NVL(rtl.tip, 0));
+```
+
+The content of the `USING` clause is a useful piece of code because that can easily be placed in a CTE as a starting point for handling the match statement. I find that the easiest way to break this apart is to treat each match statement as a separate CTE that builds on the previous match statements.
+
+We can ignore the `ON` clause for now, as that will only come into play once we get to a point where we’re ready to turn this into an incremental.
+
+As with `UPDATE`s and `INSERT`s, you can use the `SELECT` list and aliases to name columns appropriately for the target table, and `UNION` together `INSERT` statements (taking care to use `UNION`, rather than `UNION ALL` to avoid duplicates).
+
+The `MERGE` would end up translating to something like this:
+
+```sql
+WITH
+
+using_clause AS (
+
+ SELECT
+ ride_id,
+ subtotal,
+ tip
+
+ FROM {{ ref('rides_to_load') }}
+
+),
+
+updates AS (
+
+ SELECT
+ ride_id,
+ subtotal,
+ tip
+
+ FROM using_clause
+
+),
+
+inserts AS (
+
+ SELECT
+ ride_id,
+ subtotal,
+ NVL(tip, 0) AS tip
+
+ FROM using_clause
+
+)
+
+SELECT * FROM updates
+
+UNION
+
+SELECT * FROM inserts
+```
+
+To be clear, this transformation isn’t complete. The logic here is similar to the `MERGE`, but will not actually do the same thing, since the updates and inserts CTEs are both selecting from the same source query. We’ll need to ensure we grab the separate sets of data as we transition to the incremental materialization.
+
+One important caveat is that dbt does not natively support `DELETE` as a `MATCH` action. If you have a line in your `MERGE` statement that uses `WHEN MATCHED THEN DELETE`, you’ll want to treat it like an update and add a soft-delete flag, which is then filtered out in a follow-on transformation.
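+
+For example, if the original `MERGE` deleted matched rides under some condition (the negative-subtotal check below is a hypothetical stand-in for that condition), the `updates` CTE can carry a flag that a downstream transformation filters out:
+
+```sql
+WITH using_clause AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        tip
+
+    FROM {{ ref('rides_to_load') }}
+
+),
+
+updates AS (
+
+    SELECT
+        ride_id,
+        subtotal,
+        tip,
+        CASE
+            WHEN subtotal < 0 THEN true  -- hypothetical stand-in for the DELETE condition
+            ELSE false
+        END AS is_deleted
+
+    FROM using_clause
+
+)
+
+SELECT * FROM updates WHERE is_deleted = false
+```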
+
+### Step 2: Convert to incremental materialization
+
+As mentioned above, incremental materializations are a little special in that when the target table does not exist, the materialization functions in nearly the same way as a standard table materialization, and executes a `CREATE TABLE AS SELECT` statement. If the target table does exist, however, the materialization instead executes a `MERGE` statement.
+
+Since a `MERGE` requires a `JOIN` condition between the `USING` clause and the target table, we need a way to specify how dbt determines whether a record triggers a match. That particular piece of information is specified in the dbt model configuration.
+
+We can add the following `config()` block to the top of our model to specify how it should build incrementally:
+
+```sql
+{{
+ config(
+ materialized='incremental',
+ unique_key='ride_id',
+ incremental_strategy='merge'
+ )
+}}
+```
+
+The three configuration fields in this example are the most important ones.
+
+- Setting `materialized='incremental'` tells dbt to apply UPSERT logic to the target table.
+- The `unique_key` should be a primary key of the target table. This is used to match records with the existing table.
+- `incremental_strategy` is set to `merge`, so on incremental runs dbt merges the incoming batch into the target table, updating any existing rows whose `unique_key` matches and inserting the rest. There are [various incremental strategies](/docs/build/incremental-models#about-incremental_strategy) for different situations and warehouses. A rough sketch of the statement dbt generates with this configuration follows this list.
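+
+With this configuration in place, dbt writes and executes the merge for you on incremental runs. The sketch below is illustrative only: the exact SQL, internal aliases, and temporary relation names vary by adapter, and the `analytics` schema and `ride_details__dbt_tmp` name are placeholders, not literal dbt output.
+
+```sql
+-- illustrative only: dbt generates and runs this statement itself
+merge into analytics.ride_details as dbt_internal_dest
+using ride_details__dbt_tmp as dbt_internal_source
+on dbt_internal_source.ride_id = dbt_internal_dest.ride_id
+
+when matched then update set
+    subtotal = dbt_internal_source.subtotal,
+    tip = dbt_internal_source.tip
+
+when not matched then insert (ride_id, subtotal, tip)
+    values (
+        dbt_internal_source.ride_id,
+        dbt_internal_source.subtotal,
+        dbt_internal_source.tip
+    )
+```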
+
+The bulk of the work in converting a model to an incremental materialization comes in determining how the logic should change for incremental loads versus full backfills or initial loads. dbt offers a special macro, `is_incremental()`, which evaluates to `false` for initial loads or for backfills (called full refreshes in dbt parlance), but to `true` for incremental loads.
+
+This macro can be used to augment the model code to adjust how data is loaded for subsequent loads. How that logic should be added will depend a little bit on how data is received. Some common ways might be:
+
+1. The source table is truncated ahead of incremental loads, and only contains the data to be loaded in that increment.
+2. The source table contains all historical data, and there is a load timestamp column that identifies new data to be loaded.
+
+In the first case, the work is essentially done already. Since the source table always contains only the new data to be loaded, the query doesn’t have to change for incremental loads. The second case, however, requires the use of the `is_incremental()` macro to correctly handle the logic.
+
+Taking the converted `MERGE` statement that we’d put together previously, we’d augment it to add this additional logic:
+
+```sql
+WITH
+
+using_clause AS (
+
+ SELECT
+ ride_id,
+ subtotal,
+ tip,
+ load_timestamp
+
+ FROM {{ ref('rides_to_load') }}
+
+
+ {% if is_incremental() %}
+
+ WHERE load_timestamp > (SELECT max(load_timestamp) FROM {{ this }})
+
+ {% endif %}
+
+),
+
+updates AS (
+
+ SELECT
+ ride_id,
+ subtotal,
+ tip,
+ load_timestamp
+
+ FROM using_clause
+
+ {% if is_incremental() %}
+
+ WHERE ride_id IN (SELECT ride_id FROM {{ this }})
+
+ {% endif %}
+
+),
+
+inserts AS (
+
+ SELECT
+ ride_id,
+ subtotal,
+ NVL(tip, 0) AS tip,
+ load_timestamp
+
+ FROM using_clause
+
+ WHERE ride_id NOT IN (SELECT ride_id FROM updates)
+
+)
+
+SELECT * FROM updates UNION SELECT * FROM inserts
+```
+
+There are a couple important concepts to understand here:
+
+1. The code in the `is_incremental()` conditional block only executes for incremental executions of this model code. If the target table doesn’t exist, or if the `--full-refresh` option is used, that code will not execute.
+2. `{{ this }}` is a special keyword in dbt that when used in a Jinja block, self-refers to the model for which the code is executing. So if you have a model in a file called `my_incremental_model.sql`, `{{ this }}` will refer to `my_incremental_model` (fully qualified with database and schema name if necessary). By using that keyword, we can leverage the current state of the target table to inform the source query.
+
+
+## Migrate stored procedures
+
+The techniques shared above are useful ways to get started converting the individual DML statements that are often found in stored procedures. Using these types of patterns, legacy procedural code can be rapidly transitioned to dbt models that are much more readable, maintainable, and benefit from software engineering best practices like DRY principles. Additionally, once transformations are rewritten as dbt models, it becomes much easier to test the transformations to ensure that the data being used downstream is high-quality and trustworthy.
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md
deleted file mode 100644
index aae8b373b2c..00000000000
--- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures.md
+++ /dev/null
@@ -1,27 +0,0 @@
----
-title: Migrating from DDL, DML, and stored procedures
-id: 1-migrating-from-stored-procedures
----
-
-One of the more common situations that new dbt adopters encounter is a historical codebase of transformations written as a hodgepodge of DDL and DML statements, or stored procedures. Going from DML statements to dbt models is often a challenging hump for new users to get over, because the process involves a significant paradigm shift between a procedural flow of building a dataset (e.g. a series of DDL and DML statements) to a declarative approach to defining a dataset (e.g. how dbt uses SELECT statements to express data models). This guide aims to provide tips, tricks, and common patterns for converting DML statements to dbt models.
-
-## Preparing to migrate
-
-Before getting into the meat of conversion, it’s worth noting that DML statements will not always illustrate a comprehensive set of columns and column types that an original table might contain. Without knowing the DDL to create the table, it’s impossible to know precisely if your conversion effort is apples-to-apples, but you can generally get close.
-
-If your supports `SHOW CREATE TABLE`, that can be a quick way to get a comprehensive set of columns you’ll want to recreate. If you don’t have the DDL, but are working on a substantial stored procedure, one approach that can work is to pull column lists out of any DML statements that modify the table, and build up a full set of the columns that appear.
-
-As for ensuring that you have the right column types, since models materialized by dbt generally use `CREATE TABLE AS SELECT` or `CREATE VIEW AS SELECT` as the driver for object creation, tables can end up with unintended column types if the queries aren’t explicit. For example, if you care about `INT` versus `DECIMAL` versus `NUMERIC`, it’s generally going to be best to be explicit. The good news is that this is easy with dbt: you just cast the column to the type you intend.
-
-We also generally recommend that column renaming and type casting happen as close to the source tables as possible, typically in a layer of staging transformations, which helps ensure that future dbt modelers will know where to look for those transformations! See [How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) for more guidance on overall project structure.
-
-### Operations we need to map
-
-There are four primary DML statements that you are likely to have to convert to dbt operations while migrating a procedure:
-
-- `INSERT`
-- `UPDATE`
-- `DELETE`
-- `MERGE`
-
-Each of these can be addressed using various techniques in dbt. Handling `MERGE`s is a bit more involved than the rest, but can be handled effectively via dbt. The first three, however, are fairly simple to convert.
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md
deleted file mode 100644
index d8f31a0f14a..00000000000
--- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md
+++ /dev/null
@@ -1,57 +0,0 @@
----
-title: Inserts
-id: 2-inserts
----
-
-An `INSERT` statement is functionally the same as using dbt to `SELECT` from an existing source or other dbt model. If you are faced with an `INSERT`-`SELECT` statement, the easiest way to convert the statement is to just create a new dbt model, and pull the `SELECT` portion of the `INSERT` statement out of the procedure and into the model. That’s basically it!
-
-To really break it down, let’s consider a simple example:
-
-```sql
-INSERT INTO returned_orders (order_id, order_date, total_return)
-
-SELECT order_id, order_date, total FROM orders WHERE type = 'return'
-```
-
-Converting this with a first pass to a [dbt model](/quickstarts/bigquery?step=8) (in a file called returned_orders.sql) might look something like:
-
-```sql
-SELECT
- order_id as order_id,
- order_date as order_date,
- total as total_return
-
-FROM {{ ref('orders') }}
-
-WHERE type = 'return'
-```
-
-Functionally, this would create a model (which could be materialized as a table or view depending on needs) called `returned_orders` that contains three columns: `order_id`, `order_date`, `total_return`) predicated on the type column. It achieves the same end as the `INSERT`, just in a declarative fashion, using dbt.
-
-## **A note on `FROM` clauses**
-
-In dbt, using a hard-coded table or view name in a `FROM` clause is one of the most serious mistakes new users make. dbt uses the ref and source macros to discover the ordering that transformations need to execute in, and if you don’t use them, you’ll be unable to benefit from dbt’s built-in lineage generation and pipeline execution. In the sample code throughout the remainder of this article, we’ll use ref statements in the dbt-converted versions of SQL statements, but it is an exercise for the reader to ensure that those models exist in their dbt projects.
-
-## **Sequential `INSERT`s to an existing table can be `UNION ALL`’ed together**
-
-Since dbt models effectively perform a single `CREATE TABLE AS SELECT` (or if you break it down into steps, `CREATE`, then an `INSERT`), you may run into complexities if there are multiple `INSERT` statements in your transformation that all insert data into the same table. Fortunately, this is a simple thing to handle in dbt. Effectively, the logic is performing a `UNION ALL` between the `INSERT` queries. If I have a transformation flow that looks something like (ignore the contrived nature of the scenario):
-
-```sql
-CREATE TABLE all_customers
-
-INSERT INTO all_customers SELECT * FROM us_customers
-
-INSERT INTO all_customers SELECT * FROM eu_customers
-```
-
-The dbt-ified version of this would end up looking something like:
-
-```sql
-SELECT * FROM {{ ref('us_customers') }}
-
-UNION ALL
-
-SELECT * FROM {{ ref('eu_customers') }}
-```
-
-The logic is functionally equivalent. So if there’s another statement that `INSERT`s into a model that I’ve already created, I can just add that logic into a second `SELECT` statement that is just `UNION ALL`'ed with the first. Easy!
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/3-mapping-updates.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/3-mapping-updates.md
deleted file mode 100644
index b6f0874fb6b..00000000000
--- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/3-mapping-updates.md
+++ /dev/null
@@ -1,55 +0,0 @@
----
-title: Updates
-id: 3-updates
----
-
-`UPDATE`s start to increase the complexity of your transformations, but fortunately, they’re pretty darn simple to migrate, as well. The thought process that you go through when translating an `UPDATE` is quite similar to how an `INSERT` works, but the logic for the `SELECT` list in the dbt model is primarily sourced from the content in the `SET` section of the `UPDATE` statement. Let’s look at a simple example:
-
-```sql
-UPDATE orders
-
-SET type = 'return'
-
-WHERE total < 0
-```
-
-The way to look at this is similar to an `INSERT`-`SELECT` statement. The table being updated is the model you want to modify, and since this is an `UPDATE`, that model has likely already been created, and you can either:
-
-- add to it with subsequent transformations
-- create an intermediate model that builds off of the original model – perhaps naming it something like `int_[entity]_[verb].sql`.
-
-The `SELECT` list should contain all of the columns for the table, but for the specific columns being updated by the DML, you’ll use the computation on the right side of the equals sign as the `SELECT`ed value. Then, you can use the target column name on the left of the equals sign as the column alias.
-
-If I were building an intermediate transformation, the above query would translate to something along the lines of:
-
-```sql
-SELECT
- CASE
- WHEN total < 0 THEN 'return'
- ELSE type
- END AS type,
-
- order_id,
- order_date
-
-FROM {{ ref('stg_orders') }}
-```
-
-Since the `UPDATE` statement doesn’t modify every value of the `type` column, we use a `CASE` statement to apply the logic of the `UPDATE`’s `WHERE` clause. We still want to select all of the columns that should end up in the target table. If we left one of the columns out, it wouldn’t be passed through to the target table at all due to dbt’s declarative approach.
-
-Sometimes, you may not be sure what all the columns in a table are, or, as in the situation above, you’re only modifying a small number of columns relative to the total number of columns in the table. It can be cumbersome to list out every column in the table, but fortunately the dbt_utils package contains some useful utility macros that can help generate the full column list of a table.
-
-Another way I could have written the model a bit more dynamically might be:
-
-```sql
-SELECT
- {{ dbt_utils.star(from=ref('stg_orders'), except=['type']) }},
- CASE
- WHEN total < 0 THEN 'return'
- ELSE type
- END AS type
-
-FROM {{ ref('stg_orders') }}
-```
-
-The `dbt_utils.star()` macro will print out the full list of columns in the table, but skip the ones I’ve listed in the except list, which allows me to perform the same logic while writing fewer lines of code. This is a simple example of using dbt macros to simplify and shorten your code, and dbt can get a lot more sophisticated as you learn more techniques. Read more about the [dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and the [star macro](https://github.com/dbt-labs/dbt-utils/tree/0.8.6/#star-source).
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md
deleted file mode 100644
index 1a8c6435d42..00000000000
--- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md
+++ /dev/null
@@ -1,45 +0,0 @@
----
-title: Deletes
-id: 4-deletes
----
-
-One of the biggest differences between a procedural transformation and how dbt models data is that dbt, in general, will never destroy data. While there are ways to execute hard `DELETE`s in dbt that are outside of the scope of this article, the general best practice for handling deleted data is to just use soft deletes, and filter out soft-deleted data in a final transformation.
-
-Let’s consider a simple example query:
-
-```sql
-DELETE FROM stg_orders WHERE order_status IS NULL
-```
-
-In a dbt model, you’ll need to first identify the records that should be deleted and then filter them out. There are really two primary ways you might translate this query:
-
-```sql
-SELECT * FROM {{ ref('stg_orders') }} WHERE order_status IS NOT NULL
-```
-
-This first approach just inverts the logic of the DELETE to describe the set of records that should remain, instead of the set of records that should be removed. This ties back to the way dbt declaratively describes datasets. You reference the data that should be in a dataset, and the table or view gets created with that set of data.
-
-Another way you could achieve this is by marking the deleted records, and then filtering them out. For example:
-
-```sql
-WITH
-
-soft_deletes AS (
-
- SELECT
- *,
- CASE
- WHEN order_status IS NULL THEN true
- ELSE false
- END AS to_delete
-
- FROM {{ ref('stg_orders') }}
-
-)
-
-SELECT * FROM soft_deletes WHERE to_delete = false
-```
-
-This approach flags all of the deleted records, and the final `SELECT` filters out any deleted data, so the resulting table contains only the remaining records. It’s a lot more verbose than just inverting the `DELETE` logic, but for complex `DELETE` logic, this ends up being a very effective way of performing the `DELETE` that retains historical context.
-
-It’s worth calling out that while this doesn’t enable a hard delete, hard deletes can be executed in a number of ways, the most common being to execute a dbt [macro](/docs/build/jinja-macros) as a [run-operation](https://docs.getdbt.com/reference/commands/run-operation), or to use a [post-hook](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook/) to perform a `DELETE` statement after the records to be deleted have been marked. These are advanced approaches outside the scope of this guide.
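-
-As a rough sketch of the post-hook approach (illustrative only; it assumes the model exposes a boolean `to_delete` flag like the one above and that your warehouse accepts this `DELETE` syntax):
-
-```sql
-{{
-    config(
-        materialized='table',
-        post_hook="DELETE FROM {{ this }} WHERE to_delete"
-    )
-}}
-
-SELECT
-    *,
-    -- flag the rows that the original DELETE would have removed
-    order_status IS NULL AS to_delete
-
-FROM {{ ref('stg_orders') }}
-```
-
-The post-hook runs after the table is built, so the flagged rows exist only momentarily before being removed; a run-operation macro works similarly but is invoked manually or on a schedule.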
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md
deleted file mode 100644
index d059ab9a258..00000000000
--- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md
+++ /dev/null
@@ -1,184 +0,0 @@
----
-title: Merges
-id: 5-merges
----
-
-dbt has a concept called [materialization](/docs/build/materializations), which determines how a model is physically or logically represented in the warehouse. `INSERT`s, `UPDATE`s, and `DELETE`s will typically be accomplished using table or view materializations. For incremental workloads accomplished via commands like `MERGE` or `UPSERT`, dbt has a particular materialization called [incremental](/docs/build/incremental-models). The incremental materialization is specifically used to handle incremental loads and updates to a table without recreating the entire table from scratch on every run.
-
-## Step 1: Map the MERGE like an INSERT/UPDATE to start
-
-Before we get into the exact details of how to implement an incremental materialization, let’s talk about logic conversion. Extracting the logic of the `MERGE` and handling it as you would an `INSERT` or an `UPDATE` is the easiest way to get started migrating a `MERGE` command.
-
-To see how the logic conversion works, we’ll start with an example `MERGE`. In this scenario, imagine a ride sharing app where rides are loaded into a details table daily, and tips may be updated at some later date, and need to be kept up-to-date:
-
-```sql
-MERGE INTO ride_details USING (
- SELECT
-  ride_id,
-  subtotal,
-  tip
-
- FROM rides_to_load
-) AS rtl
-
-ON ride_details.ride_id = rtl.ride_id
-
-WHEN MATCHED THEN UPDATE
-SET ride_details.tip = rtl.tip
-
-WHEN NOT MATCHED THEN INSERT (ride_id, subtotal, tip)
-VALUES (rtl.ride_id, rtl.subtotal, NVL(rtl.tip, 0));
-```
-
-The content of the `USING` clause is a useful piece of code because that can easily be placed in a CTE as a starting point for handling the match statement. I find that the easiest way to break this apart is to treat each match statement as a separate CTE that builds on the previous match statements.
-
-We can ignore the `ON` clause for now, as that will only come into play once we get to a point where we’re ready to turn this into an incremental.
-
-As with `UPDATE`s and `INSERT`s, you can use the `SELECT` list and aliases to name columns appropriately for the target table, and `UNION` together `INSERT` statements (taking care to use `UNION`, rather than `UNION ALL` to avoid duplicates).
-
-The `MERGE` would end up translating to something like this:
-
-```sql
-WITH
-
-using_clause AS (
-
- SELECT
- ride_id,
- subtotal,
- tip
-
- FROM {{ ref('rides_to_load') }}
-
-),
-
-updates AS (
-
- SELECT
- ride_id,
- subtotal,
- tip
-
- FROM using_clause
-
-),
-
-inserts AS (
-
- SELECT
- ride_id,
- subtotal,
- NVL(tip, 0) AS tip
-
- FROM using_clause
-
-)
-
-SELECT * FROM updates
-
-UNION
-
-SELECT * FROM inserts
-```
-
-To be clear, this transformation isn’t complete. The logic here is similar to the `MERGE`, but will not actually do the same thing, since the updates and inserts CTEs are both selecting from the same source query. We’ll need to ensure we grab the separate sets of data as we transition to the incremental materialization.
-
-One important caveat is that dbt does not natively support `DELETE` as a `MATCH` action. If you have a line in your `MERGE` statement that uses `WHEN MATCHED THEN DELETE`, you’ll want to treat it like an update and add a soft-delete flag, which is then filtered out in a follow-on transformation.
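-
-For example, a `WHEN MATCHED THEN DELETE` branch might be sketched like this (illustrative only; `rides_to_cancel` is a hypothetical model holding the rides the `MERGE` would have deleted):
-
-```sql
-WITH using_clause AS (
-
-    SELECT
-        ride_id,
-        subtotal,
-        tip
-
-    FROM {{ ref('rides_to_load') }}
-
-),
-
-updates AS (
-
-    SELECT
-        u.ride_id,
-        u.subtotal,
-        u.tip,
-        -- rows the MERGE would have deleted are only flagged here, not removed
-        c.ride_id IS NOT NULL AS to_delete
-
-    FROM using_clause AS u
-    LEFT JOIN {{ ref('rides_to_cancel') }} AS c
-        ON u.ride_id = c.ride_id
-
-)
-
--- the final SELECT (or a follow-on model) filters the flagged rows out
-SELECT * FROM updates WHERE NOT to_delete
-```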
-
-## Step 2: Convert to incremental materialization
-
-As mentioned above, incremental materializations are a little special in that when the target table does not exist, the materialization functions in nearly the same way as a standard table materialization, and executes a `CREATE TABLE AS SELECT` statement. If the target table does exist, however, the materialization instead executes a `MERGE` statement.
-
-Since a `MERGE` requires a `JOIN` condition between the `USING` clause and the target table, we need a way to specify how dbt determines whether a record triggers a match. That particular piece of information is specified in the dbt model configuration.
-
-We can add the following `config()` block to the top of our model to specify how it should build incrementally:
-
-```sql
-{{
- config(
- materialized='incremental',
- unique_key='ride_id',
- incremental_strategy='merge'
- )
-}}
-```
-
-The three configuration fields in this example are the most important ones.
-
-- Setting `materialized='incremental'` tells dbt to apply UPSERT logic to the target table.
-- The `unique_key` should be a primary key of the target table. This is used to match records with the existing table.
-- `incremental_strategy` is set to `merge`, which tells dbt to merge incoming rows into any existing rows in the target table whose `unique_key` matches (sketched below). There are [various incremental strategies](/docs/build/incremental-models#about-incremental_strategy) for different situations and warehouses.
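-
-For intuition, with the merge strategy an incremental run produces a statement roughly of this shape behind the scenes (a simplified sketch, not the exact SQL dbt emits; the table and alias names are illustrative):
-
-```sql
--- dest is the existing target table, src is the freshly built batch of new data
-MERGE INTO prod.ride_details AS dest
-USING ride_details__dbt_tmp AS src
-    ON dest.ride_id = src.ride_id
-
-WHEN MATCHED THEN UPDATE SET
-    subtotal = src.subtotal,
-    tip = src.tip
-
-WHEN NOT MATCHED THEN INSERT (ride_id, subtotal, tip)
-    VALUES (src.ride_id, src.subtotal, src.tip);
-```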
-
-The bulk of the work in converting a model to an incremental materialization comes in determining how the logic should change for incremental loads versus full backfills or initial loads. dbt offers a special macro, `is_incremental()`, which evaluates to false for initial loads or for backfills (called full refreshes in dbt parlance), and to true for incremental loads.
-
-This macro can be used to augment the model code to adjust how data is loaded for subsequent loads. How that logic should be added will depend a little bit on how data is received. Some common ways might be:
-
-1. The source table is truncated ahead of incremental loads, and only contains the data to be loaded in that increment.
-2. The source table contains all historical data, and there is a load timestamp column that identifies new data to be loaded.
-
-In the first case, the work is essentially done already. Since the source table always contains only the new data to be loaded, the query doesn’t have to change for incremental loads. The second case, however, requires the use of the `is_incremental()` macro to correctly handle the logic.
-
-Taking the converted `MERGE` statement that we’d put together previously, we’d augment it to add this additional logic:
-
-```sql
-WITH
-
-using_clause AS (
-
- SELECT
- ride_id,
- subtotal,
- tip,
- load_timestamp
-
- FROM {{ ref('rides_to_load') }}
-
-
- {% if is_incremental() %}
-
- WHERE load_timestamp > (SELECT max(load_timestamp) FROM {{ this }})
-
- {% endif %}
-
-),
-
-updates AS (
-
- SELECT
- ride_id,
- subtotal,
- tip,
- load_timestamp
-
- FROM using_clause
-
- {% if is_incremental() %}
-
- WHERE ride_id IN (SELECT ride_id FROM {{ this }})
-
- {% endif %}
-
-),
-
-inserts AS (
-
- SELECT
- ride_id,
- subtotal,
- NVL(tip, 0) AS tip,
- load_timestamp
-
- FROM using_clause
-
- WHERE ride_id NOT IN (SELECT ride_id FROM updates)
-
-)
-
-SELECT * FROM updates UNION SELECT * FROM inserts
-```
-
-There are a couple important concepts to understand here:
-
-1. The code in the `is_incremental()` conditional block only executes for incremental executions of this model code. If the target table doesn’t exist, or if the `--full-refresh` option is used, that code will not execute.
-2. `{{ this }}` is a special keyword in dbt that, when used in a Jinja block, refers to the model in which the code is executing. So if you have a model in a file called `my_incremental_model.sql`, `{{ this }}` will refer to `my_incremental_model` (fully qualified with database and schema name if necessary). By using that keyword, we can leverage the current state of the target table to inform the source query.
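-
-As a minimal illustration (the database and schema names here are hypothetical), a query against `{{ this }}` inside `my_incremental_model.sql`:
-
-```sql
--- inside models/my_incremental_model.sql (illustrative)
-SELECT max(load_timestamp) AS high_watermark
-FROM {{ this }}
-```
-
-compiles to something along the lines of `SELECT max(load_timestamp) AS high_watermark FROM analytics.prod.my_incremental_model`.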
diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion.md
deleted file mode 100644
index 6fddf15c163..00000000000
--- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/6-migrating-from-stored-procedures-conclusion.md
+++ /dev/null
@@ -1,6 +0,0 @@
----
-title: Putting it all together
-id: 6-migrating-from-stored-procedures-conclusion
----
-
-The techniques shared above are useful ways to get started converting the individual DML statements that are often found in stored procedures. Using these types of patterns, legacy procedural code can be rapidly transitioned to dbt models that are much more readable, maintainable, and benefit from software engineering best practices like DRY principles. Additionally, once transformations are rewritten as dbt models, it becomes much easier to test the transformations to ensure that the data being used downstream is high-quality and trustworthy.
diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md
deleted file mode 100644
index a377554c317..00000000000
--- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md
+++ /dev/null
@@ -1,55 +0,0 @@
----
-title: Airflow and dbt Cloud
-id: 1-airflow-and-dbt-cloud
----
-
-In some cases, [Airflow](https://airflow.apache.org/) may be the preferred orchestrator for your organization over working fully within dbt Cloud. There are a few reasons your team might be considering using Airflow to orchestrate your dbt jobs:
-
-- Your team is already using Airflow to orchestrate other processes
-- Your team needs to ensure that a [dbt job](https://docs.getdbt.com/docs/dbt-cloud/cloud-overview#schedule-and-run-dbt-jobs-in-production) kicks off before or after another process outside of dbt Cloud
-- Your team needs flexibility to manage more complex scheduling, such as kicking off one dbt job only after another has completed
-- Your team wants to own their own orchestration solution
-- You need code to work right now without starting from scratch
-
-## How are people using Airflow + dbt today?
-
-### Airflow + dbt Core
-
-There are many great examples from GitLab through their open source data engineering work, such as [this DAG](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py). This approach is especially appropriate if you are well-versed in Kubernetes, CI/CD, and Docker task management when building your Airflow pipelines. If this describes you and your team, you’re in good hands reading through more details [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/).
-
-### Airflow + dbt Cloud API w/Custom Scripts
-
-This approach served as a bridge until the Astronomer + dbt Labs-built dbt Cloud provider became [generally available](https://registry.astronomer.io/providers/dbt-cloud?type=Sensors&utm_campaign=Monthly%20Product%20Updates&utm_medium=email&_hsmi=208603877&utm_content=208603877&utm_source=hs_email).
-
-There are many different permutations of this over time:
-
-- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an Airflow DAG based on custom Python API utilities [here](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py)
-- [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want
-- [Other ways to run dbt in airflow](/docs/deploy/deployments#airflow): Official dbt Docs on how teams are running dbt in airflow
-
-## This guide's process
-
-These solutions are great, but they can become difficult to trust as your team grows and the management of things like testing, job definitions, secrets, and pipelines increases past your team’s capacity. Roles become blurry (or were never clearly defined at the start!). Both data and analytics engineers start digging through custom logging within each other’s workflows to make heads or tails of where and what the issue really is. Not to mention that when the issue is found, it can be even harder to decide on the best path forward for safely implementing fixes. This complex workflow and unclear delineation of process management results in a lot of misunderstandings and wasted time just trying to get the process to work smoothly!
-
-### A better way
-
-After today’s walkthrough, you’ll get hands-on experience:
-
-1. Creating a working local Airflow environment
-2. Invoking a dbt Cloud job with Airflow (with proof!)
-3. Reusing tested and trusted Airflow code for your specific use cases
-
-While you’re learning the ropes, you’ll also gain a better understanding of how this helps to:
-
-- Reduce the cognitive load when building and maintaining pipelines
-- Avoid dependency hell (think: `pip install` conflicts)
-- Implement better recoveries from failures
-- Define clearer workflows so that data and analytics engineers work better, together ♥️
-
-### Prerequisites
-
-- [dbt Cloud Teams or Enterprise account](https://www.getdbt.com/pricing/) (with [admin access](https://docs.getdbt.com/docs/cloud/manage-access/enterprise-permissions)) in order to create a service token. Permissions for service tokens can be found [here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens).
-- A [free Docker account](https://hub.docker.com/signup) in order to sign in to Docker Desktop, which will be installed in the initial setup.
-- A local digital scratchpad for temporarily copy-pasting API keys and URLs
-
-🙌 Let’s get started! 🙌
diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md
deleted file mode 100644
index 9c3b8eb7f1b..00000000000
--- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md
+++ /dev/null
@@ -1,90 +0,0 @@
----
-title: Setting up Airflow and dbt Cloud
-id: 2-setting-up-airflow-and-dbt-cloud
----
-
-## 1. Install the Astro CLI
-
-Astro is a managed software service that includes key features for teams working with Airflow. In order to use Astro, we’ll install the Astro CLI, which will give us access to useful commands for working with Airflow locally. You can read more about Astro [here](https://docs.astronomer.io/astro/).
-
-In this example, we’re using Homebrew to install Astro CLI. Follow the instructions to install the Astro CLI for your own operating system [here](https://docs.astronomer.io/astro/install-cli).
-
-```bash
-brew install astro
-```
-
-
-
-## 2. Install and start Docker Desktop
-
-Docker allows us to spin up an environment with all the apps and dependencies we need for the example.
-
-Follow the instructions [here](https://docs.docker.com/desktop/) to install Docker desktop for your own operating system. Once Docker is installed, ensure you have it up and running for the next steps.
-
-
-
-## 3. Clone the airflow-dbt-cloud repository
-
-Open your terminal and clone the [airflow-dbt-cloud repository](https://github.com/sungchun12/airflow-dbt-cloud.git). This contains example Airflow DAGs that you’ll use to orchestrate your dbt Cloud job. Once cloned, navigate into the `airflow-dbt-cloud` project.
-
-```bash
-git clone https://github.com/sungchun12/airflow-dbt-cloud.git
-cd airflow-dbt-cloud
-```
-
-
-
-## 4. Start the Docker container
-
-You can initialize an Astronomer project in an empty local directory using a Docker container, and then run your project locally using the `start` command.
-
-1. Run the following commands to initialize your project and start your local Airflow deployment:
-
- ```bash
- astro dev init
- astro dev start
- ```
-
- When this finishes, you should see a message similar to the following:
-
- ```bash
- Airflow is starting up! This might take a few minutes…
-
- Project is running! All components are now available.
-
- Airflow Webserver: http://localhost:8080
- Postgres Database: localhost:5432/postgres
- The default Airflow UI credentials are: admin:admin
- The default Postgres DB credentials are: postgres:postgres
- ```
-
-2. Open the Airflow interface. Launch your web browser and navigate to the address for the **Airflow Webserver** from your output in Step 1.
-
- This will take you to your local instance of Airflow. You’ll need to log in with the **default credentials**:
-
- - Username: admin
- - Password: admin
-
- ![Airflow login screen](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-login.png)
-
-
-
-## 5. Create a dbt Cloud service token
-
-Create a service token from within dbt Cloud using the instructions [found here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). Ensure that you save a copy of the token, as you won’t be able to access this later. In this example we use `Account Admin`, but you can also use `Job Admin` instead for token permissions.
-
-
-
-## 6. Create a dbt Cloud job
-
-In your dbt Cloud account create a job, paying special attention to the information in the bullets below. Additional information for creating a dbt Cloud job can be found [here](/quickstarts/bigquery).
-
-- Configure the job with the commands that you want to include when this job kicks off. Airflow will refer to the job’s configuration for these commands rather than having them explicitly coded in the Airflow DAG. This job will run a set of commands rather than a single command.
-- Ensure that the schedule is turned **off** since we’ll be using Airflow to kick things off.
-- Once you hit `save` on the job, make sure you copy the URL and save it for reference later. The URL will look similar to this:
-
-```html
-https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/
-```
-
-
diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud.md
deleted file mode 100644
index d6fd32bdba9..00000000000
--- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/3-running-airflow-and-dbt-cloud.md
+++ /dev/null
@@ -1,104 +0,0 @@
----
-title: Running Airflow and dbt Cloud
-id: 3-running-airflow-and-dbt-cloud
----
-
-
-
-Now you have all the working pieces to get up and running with Airflow + dbt Cloud. Let’s dive into making this all work together. We will **set up a connection** and **run a DAG in Airflow** that kicks off a dbt Cloud job.
-
-## 1. Add your dbt Cloud API token as a secure connection
-
-1. Navigate to Admin and click on **Connections**
-
- ![Airflow connections menu](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-connections-menu.png)
-
-2. Click on the `+` sign to add a new connection, then click on the drop down to search for the dbt Cloud Connection Type
-
- ![Create connection](/img/guides/orchestration/airflow-and-dbt-cloud/create-connection.png)
-
- ![Connection type](/img/guides/orchestration/airflow-and-dbt-cloud/connection-type.png)
-
-3. Add in your connection details and your default dbt Cloud account id. This is found in your dbt Cloud URL after the accounts route section (`/accounts/{YOUR_ACCOUNT_ID}`), for example the account with id 16173 would see this in their URL: `https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/`
-
-![https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2](https://lh3.googleusercontent.com/sRxe5xbv_LYhIKblc7eiY7AmByr1OibOac2_fIe54rpU3TBGwjMpdi_j0EPEFzM1_gNQXry7Jsm8aVw9wQBSNs1I6Cyzpvijaj0VGwSnmVf3OEV8Hv5EPOQHrwQgK2RhNBdyBxN2)
-
-## 2. Add your `job_id` and `account_id` config details to the python file: [dbt_cloud_provider_eltml.py](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/dags/dbt_cloud_provider_eltml.py)
-
-1. You’ll find these details within the dbt Cloud job URL, see the comments in the code snippet below for an example.
-
- ```python
- # dbt Cloud Job URL: https://cloud.getdbt.com/#/accounts/16173/projects/36467/jobs/65767/
- # account_id: 16173
- #job_id: 65767
-
- # line 28
- default_args={"dbt_cloud_conn_id": "dbt_cloud", "account_id": 16173},
-
- trigger_dbt_cloud_job_run = DbtCloudRunJobOperator(
- task_id="trigger_dbt_cloud_job_run",
- job_id=65767, # line 39
- check_interval=10,
- timeout=300,
- )
- ```
-
-2. Turn on the DAG and verify the job succeeded after running. Note: screenshots taken from different job runs, but the user experience is consistent.
-
- ![https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy](https://lh6.googleusercontent.com/p8AqQRy0UGVLjDGPmcuGYmQ_BRodyL0Zis-eQgSmp69EHbKW51o4S-bCl1fXHlOmwpYEBxD0A-O1Q1hwt-VDVMO1wWH-AIeaoelBx06JXRJ0m1OcHaPpFKH0xDiduIhNlQhhbLiy)
-
- ![Airflow DAG](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-dag.png)
-
- ![Task run instance](/img/guides/orchestration/airflow-and-dbt-cloud/task-run-instance.png)
-
- ![https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS](https://lh6.googleusercontent.com/S9QdGhLAdioZ3x634CChugsJRiSVtTTd5CTXbRL8ADA6nSbAlNn4zV0jb3aC946c8SGi9FRTfyTFXqjcM-EBrJNK5hQ0HHAsR5Fj7NbdGoUfBI7xFmgeoPqnoYpjyZzRZlXkjtxS)
-
-## How do I rerun the dbt Cloud job and downstream tasks in my pipeline?
-
-If you have worked with dbt Cloud before, you have likely encountered cases where a job fails. In those cases, you have likely logged into dbt Cloud, investigated the error, and then manually restarted the job.
-
-This section of the guide will show you how to restart the job directly from Airflow. This will specifically run *just* the `trigger_dbt_cloud_job_run` and downstream tasks of the Airflow DAG and not the entire DAG. If only the transformation step fails, you don’t need to re-run the extract and load processes. Let’s jump into how to do that in Airflow.
-
-1. Click on the task
-
- ![Task DAG view](/img/guides/orchestration/airflow-and-dbt-cloud/task-dag-view.png)
-
-2. Clear the task instance
-
- ![Clear task instance](/img/guides/orchestration/airflow-and-dbt-cloud/clear-task-instance.png)
-
- ![Approve clearing](/img/guides/orchestration/airflow-and-dbt-cloud/approve-clearing.png)
-
-3. Watch it rerun in real time
-
- ![Re-run](/img/guides/orchestration/airflow-and-dbt-cloud/re-run.png)
-
-## Cleaning up
-
-At the end of this guide, make sure you shut down your Docker containers. When you’re done using Airflow, use the following command to stop the containers:
-
-```bash
-$ astro dev stop
-
-[+] Running 3/3
- ⠿ Container airflow-dbt-cloud_e3fe3c-webserver-1 Stopped 7.5s
- ⠿ Container airflow-dbt-cloud_e3fe3c-scheduler-1 Stopped 3.3s
- ⠿ Container airflow-dbt-cloud_e3fe3c-postgres-1 Stopped 0.3s
-```
-
-To verify that the deployment has stopped, use the following command:
-
-```bash
-astro dev ps
-```
-
-This should give you an output like this:
-
-```bash
-Name State Ports
-airflow-dbt-cloud_e3fe3c-webserver-1 exited
-airflow-dbt-cloud_e3fe3c-scheduler-1 exited
-airflow-dbt-cloud_e3fe3c-postgres-1 exited
-```
-
-
diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md
deleted file mode 100644
index 5766d8c0b79..00000000000
--- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md
+++ /dev/null
@@ -1,50 +0,0 @@
----
-title: Airflow and dbt Cloud FAQs
-id: 4-airflow-and-dbt-cloud-faqs
----
-## 1. How can we run specific subsections of the dbt DAG in Airflow?
-
-Because of the way we configured the dbt Cloud job to run in Airflow, you can leave this job to your analytics engineers to define in the job configurations from dbt Cloud. If, for example, we need to run hourly-tagged models every hour and daily-tagged models daily, we can create jobs like `Hourly Run` or `Daily Run` and utilize the commands `dbt run -s tag:hourly` and `dbt run -s tag:daily` within each, respectively. We only need to grab our dbt Cloud `account` and `job id`, configure it in an Airflow DAG with the code provided, and then we’re on our way. See more node selection options [here](/reference/node-selection/syntax).
-
-## 2. How can I re-run models from the point of failure?
-
-You may want to parse the dbt DAG in Airflow to get the benefit of re-running from the point of failure. However, when you have hundreds of models in your DAG expanded out, it becomes useless for diagnosis and rerunning due to the overhead that comes along with creating an expansive Airflow DAG.
-
-You can’t re-run from failure natively in dbt Cloud today (feature coming!), but you can use a custom rerun parser.
-
-Using a simple python script coupled with the dbt Cloud provider, you can:
-
-- Avoid managing artifacts in a separate storage bucket (dbt Cloud does this for you)
-- Avoid building your own parsing logic
-- Get clear logs on what models you're rerunning in dbt Cloud (without hard coding step override commands)
-
-Watch the video below to see how it works!
-
-
-
-## 3. Should Airflow run one big dbt job or many dbt jobs?
-
-Overall we recommend being as purposeful and minimalistic as you can. This is because dbt manages all of the dependencies between models and the orchestration of running those dependencies in order, which in turn reduces warehouse processing effort.
-
-## 4. We want to kick off our dbt jobs after our ingestion tool (such as Fivetran) / data pipelines are done loading data. Any best practices around that?
-
-Our friends at Astronomer answer this question with this example: [here](https://registry.astronomer.io/dags/fivetran-dbt-cloud-census)
-
-## 5. How do you set up a CI/CD workflow with Airflow?
-
-Check out these two resources for accomplishing your own CI/CD pipeline:
-
-- [Continuous Integration with dbt Cloud](/docs/deploy/continuous-integration)
-- [Astronomer's CI/CD Example](https://docs.astronomer.io/software/ci-cd/#example-cicd-workflow)
-
-## 6. Can dbt dynamically create tasks in the DAG like Airflow can?
-
-We prefer to keep models bundled vs. unbundled. You can go this route, but if you have hundreds of dbt models, it’s more effective to let the dbt Cloud job handle the models and dependencies. Bundling also provides clearer observability when things go wrong - we’ve seen more success in being able to clearly see issues in a bundled dbt Cloud job than in combing through the nodes of an expansive Airflow DAG. If you still have a use case for this level of control though, our friends at Astronomer answer this question [here](https://www.astronomer.io/blog/airflow-dbt-1/)!
-
-## 7. Can you trigger notifications if a dbt job fails with Airflow? Is there any way to access the status of the dbt Job to do that?
-
-Yes, either through [Airflow's email/Slack](https://www.astronomer.io/guides/error-notifications-in-airflow/) functionality by itself or combined with [dbt Cloud's notifications](/docs/deploy/job-notifications), which support email and Slack notifications.
-
-## 8. Are there decision criteria for how to best work with dbt Cloud and Airflow?
-
-Check out this deep dive into planning your dbt Cloud + Airflow implementation [here](https://www.youtube.com/watch?v=n7IIThR8hGk)!
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md b/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md
deleted file mode 100644
index 048fe637de0..00000000000
--- a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md
+++ /dev/null
@@ -1,45 +0,0 @@
----
-title: Customizing CI/CD
-id: 1-cicd-background
----
-
-# Creating Custom CI/CD Pipelines
-
-One of the core tenets of dbt is that analytic code should be version controlled. This provides a ton of benefit to your organization in terms of collaboration, code consistency, stability, and the ability to roll back to a prior version. Your code hosting platform provides an additional benefit that is often overlooked or underutilized. Some of you may have experience using dbt Cloud’s [webhook functionality](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) to run a job when a PR is created. This is a fantastic capability, and it meets most use cases for testing your code before merging to production. However, there are circumstances when an organization needs additional functionality, like running workflows on every commit (linting), or running workflows after a merge is complete. In this article, we will show you how to set up custom pipelines to lint your project and trigger a dbt Cloud job via the API.
-
-A note on parlance in this article: each code hosting platform uses different terms for similar concepts. The terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch.
-
-
-## What are pipelines?
-
-Pipelines (which are known by many names, such as workflows, actions, or build steps) are a series of pre-defined jobs that are triggered by specific events in your repository (PR created, commit pushed, branch merged, etc). Those jobs can do pretty much anything your heart desires assuming you have the proper security access and coding chops.
-
-Jobs are executed on [runners](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#runners), which are virtual servers. The runners come pre-configured with Ubuntu Linux, macOS, or Windows. That means the commands you execute are determined by the operating system of your runner. You’ll see how this comes into play later in the setup, but for now just remember that your code is executed on virtual servers that are, typically, hosted by the code hosting platform.
-
-![Diagram of how pipelines work](/img/guides/orchestration/custom-cicd-pipelines/pipeline-diagram.png)
-
-Please note, runners hosted by your code hosting platform provide a certain amount of free time. After that, billing charges may apply depending on how your account is set up. You also have the ability to host your own runners. That is beyond the scope of this article, but check out the links below for more information if you’re interested in setting that up:
-
-- Repo-hosted runner billing information:
- - [GitHub](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions)
- - [GitLab](https://docs.gitlab.com/ee/ci/pipelines/cicd_minutes.html)
- - [Bitbucket](https://bitbucket.org/product/features/pipelines#)
-- Self-hosted runner information:
- - [GitHub](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners)
- - [GitLab](https://docs.gitlab.com/runner/)
- - [Bitbucket](https://support.atlassian.com/bitbucket-cloud/docs/runners/)
-
-Additionally, if you’re using the free tier of GitLab you can still follow this guide, but it may ask you to provide a credit card to verify your account. You’ll see something like this the first time you try to run a pipeline:
-
-![Warning from GitLab showing payment information is required](/img/guides/orchestration/custom-cicd-pipelines/gitlab-cicd-payment-warning.png)
-
-
-## How to set up pipelines
-
-This guide provides details for multiple code hosting platforms. Where steps are unique, they are presented without a selection option. If code is specific to a platform (i.e. GitHub, GitLab, Bitbucket) you will see a selection option for each.
-
-Pipelines can be triggered by various events. The [dbt Cloud webhook](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) process already triggers a run if you want to run your jobs on a merge request, so this guide focuses on running pipelines for every push and when PRs are merged. Since pushes happen frequently in a project, we’ll keep this job super simple and fast by linting with SQLFluff. The pipeline that runs on merge requests will run less frequently, and can be used to call the dbt Cloud API to trigger a specific job. This can be helpful if you have specific requirements that need to happen when code is updated in production, like running a `--full-refresh` on all impacted incremental models.
-
-Here’s a quick look at what this pipeline will accomplish:
-
-![Diagram showing the pipelines to be created and the programs involved](/img/guides/orchestration/custom-cicd-pipelines/pipeline-programs-diagram.png)
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md b/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md
deleted file mode 100644
index 465994e4442..00000000000
--- a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md
+++ /dev/null
@@ -1,191 +0,0 @@
----
-title: Lint code on push
-id: 2-lint-on-push
----
-
-This section shows a very basic example of linting a project every time a commit is pushed to the repo. While it is simple, it shows the power of CI and can be expanded on to meet the needs of your organization.
-
-The steps below use [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. In the example, it's set to use the `snowflake` dialect, and specifically runs the rules L019, L020, L021, and L022. This is purely for demonstration purposes. You should update this to reflect your code base's [dialect](https://docs.sqlfluff.com/en/stable/dialects.html) and the [rules](https://docs.sqlfluff.com/en/stable/rules.html) you've established for your repo.
-
-### 1. Create a YAML file to define your pipeline
-
-The YAML files defined below are what tell your code hosting platform the steps to run. In this setup, you’re telling the platform to run a SQLFluff lint job every time a commit is pushed.
-
-
-
-
-In order for GitHub to know that you want to run an action, you need to have a few specific folders in your project. Add a new folder named `.github`, and within that folder add a new one named `workflows`. Your final folder structure will look like this:
-
-```
-my_awesome_project
-├── .github
-│ ├── workflows
-│ │ └── lint_on_push.yml
-```
-
-To define the job for our action, let’s add a new file named `lint_on_push.yml` under the `workflows` folder. This file is how we tell the GitHub runner what to execute when the job is triggered.
-
-Below I touch on the important pieces of this workflow, but if you want a full run-down of all the components of this YAML file, check out [this GitHub article](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on actions.
-
-**Key pieces:**
-
-- `on:` - this is used to filter when the pipeline is run. In this example we’re running it on every push except for pushes to branches named `main`. For more filters, check out [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows).
-- `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job
-- `uses:` - remember the virtual servers we covered in the background section? They’re just empty operating systems, so there are two pieces of setup needed in order to access the code in your repo and set up Python correctly on the virtual server. These two actions are called from other repos in GitHub to provide those services. For more information on them, check out their repos: [actions/checkout](https://github.com/actions/checkout#checkout-v3) and [actions/setup-python](https://github.com/actions/setup-python#setup-python-v3).
-- `run:` - this is how we tell the GitHub runner which commands to execute for this job.
-
-```yaml
-name: lint dbt project on push
-
-on:
- push:
- branches-ignore:
- - 'main'
-
-jobs:
-# this job runs SQLFluff with a specific set of rules
- # note the dialect is set to Snowflake, so make that specific to your setup
- # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html
- lint_project:
- name: Run SQLFluff linter
- runs-on: ubuntu-latest
-
- steps:
- - uses: "actions/checkout@v3"
- - uses: "actions/setup-python@v4"
- with:
- python-version: "3.9"
- - name: Install SQLFluff
- run: "pip install sqlfluff==0.13.1"
- - name: Lint project
- run: "sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022"
-
-```
-
-
-
-
-Create a `.gitlab-ci.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file.
-
-```
-my_awesome_project
-├── dbt_project.yml
-├── .gitlab-ci.yml
-```
-
-**Key pieces:**
-
-- `image: python:3.9` - this defines the virtual image we’re using to run the job
-- `rules:` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event except when the branch is named `main`. Filters are very powerful to run commands on specific events, and you can find a full list in [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules).
-- `script:` - this is how we tell the GitLab runner which commands to execute for this job.
-
-```yaml
-image: python:3.9
-
-stages:
- - pre-build
-
-# this job runs SQLFluff with a specific set of rules
-# note the dialect is set to Snowflake, so make that specific to your setup
-# details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html
-lint-project:
- stage: pre-build
- rules:
- - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main'
- script:
- - pip install sqlfluff==0.13.1
- - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022
-```
-
-
-
-
-Create a `bitbucket-pipelines.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file.
-
-```
-my_awesome_project
-├── bitbucket-pipelines.yml
-├── dbt_project.yml
-```
-
-**Key pieces:**
-
-- `image: python:3.11.1` - this defines the virtual image we’re using to run the job
-- `'**':` - this is used to filter when the pipeline runs. In this case we're telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `master`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/).
-- `script:` - this is how we tell the Bitbucket runner which commands to execute for this job.
-
-```yaml
-image: python:3.11.1
-
-
-pipelines:
- branches:
- '**': # this sets a wildcard to run on every branch
- - step:
- name: Lint dbt project
- script:
- - pip install sqlfluff==0.13.1
- - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022
-
- 'master': # override if your default branch doesn't run on a branch named "master"
- - step:
- script:
- - python --version
-```
-
-
-
-
-### 2. Commit and push your changes to make sure everything works
-
-After you finish creating the YAML files, commit and push your code. Doing this will trigger your pipeline for the first time! If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run.
-
-
-
-
-In your repository, click the *Actions* tab
-
-![Image showing the GitHub action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-github.png)
-
-Sample output from SQLFluff in the `Run SQLFluff linter` job:
-
-![Image showing the logs in GitHub for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-github.png)
-
-
-
-
-In the menu, go to *CI/CD > Pipelines*
-
-![Image showing the GitLab action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-gitlab.png)
-
-Sample output from SQLFluff in the `Run SQLFluff linter` job:
-
-![Image showing the logs in GitLab for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-gitlab.png)
-
-
-
-
-In the left menu pane, click on *Pipelines*
-
-![Image showing the Bitbucket action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png)
-
-Sample output from SQLFluff in the `Run SQLFluff linter` job:
-
-![Image showing the logs in Bitbucket for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png)
-
-
-
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md b/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md
deleted file mode 100644
index 8a6f8965b87..00000000000
--- a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md
+++ /dev/null
@@ -1,131 +0,0 @@
----
-title: Run a dbt Cloud job on pull request
-id: 4-dbt-cloud-job-on-pr
----
-
-:::info Run on PR
-
-If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/slim-ci-jobs).
-This section is only for those projects that connect to their git repository using an SSH key.
-
-:::
-
-If your git provider doesn't have a native integration with dbt Cloud, but you still want to take advantage of Slim CI builds, you've come to the right spot! With just a bit of work it's possible to set up a pipeline that will run a dbt Cloud job when a pull request (PR) is created.
-
-The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-3 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)**
-
-### 4. Create a pipeline job that runs when PRs are created
-
-
-
-For this job, we'll set it up using the `bitbucket-pipelines.yml` file as in the prior step. The YAML file will look pretty similar to our earlier job, but we’ll pass in the required variables to the Python script using `export` statements. Update this section to match your setup based on the comments in the file.
-
-**What is this pipeline going to do?**
-The setup below will trigger a dbt Cloud job to run every time a PR is opened in this repository. It will also run a fresh version of the pipeline for every commit that is made on the PR until it is merged.
-For example: If you open a PR, it will run the pipeline. If you then decide additional changes are needed, and commit/push to the PR branch, a new pipeline will run with the updated code.
-
-The following variables control this job:
- - `DBT_JOB_BRANCH`: Tells the dbt Cloud job to run the code in the branch that created this PR
- - `DBT_JOB_SCHEMA_OVERRIDE`: Tells the dbt Cloud job to run this into a custom target schema
- - The format of this will look like: `DBT_CLOUD_PR_{REPO_KEY}_{PR_NUMBER}`
-
-
-```yaml
-image: python:3.11.1
-
-
-pipelines:
- # This job will run when pull requests are created in the repository
- pull-requests:
- '**':
- - step:
- name: 'Run dbt Cloud PR Job'
- script:
- # Check to only build if the PR destination is main (or another branch of your choosing).
- # Comment out or remove the line below if you want to run on all PRs regardless of destination branch.
- - if [ "${BITBUCKET_PR_DESTINATION_BRANCH}" != "main" ]; then printf 'PR destination is not main, exiting.'; exit; fi
- - export DBT_URL="https://cloud.getdbt.com"
- - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job"
- - export DBT_JOB_BRANCH=$BITBUCKET_BRANCH
- - export DBT_JOB_SCHEMA_OVERRIDE="DBT_CLOUD_PR_"$BITBUCKET_PROJECT_KEY"_"$BITBUCKET_PR_ID
- - export DBT_ACCOUNT_ID=00000 # enter your account id here
- - export DBT_PROJECT_ID=00000 # enter your project id here
- - export DBT_PR_JOB_ID=00000 # enter your job id here
- - python python/run_and_monitor_dbt_job.py
-```
-
-
-
-
-### 5. Confirm the pipeline runs
-
-Now that you have a new pipeline, it's time to run it and make sure it works. Since this only triggers when a PR is created, you'll need to create a new PR on a branch that contains the code above. Once you do that, you should see a pipeline that looks like this:
-
-
-
-
-Bitbucket pipeline:
-![dbt run on PR job in Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png)
-
-dbt Cloud job:
-![dbt Cloud job showing it was triggered by Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png)
-
-
-
-
-### 6. Handle those extra schemas in your database
-
-As noted above, when the PR job runs it will create a new schema based on the PR. To avoid having your database overwhelmed with PR schemas, consider adding a "cleanup" job to your dbt Cloud account. This job can run on a scheduled basis to clean up any PR schemas that haven't been updated or used recently.
-
-Add this as a macro to your project. It takes 2 arguments that let you control which schemas get dropped:
- - `age_in_days`: The number of days since the schema was last altered before it should be dropped (default 10 days)
- - `database_to_clean`: The name of the database to remove schemas from
-
-```sql
-{#
- This macro finds PR schemas older than a set date and drops them
- The macro defaults to 10 days old, but can be configured with the input argument age_in_days
- Sample usage with different date:
- dbt run-operation pr_schema_cleanup --args "{'database_to_clean': 'analytics','age_in_days':'15'}"
-#}
-{% macro pr_schema_cleanup(database_to_clean, age_in_days=10) %}
-
- {% set find_old_schemas %}
- select
- 'drop schema {{ database_to_clean }}.'||schema_name||';'
- from {{ database_to_clean }}.information_schema.schemata
- where
- catalog_name = '{{ database_to_clean | upper }}'
- and schema_name ilike 'DBT_CLOUD_PR%'
- and last_altered <= (current_date() - interval '{{ age_in_days }} days')
- {% endset %}
-
- {% if execute %}
-
- {{ log('Schema drop statements:' ,True) }}
-
- {% set schema_drop_list = run_query(find_old_schemas).columns[0].values() %}
-
- {% for schema_to_drop in schema_drop_list %}
- {% do run_query(schema_to_drop) %}
- {{ log(schema_to_drop ,True) }}
- {% endfor %}
-
- {% endif %}
-
-{% endmacro %}
-```
-
-This macro goes into a dbt Cloud job that is run on a schedule. The command will look like this (text below for copy/paste):
-![dbt Cloud job showing the run operation command for the cleanup macro](/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png)
-`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"`
\ No newline at end of file
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md b/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md
deleted file mode 100644
index 6b39c5ce405..00000000000
--- a/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md
+++ /dev/null
@@ -1,8 +0,0 @@
----
-title: Something to Consider
-id: 5-something-to-consider
----
-
-Running dbt Cloud jobs through a CI/CD pipeline is a form of job orchestration. If you also run jobs using dbt Cloud’s built-in scheduler, you now have two orchestration tools running jobs. The risk with this is that you could run into conflicts: if you trigger pipelines on certain actions while also running scheduled jobs in dbt Cloud, you will probably run into job clashes. The more tools you have, the more you have to make sure everything talks to each other.
-
-That being said, if **the only reason you want to use pipelines is for adding a lint check or run on merge**, you might decide the pros outweigh the cons, and as such you want to go with a hybrid approach. Just keep in mind that if two processes try to run the same job at the same time, dbt Cloud will queue the jobs and run one after the other. It’s a balancing act, but it can be accomplished with diligence to ensure you’re orchestrating jobs in a manner that does not conflict.
\ No newline at end of file
diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md b/website/docs/guides/productionize-your-dbt-databricks-project.md
similarity index 83%
rename from website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md
rename to website/docs/guides/productionize-your-dbt-databricks-project.md
index 5da8cc6616b..b95d8ffd2dd 100644
--- a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md
+++ b/website/docs/guides/productionize-your-dbt-databricks-project.md
@@ -1,19 +1,27 @@
---
-title: Productionizing your dbt Databricks project
-id: "productionizing-your-dbt-databricks-project"
-sidebar_label: "Productionizing your dbt Databricks project"
-description: "Learn how to deliver models to end users and use best practices to maintain production data"
+title: Productionize your dbt Databricks project
+id: productionize-your-dbt-databricks-project
+description: "Learn how to deliver models to end users and use best practices to maintain production data."
+displayText: Productionize your dbt Databricks project
+hoverSnippet: Learn how to Productionize your dbt Databricks project.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'databricks'
+hide_table_of_contents: true
+tags: ['Databricks','dbt Core','dbt Cloud']
+level: 'Intermediate'
+recently_updated: true
---
+## Introduction
Welcome to the third installment of our comprehensive series on optimizing and deploying your data pipelines using Databricks and dbt Cloud. In this guide, we'll dive into delivering these models to end users while incorporating best practices to ensure that your production data remains reliable and timely.
-## Prerequisites
+### Prerequisites
-If you don't have any of the following requirements, refer to the instructions in the [setup guide](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project) to catch up:
+If you don't have any of the following requirements, refer to the instructions in the [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project) for help meeting these requirements:
-- You have [set up your Databricks and dbt Cloud environments](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project).
-- You have [optimized your dbt models for peak performance](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks).
+- You have [set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project).
+- You have [optimized your dbt models for peak performance](/guides/optimize-dbt-models-on-databricks).
- You have created two catalogs in Databricks: *dev* and *prod*.
- You have created Databricks Service Principal to run your production jobs.
- You have at least one [deployment environment](/docs/deploy/deploy-environments) in dbt Cloud.
@@ -35,16 +43,16 @@ Each dbt Cloud project can have multiple deployment environments, but only one d
With your deployment environment set up, it's time to create a production job to run in your *prod* environment.
-To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/dbt-cloud-job). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently.
+To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/deploy-jobs). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently.
Leveraging dbt Cloud's job scheduler allows data teams to own the entire transformation workflow. You don't need to learn and maintain additional tools for orchestration or rely on another team to schedule code written by your team. This end-to-end ownership simplifies the deployment process and accelerates the delivery of new data products.
-Let’s [create a job](/docs/deploy/dbt-cloud-job#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog.
+Let’s [create a job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog.
1. Create a new job by clicking **Deploy** in the header, click **Jobs** and then **Create job**.
2. **Name** the job “Daily refresh”.
3. Set the **Environment** to your *production* environment.
- - This will allow the job to inherit the catalog, schema, credentials, and environment variables defined in the [setup guide](https://docs.getdbt.com/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project#defining-your-dbt-deployment-environment).
+ - This will allow the job to inherit the catalog, schema, credentials, and environment variables defined in [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project).
4. Under **Execution Settings**
- Check the **Generate docs on run** checkbox to configure the job to automatically generate project docs each time this job runs. This will ensure your documentation stays evergreen as models are added and modified.
- Select the **Run on source freshness** checkbox to configure dbt [source freshness](/docs/deploy/source-freshness) as the first step of this job. Your sources will need to be configured to [snapshot freshness information](/docs/build/sources#snapshotting-source-data-freshness) for this to drive meaningful insights.
@@ -58,7 +66,7 @@ Let’s [create a job](/docs/deploy/dbt-cloud-job#create-and-schedule-jobs) in d
- dbt build is more efficient than issuing separate commands for dbt run and dbt test separately because it will run then test each model before continuing.
- We are excluding source data because we already tested it in step 2.
- The fail-fast flag will make dbt exit immediately if a single resource fails to build. If other models are in-progress when the first model fails, then dbt will terminate the connections for these still-running models.
-5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/job-triggers). You can enter specific days and timing or create a custom cron schedule.
+5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/deploy-jobs#schedule-days). You can enter specific days and timing or create a custom cron schedule.
- If you want your dbt Cloud job scheduled by another orchestrator, like Databricks Workflows, see the [Advanced Considerations](#advanced-considerations) section below.
This is just one example of an all-or-nothing command list designed to minimize wasted computing. The [job command list](/docs/deploy/job-commands) and [selectors](/reference/node-selection/syntax) provide a lot of flexibility on how your DAG will execute. You may want to design yours to continue running certain models if others fail. You may want to set up multiple jobs to refresh models at different frequencies. See our [Job Creation Best Practices discourse](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) for more job design suggestions.
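+
+For illustration, a single all-or-nothing step along these lines might look like the following sketch (the exact selector and flags here are an assumption, not the command prescribed in the steps above):
+
+```shell
+# Build and test everything except the already-tested sources,
+# and stop as soon as any resource fails
+dbt build --exclude source:* --fail-fast
+```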
@@ -67,7 +75,7 @@ After your job is set up and runs successfully, configure your **[project artifa
This will be our main production job to refresh data that will be used by end users. Another job everyone should include in their dbt project is a continuous integration job.
-### Add a CI job
+## Add a CI job
CI/CD, or Continuous Integration and Continuous Deployment/Delivery, has become a standard practice in software development for rapidly delivering new features and bug fixes while maintaining high quality and stability. dbt Cloud enables you to apply these practices to your data transformations.
@@ -79,21 +87,21 @@ dbt allows you to write [tests](/docs/build/tests) for your data pipeline, which
2. **Development**: Running tests during development ensures that your code changes do not break existing assumptions, enabling developers to iterate faster by catching problems immediately after writing code.
3. **CI checks**: Automated CI jobs run and test your pipeline end-to end when a pull request is created, providing confidence to developers, code reviewers, and end users that the proposed changes are reliable and will not cause disruptions or data quality issues
-Your CI job will ensure that the models build properly and pass any tests applied to them. We recommend creating a separate *test* environment and having a dedicated service principal. This will ensure the temporary schemas created during CI tests are in their own catalog and cannot unintentionally expose data to other users. Repeat the [steps](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project) used to create your *prod* environment to create a *test* environment. After setup, you should have:
+Your CI job will ensure that the models build properly and pass any tests applied to them. We recommend creating a separate *test* environment and having a dedicated service principal. This will ensure the temporary schemas created during CI tests are in their own catalog and cannot unintentionally expose data to other users. Repeat the steps in [Set up your dbt project with Databricks](/guides/set-up-your-databricks-dbt-project) that you used to create your *prod* environment to create a *test* environment. After setup, you should have:
- A catalog called *test*
- A service principal called *dbt_test_sp*
- A new dbt Cloud environment called *test* that defaults to the *test* catalog and uses the *dbt_test_sp* token in the deployment credentials
-We recommend setting up a dbt Cloud Slim CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. To create a Slim CI job, refer to [Set up Slim CI jobs](/docs/deploy/slim-ci-jobs) for details.
+We recommend setting up a dbt Cloud CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. To create a CI job, refer to [Set up CI jobs](/docs/deploy/ci-jobs) for details.
With dbt tests and SlimCI, you can feel confident that your production data will be timely and accurate even while delivering at high velocity.
-### Monitor your jobs
+## Monitor your jobs
Keeping a close eye on your dbt Cloud jobs is crucial for maintaining a robust and efficient data pipeline. By monitoring job performance and quickly identifying potential issues, you can ensure that your data transformations run smoothly. dbt Cloud provides three entry points to monitor the health of your project: run history, deployment monitor, and status tiles.
-The [run history](/docs/deploy/dbt-cloud-job) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu.
+The [run history](/docs/deploy/run-visibility#run-history) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu.
The deployment monitor in dbt Cloud offers a higher-level view of your run history, enabling you to gauge the health of your data pipeline over an extended period of time. This feature includes information on run durations and success rates, allowing you to identify trends in job performance, such as increasing run times or more frequent failures. The deployment monitor also highlights jobs in progress, queued, and recent failures. To access the deployment monitor click on the dbt logo in the top left corner of the dbt Cloud UI.
@@ -101,7 +109,7 @@ The deployment monitor in dbt Cloud offers a higher-level view of your run histo
By adding [status tiles](/docs/deploy/dashboard-status-tiles) to your BI dashboards, you can give stakeholders visibility into the health of your data pipeline without leaving their preferred interface. Status tiles instill confidence in your data and help prevent unnecessary inquiries or context switching. To implement dashboard status tiles, you'll need to have dbt docs with [exposures](/docs/build/exposures) defined.
-### Notifications
+## Set up notifications
Setting up [notifications](/docs/deploy/job-notifications) in dbt Cloud allows you to receive alerts via email or a Slack channel whenever a run ends. This ensures that the appropriate teams are notified and can take action promptly when jobs fail or are canceled. To set up notifications:
@@ -109,9 +117,9 @@ Setting up [notifications](/docs/deploy/job-notifications) in dbt Cloud allows y
2. Select the **Notifications** tab.
3. Choose the desired notification type (Email or Slack) and configure the relevant settings.
-If you require notifications through other means than email or Slack, you can use dbt Cloud's outbound [webhooks](/docs/deploy/webhooks) feature to relay job events to other tools. Webhooks enable you to [integrate dbt Cloud with a wide range of SaaS applications](/guides/orchestration/webhooks), extending your pipeline’s automation into other systems.
+If you require notifications through means other than email or Slack, you can use dbt Cloud's outbound [webhooks](/docs/deploy/webhooks) feature to relay job events to other tools. Webhooks enable you to integrate dbt Cloud with a wide range of SaaS applications, extending your pipeline’s automation into other systems.
-### Troubleshooting
+## Troubleshooting
When a disruption occurs in your production pipeline, it's essential to know how to troubleshoot issues effectively to minimize downtime and maintain a high degree of trust with your stakeholders.
@@ -121,15 +129,14 @@ The five key steps for troubleshooting dbt Cloud issues are:
2. Inspect the problematic file and look for an immediate fix.
3. Isolate the problem by running one model at a time in the IDE or undoing the code that caused the issue.
4. Check for problems in compiled files and logs.
-5. Seek help from the [dbt Cloud support team](/docs/dbt-support) if needed.
-Consult the [Debugging errors documentation](/guides/best-practices/debugging-errors) for a comprehensive list of error types and diagnostic methods.
+Consult the [Debugging errors documentation](/guides/debug-errors) for a comprehensive list of error types and diagnostic methods.
To troubleshoot issues with a dbt Cloud job, navigate to the "Deploy > Run History" tab in your dbt Cloud project and select the failed run. Then, expand the run steps to view [console and debug logs](/docs/deploy/run-visibility#access-logs) to review the detailed log messages. To obtain additional information, open the Artifacts tab and download the compiled files associated with the run.
If your jobs are taking longer than expected, use the [model timing](/docs/deploy/run-visibility#model-timing) dashboard to identify bottlenecks in your pipeline. Analyzing the time taken for each model execution helps you pinpoint the slowest components and optimize them for better performance. The Databricks [Query History](https://docs.databricks.com/sql/admin/query-history.html) lets you inspect granular details such as time spent in each task, rows returned, I/O performance, and execution plan.
-For more on performance tuning, see our guide on [How to Optimize and Troubleshoot dbt Models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks).
+For more on performance tuning, see our guide on [How to Optimize and Troubleshoot dbt Models on Databricks](/guides/optimize-dbt-models-on-databricks).
## Advanced considerations
@@ -149,11 +156,11 @@ Inserting dbt Cloud jobs into a Databricks Workflows allows you to chain togethe
- Logs and Run History: Accessing logs and run history becomes more convenient when using dbt Cloud.
- Monitoring and Notification Features: dbt Cloud comes equipped with monitoring and notification features like the ones described above that can help you stay informed about the status and performance of your jobs.
-To trigger your dbt Cloud job from Databricks, follow the instructions in our [Databricks Workflows to run dbt Cloud jobs guide](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs).
+To trigger your dbt Cloud job from Databricks, follow the instructions in our [Databricks Workflows to run dbt Cloud jobs guide](/guides/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs).
-### Data masking
+## Data masking
-Our [Best Practices for dbt and Unity Catalog](/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices) guide recommends using separate catalogs *dev* and *prod* for development and deployment environments, with Unity Catalog and dbt Cloud handling configurations and permissions for environment isolation. Ensuring security while maintaining efficiency in your development and deployment environments is crucial. Additional security measures may be necessary to protect sensitive data, such as personally identifiable information (PII).
+Our [Best Practices for dbt and Unity Catalog](/best-practices/dbt-unity-catalog-best-practices) guide recommends using separate catalogs *dev* and *prod* for development and deployment environments, with Unity Catalog and dbt Cloud handling configurations and permissions for environment isolation. Ensuring security while maintaining efficiency in your development and deployment environments is crucial. Additional security measures may be necessary to protect sensitive data, such as personally identifiable information (PII).
Databricks leverages [Dynamic Views](https://docs.databricks.com/data-governance/unity-catalog/create-views.html#create-a-dynamic-view) to enable data masking based on group membership. Because views in Unity Catalog use Spark SQL, you can implement advanced data masking by using more complex SQL expressions and regular expressions. You can now also apply fine grained access controls like row filters in preview and column masks in preview on tables in Databricks Unity Catalog, which will be the recommended approach to protect sensitive data once this goes GA. Additionally, in the near term, Databricks Unity Catalog will also enable Attribute Based Access Control natively, which will make protecting sensitive data at scale simpler.
@@ -180,10 +187,10 @@ Unity Catalog is a unified governance solution for your lakehouse. It provides a
To get the most out of both tools, you can use the [persist docs config](/reference/resource-configs/persist_docs) to push table and column descriptions written in dbt into Unity Catalog, making the information easily accessible to both tools' users. Keeping the descriptions in dbt ensures they are version controlled and can be reproduced after a table is dropped.
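+
+As a minimal sketch, enabling this for every model in your project could look like the following in `dbt_project.yml` (replace `my_project` with your project name, and scope it to specific folders if you prefer):
+
+```yaml
+models:
+  my_project:
+    +persist_docs:
+      relation: true   # persist model (relation) descriptions
+      columns: true    # persist column descriptions
+```
+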
-## Additional resources
+### Related docs
- [Advanced deployments course](https://courses.getdbt.com/courses/advanced-deployment) if you want a deeper dive into these topics
- [Autoscaling CI: The intelligent Slim CI](https://docs.getdbt.com/blog/intelligent-slim-ci)
- [Trigger a dbt Cloud Job in your automated workflow with Python](https://discourse.getdbt.com/t/triggering-a-dbt-cloud-job-in-your-automated-workflow-with-python/2573)
-- [Databricks + dbt Cloud Quickstart Guide](/quickstarts/databricks)
+- [Databricks + dbt Cloud Quickstart Guide](/guides/databricks)
- Reach out to your Databricks account team to get access to preview features on Databricks.
diff --git a/website/docs/quickstarts/redshift-qs.md b/website/docs/guides/redshift-qs.md
similarity index 97%
rename from website/docs/quickstarts/redshift-qs.md
rename to website/docs/guides/redshift-qs.md
index fc7e178f163..890be27e50a 100644
--- a/website/docs/quickstarts/redshift-qs.md
+++ b/website/docs/guides/redshift-qs.md
@@ -1,9 +1,10 @@
---
title: "Quickstart for dbt Cloud and Redshift"
-id: "redshift"
-platform: 'dbt-cloud'
+id: redshift
+level: 'Beginner'
icon: 'redshift'
hide_table_of_contents: true
+tags: ['Redshift', 'dbt Cloud','Quickstart']
---
## Introduction
@@ -31,8 +32,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen
### Related content
- Learn more with [dbt Courses](https://courses.getdbt.com/collections)
-- [dbt Cloud CI job](/docs/deploy/continuous-integration)
-- [Job triggers](/docs/deploy/job-triggers)
+- [CI jobs](/docs/deploy/continuous-integration)
+- [Deploy jobs](/docs/deploy/deploy-jobs)
- [Job notifications](/docs/deploy/job-notifications)
- [Source freshness](/docs/deploy/source-freshness)
@@ -56,7 +57,7 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen
-7. You might be asked to Configure account. For the purpose of this sandbox environment, we recommend selecting “Configure account”.
+7. You might be asked to Configure account. For this sandbox environment, we recommend selecting “Configure account”.
8. Select your cluster from the list. In the **Connect to** popup, fill out the credentials from the output of the stack:
- **Authentication** — Use the default which is **Database user name and password** (NOTE: IAM authentication is not supported in dbt Cloud).
@@ -81,8 +82,7 @@ Now we are going to load our sample data into the S3 bucket that our Cloudformat
2. Now we are going to use the S3 bucket that you created with CloudFormation and upload the files. Go to the search bar at the top and type in `S3` and click on S3. There will be sample data in the bucket already, feel free to ignore it or use it for other modeling exploration. The bucket will be prefixed with `dbt-data-lake`.
-
-
+
3. Click on the `name of the bucket` S3 bucket. If you have multiple S3 buckets, this will be the bucket that was listed under “Workshopbucket” on the Outputs page.
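+
+If you prefer the command line over the console for the upload in step 2, a rough sketch with the AWS CLI looks like this (assuming the CLI is configured and the sample CSV files are in your working directory; the bucket name is a placeholder):
+
+```shell
+# Find the bucket created by CloudFormation (prefixed with dbt-data-lake)
+aws s3 ls
+# Copy the sample CSV files into that bucket
+aws s3 cp . s3://dbt-data-lake-<your-suffix>/ --recursive --exclude "*" --include "*.csv"
+```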
diff --git a/website/docs/guides/migration/tools/refactoring-legacy-sql.md b/website/docs/guides/refactoring-legacy-sql.md
similarity index 92%
rename from website/docs/guides/migration/tools/refactoring-legacy-sql.md
rename to website/docs/guides/refactoring-legacy-sql.md
index 9dd66abb495..a339e523020 100644
--- a/website/docs/guides/migration/tools/refactoring-legacy-sql.md
+++ b/website/docs/guides/refactoring-legacy-sql.md
@@ -2,15 +2,24 @@
title: Refactoring legacy SQL to dbt
id: refactoring-legacy-sql
description: This guide walks through refactoring a long SQL query (perhaps from a stored procedure) into modular dbt data models.
+displayText: Refactoring legacy SQL to dbt
+hoverSnippet: Learn how to refactor a long SQL query into modular dbt data models.
+# time_to_complete: '30 minutes' commenting out until we test
+platform: 'dbt-cloud'
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['SQL']
+level: 'Advanced'
+recently_updated: true
---
-You may have already learned how to build dbt models from scratch.
+## Introduction
-But in reality, you probably already have some queries or stored procedures that power analyses and dashboards, and now you’re wondering how to port those into dbt.
+You may have already learned how to build dbt models from scratch. But in reality, you probably already have some queries or stored procedures that power analyses and dashboards, and now you’re wondering how to port those into dbt.
There are two parts to accomplish this: migration and refactoring. In this guide we’re going to learn a process to help us turn legacy SQL code into modular dbt models.
-When migrating and refactoring code, it’s of course important to stay organized. We'll do this by following several steps (jump directly from the right sidebar):
+When migrating and refactoring code, it’s of course important to stay organized. We'll do this by following several steps:
1. Migrate your code 1:1 into dbt
2. Implement dbt sources rather than referencing raw database tables
@@ -21,9 +30,10 @@ When migrating and refactoring code, it’s of course important to stay organize
Let's get into it!
-:::info More resources.
-This guide is excerpted from the new dbt Learn On-demand Course, "Refactoring SQL for Modularity" - if you're curious, pick up the [free refactoring course here](https://courses.getdbt.com/courses/refactoring-sql-for-modularity), which includes example and practice refactoring projects. Or for a more in-depth look at migrating DDL and DML from stored procedures check out [this guide](/guides/migration/tools/migrating-from-stored-procedures/1-migrating-from-stored-procedures).
+:::info More resources
+This guide is excerpted from the new dbt Learn On-demand Course, "Refactoring SQL for Modularity" - if you're curious, pick up the [free refactoring course here](https://courses.getdbt.com/courses/refactoring-sql-for-modularity), which includes example and practice refactoring projects. Or for a more in-depth look at migrating DDL and DML from stored procedures, refer to the [Migrate from stored procedures](/guides/migrate-from-stored-procedures) guide.
:::
+
## Migrate your existing SQL code
@@ -38,7 +48,7 @@ To get going, you'll copy your legacy SQL query into your dbt project, by saving
Once you've copied it over, you'll want to `dbt run` to execute the query and populate the table in your warehouse.
-If this is your first time running dbt, you may want to start with the [Introduction to dbt](/docs/introduction) and the earlier sections of the [quickstart guide](/quickstarts) before diving into refactoring.
+If this is your first time running dbt, you may want to start with the [Introduction to dbt](/docs/introduction) and the earlier sections of the [quickstart guide](/guides) before diving into refactoring.
This step may sound simple, but if you're porting over an existing set of SQL transformations to a new SQL dialect, you will need to consider how your legacy SQL dialect differs from your new SQL flavor, and you may need to modify your legacy code to get it to run at all.
@@ -59,7 +69,7 @@ This allows you to call the same table in multiple places with `{{ src('my_sourc
We start here for several reasons:
#### Source freshness reporting
-Using sources unlocks the ability to run [source freshness reporting](docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale.
+Using sources unlocks the ability to run [source freshness reporting](/docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale.
#### Easy dependency tracing
If you're migrating multiple stored procedures into dbt, with sources you can see which queries depend on the same raw tables.
@@ -206,7 +216,7 @@ This allows anyone after us to easily step through the CTEs when troubleshooting
## Port CTEs to individual data models
Rather than keep our SQL code confined to one long SQL file, we'll now start splitting it into modular + reusable [dbt data models](https://docs.getdbt.com/docs/build/models).
-Internally at dbt Labs, we follow roughly this [data modeling technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) and we [structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) accordingly.
+Internally at dbt Labs, we follow roughly this [data modeling technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) and we [structure our dbt projects](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview) accordingly.
We'll follow those structures in this walkthrough, but your team's conventions may of course differ from ours.
@@ -243,7 +253,7 @@ Under the hood, it generates comparison queries between our before and after sta
Sure, we could write our own query manually to audit these models, but using the dbt `audit_helper` package gives us a head start and allows us to identify variances more quickly.
-## Ready for refactoring practice?
+### Ready for refactoring practice?
Head to the free on-demand course, [Refactoring from Procedural SQL to dbt](https://courses.getdbt.com/courses/refactoring-sql-for-modularity) for a more in-depth refactoring example + a practice refactoring problem to test your skills.
Questions on this guide or the course? Drop a note in #learn-on-demand in [dbt Community Slack](https://getdbt.com/community).
diff --git a/website/docs/guides/orchestration/webhooks/serverless-datadog.md b/website/docs/guides/serverless-datadog.md
similarity index 66%
rename from website/docs/guides/orchestration/webhooks/serverless-datadog.md
rename to website/docs/guides/serverless-datadog.md
index cb03c72c6b5..931ba9832ab 100644
--- a/website/docs/guides/orchestration/webhooks/serverless-datadog.md
+++ b/website/docs/guides/serverless-datadog.md
@@ -1,62 +1,71 @@
---
title: "Create Datadog events from dbt Cloud results"
-id: webhooks-guide-serverless-datadog
-slug: serverless-datadog
-description: Configure a serverless app to add Datadog logs
+id: serverless-datadog
+description: Configure a serverless app to add dbt Cloud events to Datadog logs.
+hoverSnippet: Learn how to configure a serverless app to add dbt Cloud events to Datadog logs.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
-This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. To do this, when a dbt Cloud job completes it will create a log entry for each node that was run, containing all information about the node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-models).
+## Introduction
+
+This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. When a dbt Cloud job completes, the app will create a log entry for each node that was run, containing all the information about that node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-job-models).
In this example, we will use [fly.io](https://fly.io) for hosting/running the service. fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run).
-## Prerequisites
+### Prerequisites
+
This guide assumes some familiarity with:
- [dbt Cloud Webhooks](/docs/deploy/webhooks)
- CLI apps
- Deploying code to a serverless code runner like fly.io or AWS Lambda
-## Integration steps
-
-### 1. Clone the `dbt-cloud-webhooks-datadog` repo
+## Clone the `dbt-cloud-webhooks-datadog` repo
[This repository](https://github.com/dpguthrie/dbt-cloud-webhooks-datadog) contains the sample code for validating a webhook and creating logs in Datadog.
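+
+For example, to clone it locally (assuming you have `git` installed):
+
+```shell
+git clone https://github.com/dpguthrie/dbt-cloud-webhooks-datadog.git
+cd dbt-cloud-webhooks-datadog
+```
+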
-### 2. Install `flyctl` and sign up for fly.io
+## Install `flyctl` and sign up for fly.io
-Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands:
+Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands:
Switch to the directory containing the repo you cloned in step 1:
-```shell
-#example: replace with your actual path
-cd ~/Documents/GitHub/dbt-cloud-webhooks-datadog
-```
+
+ ```shell
+ #example: replace with your actual path
+ cd ~/Documents/GitHub/dbt-cloud-webhooks-datadog
+ ```
Sign up for fly.io:
-```shell
-flyctl auth signup
-```
+ ```shell
+ flyctl auth signup
+ ```
Your console should show `successfully logged in as YOUR_EMAIL` when you're done, but if it doesn't then sign in to fly.io from your command line:
-```shell
-flyctl auth login
-```
+ ```shell
+ flyctl auth login
+ ```
+
+## Launch your fly.io app
-### 3. Launch your fly.io app
Launching your app publishes it to the web and makes it ready to catch webhook events:
-```shell
-flyctl launch
-```
+ ```shell
+ flyctl launch
+ ```
-You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app.
+1. You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app.
-Choose an app name of your choosing, such as `YOUR_COMPANY-dbt-cloud-webhook-datadog`, or leave blank and one will be generated for you. Note that your name can only contain numbers, lowercase letters and dashes.
+2. Choose an app name, such as `YOUR_COMPANY-dbt-cloud-webhook-datadog`, or leave it blank and one will be generated for you. Note that the name can only contain numbers, lowercase letters, and dashes.
-Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`).
+3. Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`).
-When asked if you would like to set up Postgresql or Redis databases, type `n` for each.
+4. When asked if you would like to set up Postgresql or Redis databases, type `n` for each.
-Type `y` when asked if you would like to deploy now.
+5. Type `y` when asked if you would like to deploy now.
Sample output from the setup wizard:
@@ -86,16 +95,16 @@ Wrote config file fly.toml
-### 4. Create a Datadog API Key
+## Create a Datadog API Key
[Create an API Key for your Datadog account](https://docs.datadoghq.com/account_management/api-app-keys/) and make note of it and your Datadog site (e.g. `datadoghq.com`) for later.
-### 5. Configure a new webhook in dbt Cloud
-See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**.
-
-Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`)
+## Configure a new webhook in dbt Cloud
-Make note of the Webhook Secret Key for later.
+1. See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**.
+2. Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`).
+3. Make note of the Webhook Secret Key for later.
*Do not test the endpoint*; it won't work until you have stored the auth keys (next step)
-### 6. Store secrets
+## Store secrets
+
The application requires four secrets to be set, using these names:
- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metadata Only` permission.
- `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier.
@@ -103,9 +112,10 @@ The application requires four secrets to be set, using these names:
- `DD_SITE`: The Datadog site for your organisation, e.g. `datadoghq.com`.
Set these secrets as follows, replacing `abc123` etc with actual values:
-```shell
-flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 DD_API_KEY=ghi789 DD_SITE=datadoghq.com
-```
+ ```shell
+ flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 DD_API_KEY=ghi789 DD_SITE=datadoghq.com
+ ```
+
+## Deploy your app
-### 7. Deploy your app
-After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**.
\ No newline at end of file
+After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**.
diff --git a/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md b/website/docs/guides/serverless-pagerduty.md
similarity index 87%
rename from website/docs/guides/orchestration/webhooks/serverless-pagerduty.md
rename to website/docs/guides/serverless-pagerduty.md
index 5455af60110..50cc1b2b36e 100644
--- a/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md
+++ b/website/docs/guides/serverless-pagerduty.md
@@ -1,10 +1,18 @@
---
-title: "Create PagerDuty alarms from failed dbt Cloud tasks"
-id: webhooks-guide-serverless-pagerduty
-slug: serverless-pagerduty
-description: Configure a serverless app to create PagerDuty alarms
+title: "Trigger PagerDuty alarms when dbt Cloud jobs fail"
+id: serverless-pagerduty
+description: Use webhooks to configure a serverless app to trigger PagerDuty alarms.
+hoverSnippet: Learn how to configure a serverless app that uses webhooks to trigger PagerDuty alarms.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
+
This guide will teach you how to build and host a basic Python app which will monitor dbt Cloud jobs and create PagerDuty alarms based on failure. To do this, when a dbt Cloud job completes it will:
- Check for any failed nodes (e.g. non-passing tests or errored models), and
- create a PagerDuty alarm based on those nodes by calling the PagerDuty Events API. Events are deduplicated per run ID.
@@ -13,20 +21,20 @@ This guide will teach you how to build and host a basic Python app which will mo
In this example, we will use fly.io for hosting/running the service. fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run).
-## Prerequisites
+### Prerequisites
+
This guide assumes some familiarity with:
- [dbt Cloud Webhooks](/docs/deploy/webhooks)
- CLI apps
- Deploying code to a serverless code runner like fly.io or AWS Lambda
-## Integration steps
-### 1. Clone the `dbt-cloud-webhooks-pagerduty` repo
+## Clone the `dbt-cloud-webhooks-pagerduty` repo
[This repository](https://github.com/dpguthrie/dbt-cloud-webhooks-pagerduty) contains the sample code for validating a webhook and creating events in PagerDuty.
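+
+For example, to clone it locally (assuming you have `git` installed):
+
+```shell
+git clone https://github.com/dpguthrie/dbt-cloud-webhooks-pagerduty.git
+cd dbt-cloud-webhooks-pagerduty
+```
+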
-### 2. Install `flyctl` and sign up for fly.io
+## Install `flyctl` and sign up for fly.io
Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands:
@@ -46,7 +54,7 @@ Your console should show `successfully logged in as YOUR_EMAIL` when you're done
flyctl auth login
```
-### 3. Launch your fly.io app
+## Launch your fly.io app
Launching your app publishes it to the web and makes it ready to catch webhook events:
```shell
flyctl launch
@@ -87,12 +95,12 @@ Wrote config file fly.toml
-### 4. Create a PagerDuty integration application
+## Create a PagerDuty integration application
See [PagerDuty's guide](https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgw-events-api-v2-overview#getting-started) for full instructions.
Make note of the integration key for later.
-### 5. Configure a new webhook in dbt Cloud
+## Configure a new webhook in dbt Cloud
See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**.
Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`)
@@ -101,7 +109,7 @@ Make note of the Webhook Secret Key for later.
*Do not test the endpoint*; it won't work until you have stored the auth keys (next step)
-### 6. Store secrets
+## Store secrets
The application requires three secrets to be set, using these names:
- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metadata Only` permission.
- `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier.
@@ -112,5 +120,6 @@ Set these secrets as follows, replacing `abc123` etc with actual values:
flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 PD_ROUTING_KEY=ghi789
```
-### 7. Deploy your app
-After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**.
\ No newline at end of file
+## Deploy your app
+
+After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**.
diff --git a/website/docs/guides/set-up-ci.md b/website/docs/guides/set-up-ci.md
new file mode 100644
index 00000000000..83362094ec6
--- /dev/null
+++ b/website/docs/guides/set-up-ci.md
@@ -0,0 +1,355 @@
+---
+title: "Get started with Continuous Integration tests"
+description: Implement a CI environment for safe project validation.
+hoverSnippet: Learn how to implement a CI environment for safe project validation.
+id: set-up-ci
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['dbt Cloud', 'Orchestration', 'CI']
+level: 'Intermediate'
+recently_updated: true
+---
+
+## Introduction
+
+By validating your code _before_ it goes into production, you don't need to spend your afternoon fielding messages from people whose reports are suddenly broken.
+
+A solid CI setup is critical to preventing avoidable downtime and broken trust. dbt Cloud uses **sensible defaults** to get you up and running in a performant and cost-effective way in minimal time.
+
+After that, there's time to get fancy, but let's walk before we run.
+
+In this guide, we're going to add a **CI environment**, where proposed changes can be validated in the context of the entire project without impacting production systems. We will use a single set of deployment credentials (like the Prod environment), but models are built in a separate location to avoid impacting others (like the Dev environment).
+
+Your git flow will look like this:
+
+
+### Prerequisites
+
+As part of your initial dbt Cloud setup, you should already have Development and Production environments configured. Let's recap what each does:
+
+- Your **Development environment** powers the IDE. Each user has individual credentials, and builds into an individual dev schema. Nothing you do here impacts any of your colleagues.
+- Your **Production environment** brings the canonical version of your project to life for downstream consumers. There is a single set of deployment credentials, and everything is built into your production schema(s).
+
+## Create a new CI environment
+
+See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **CI**. Just like your existing Production environment, it will be a Deployment-type environment.
+
+When setting a Schema in the **Deployment Credentials** area, remember that dbt Cloud will automatically generate a custom schema name for each PR to ensure that they don't interfere with your deployed models. This means you can safely set the same Schema name as your Production job.
+
+### 1. Double-check your Production environment is identified
+
+Go into your existing Production environment, and ensure that the **Set as Production environment** checkbox is set. It'll make things easier later.
+
+### 2. Create a new job in the CI environment
+
+Use the **Continuous Integration Job** template, and call the job **CI Check**.
+
+In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down:
+
+- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped.
+- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs.
+
+To be able to find modified nodes, dbt needs to have something to compare against. dbt Cloud uses the last successful run of any job in your Production environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). As long as you identified your Production environment in Step 1, you won't need to touch this. If you didn't, pick the right environment from the dropdown.
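+
+If you ever want to reproduce this selection outside of dbt Cloud (for example, with dbt Core), a rough equivalent is sketched below. It assumes you've downloaded a production run's `manifest.json` into a local `prod-artifacts/` folder:
+
+```shell
+# Build only new or changed nodes and their children,
+# deferring unchanged upstream references to the production artifacts
+dbt build --select state:modified+ --defer --state prod-artifacts/
+```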
+
+### 3. Test your process
+
+That's it! There are other steps you can take to be even more confident in your work, such as validating that your structure follows best practices and linting your code. These are covered in the following sections of this guide.
+
+To test your new flow, create a new branch in the dbt Cloud IDE, then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft). Within a few seconds, you’ll see a new check appear in your git provider.
+
+### Things to keep in mind
+
+- If you make a new commit while a CI run based on older code is in progress, it will be automatically canceled and replaced with the fresh code.
+- An unlimited number of CI jobs can run at once. If 10 developers all commit code to different PRs at the same time, each person will get their own schema containing their changes. Once each PR is merged, dbt Cloud will drop that schema.
+- CI jobs will never block a production run.
+
+## Enforce best practices with dbt project evaluator
+
+dbt Project Evaluator is a package designed to identify deviations from best practices common to many dbt projects, including modeling, testing, documentation, structure and performance problems. For an introduction to the package, read its [launch blog post](/blog/align-with-dbt-project-evaluator).
+
+### 1. Install the package
+
+As with all packages, add a reference to `dbt-labs/dbt_project_evaluator` to your `packages.yml` file. See the [dbt Package Hub](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest/) for full installation instructions.
+
+### 2. Define test severity with an environment variable
+
+As noted in the [documentation](https://dbt-labs.github.io/dbt-project-evaluator/latest/ci-check/), tests in the package are set to `warn` severity by default.
+
+To have these tests fail in CI, create a new environment variable called `DBT_PROJECT_EVALUATOR_SEVERITY`. Set the project-wide default to `warn`, and set it to `error` in the CI environment.
+
+In your `dbt_project.yml` file, override the severity configuration:
+
+```yaml
+tests:
+  dbt_project_evaluator:
+    +severity: "{{ env_var('DBT_PROJECT_EVALUATOR_SEVERITY', 'warn') }}"
+```
+
+### 3. Update your CI commands
+
+Because these tests should only run after the rest of your project has been built, your existing CI command will need to be updated to exclude the dbt_project_evaluator package. You will then add a second step which builds _only_ the package's models and tests.
+
+Update your steps to:
+
+```bash
+dbt build --select state:modified+ --exclude package:dbt_project_evaluator
+dbt build --select package:dbt_project_evaluator
+```
+
+### 4. Apply any customizations
+
+Depending on the state of your project when you roll out the evaluator, you may need to skip some tests or allow exceptions for some areas. To do this, refer to the documentation on:
+
+- [disabling tests](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/customization/)
+- [excluding groups of models from a specific test](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/exceptions/)
+- [excluding packages or sources/models based on path](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/excluding-packages-and-paths/)
+
+If you create a seed to exclude groups of models from a specific test, remember to disable the default seed and include `dbt_project_evaluator_exceptions` in your second `dbt build` command above.
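+
+For reference, disabling the package's default seed might look like the following sketch in your `dbt_project.yml` (check the package docs linked above for the exact, current configuration):
+
+```yaml
+seeds:
+  dbt_project_evaluator:
+    dbt_project_evaluator_exceptions:
+      +enabled: false   # use your own exceptions seed instead
+```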
+
+## Run linting checks with SQLFluff
+
+By [linting](/docs/cloud/dbt-cloud-ide/lint-format#lint) your project during CI, you can ensure that code styling standards are consistently enforced, without spending human time nitpicking comma placement.
+
+The steps below create an action/pipeline which uses [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. If you don't already have SQLFluff rules defined, check out [our recommended config file](/best-practices/how-we-style/2-how-we-style-our-sql).
+
+### 1. Create a YAML file to define your pipeline
+
+The YAML files defined below are what tell your code hosting platform the steps to run. In this setup, you’re telling the platform to run a SQLFluff lint job every time a commit is pushed.
+
+
+
+
+GitHub Actions are defined in the `.github/workflows` directory. To define the job for your action, add a new file named `lint_on_push.yml` under the `workflows` folder. Your final folder structure will look like this:
+
+```text
+my_awesome_project
+├── .github
+│ ├── workflows
+│ │ └── lint_on_push.yml
+```
+
+**Key pieces:**
+
+- `on:` defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other trigger options, check out [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows).
+- `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job
+- `uses:` - When the Ubuntu server is created, it is completely empty. [`checkout`](https://github.com/actions/checkout#checkout-v3) and [`setup-python`](https://github.com/actions/setup-python#setup-python-v3) are public GitHub Actions which enable the server to access the code in your repo, and set up Python correctly.
+- `run:` - these steps are run at the command line, as though you typed them at a prompt yourself. This will install sqlfluff and lint the project. Be sure to set the correct `--dialect` for your project.
+
+For a full breakdown of the properties in a workflow file, see [Understanding the workflow file](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on GitHub's website.
+
+```yaml
+name: lint dbt project on push
+
+on:
+ push:
+ branches-ignore:
+ - 'main'
+
+jobs:
+ # this job runs SQLFluff with a specific set of rules
+ # note the dialect is set to Snowflake, so make that specific to your setup
+ # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html
+ lint_project:
+ name: Run SQLFluff linter
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: "actions/checkout@v3"
+ - uses: "actions/setup-python@v4"
+ with:
+ python-version: "3.9"
+ - name: Install SQLFluff
+ run: "pip install sqlfluff"
+ - name: Lint project
+ run: "sqlfluff lint models --dialect snowflake"
+
+```
+
+
+
+
+Create a `.gitlab-ci.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file.
+
+```text
+my_awesome_project
+├── dbt_project.yml
+├── .gitlab-ci.yml
+```
+
+**Key pieces:**
+
+- `image: python:3.9` - this defines the virtual image we’re using to run the job
+- `rules:` - defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other rules, refer to [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules).
+- `script:` - this is how we’re telling the GitLab runner to install SQLFluff and lint the project.
+
+```yaml
+image: python:3.9
+
+stages:
+ - pre-build
+
+# this job runs SQLFluff with a specific set of rules
+# note the dialect is set to Snowflake, so make that specific to your setup
+# details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html
+lint-project:
+ stage: pre-build
+ rules:
+ - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main'
+ script:
+ - pip install sqlfluff
+ - sqlfluff lint models --dialect snowflake
+```
+
+
+
+
+Create a `bitbucket-pipelines.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file.
+
+```text
+my_awesome_project
+├── bitbucket-pipelines.yml
+├── dbt_project.yml
+```
+
+**Key pieces:**
+
+- `image: python:3.11.1` - this defines the virtual image we’re using to run the job
+- `'**':` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `main`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/)
+- `script:` - this is how we’re telling the Bitbucket runner to install SQLFluff and lint the project.
+
+```yaml
+image: python:3.11.1
+
+
+pipelines:
+ branches:
+ '**': # this sets a wildcard to run on every branch
+ - step:
+ name: Lint dbt project
+ script:
+ - pip install sqlfluff==0.13.1
+ - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022
+
+ 'main': # override if your default branch doesn't run on a branch named "main"
+ - step:
+ script:
+ - python --version
+```
+
+
+
+
+### 2. Commit and push your changes to make sure everything works
+
+After you finish creating the YAML files, commit and push your code to trigger your pipeline for the first time. If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run.
+
+
+
+
+In your repository, click the _Actions_ tab
+
+![Image showing the GitHub action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-github.png)
+
+Sample output from SQLFluff in the `Run SQLFluff linter` job:
+
+![Image showing the logs in GitHub for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-github.png)
+
+
+
+
+In the menu, go to *CI/CD > Pipelines*
+
+![Image showing the GitLab action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-gitlab.png)
+
+Sample output from SQLFluff in the `Run SQLFluff linter` job:
+
+![Image showing the logs in GitLab for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-gitlab.png)
+
+
+
+
+In the left menu pane, click on *Pipelines*
+
+![Image showing the Bitbucket action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png)
+
+Sample output from SQLFluff in the `Run SQLFluff linter` job:
+
+![Image showing the logs in Bitbucket for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png)
+
+
+
+
+## Advanced: Create a release train with additional environments
+
+Large and complex enterprises sometimes require additional layers of validation before deployment. Learn how to add these checks with dbt Cloud.
+
+:::caution Are you sure you need this?
+This approach can increase release safety, but creates additional manual steps in the deployment process as well as a greater maintenance burden.
+
+As such, it may slow down the time it takes to get new features into production.
+
+The team at Sunrun maintained a SOX-compliant deployment in dbt while reducing the number of environments. Check out [their Coalesce presentation](https://www.youtube.com/watch?v=vmBAO2XN-fM) to learn more.
+:::
+
+In this section, we will add a new **QA** environment. New features will branch off from and be merged back into the associated `qa` branch, and a member of your team (the "Release Manager") will create a PR against `main` to be validated in the CI environment before going live.
+
+The git flow will look like this:
+
+
+### Advanced prerequisites
+
+- You have the **Development**, **CI**, and **Production** environments, as described in [the Baseline setup](/guides/set-up-ci).
+
+### 1. Create a `qa` branch in your git repo
+
+As noted above, this branch will outlive any individual feature, and will be the base of all feature development for a period of time. Your team might choose to create a new branch for each sprint (`qa/sprint-01`, `qa/sprint-02`, etc), tie it to a version of your data product (`qa/1.0`, `qa/1.1`), or just have a single `qa` branch which remains active indefinitely.
+
+### 2. Update your Development environment to use the `qa` branch
+
+See [Custom branch behavior](/docs/dbt-cloud-environments#custom-branch-behavior). Setting `qa` as your custom branch ensures that the IDE creates new branches and PRs with the correct target, instead of using `main`.
+
+
+
+### 3. Create a new QA environment
+
+See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **QA**. Just like your existing Production and CI environments, it will be a Deployment-type environment.
+
+Set its branch to `qa` as well.
+
+### 4. Create a new job
+
+Use the **Continuous Integration Job** template, and call the job **QA Check**.
+
+In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down:
+
+- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped.
+- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs.
+
+To be able to find modified nodes, dbt needs to have something to compare against. Normally, we use the Production environment as the source of truth, but in this case there will be new code merged into `qa` long before it hits the `main` branch and Production environment. Because of this, we'll want to defer the QA environment to itself.
+
+### Optional: also add a compile-only job
+
+dbt Cloud uses the last successful run of any job in that environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). If you have a lot of PRs in flight, the comparison state could switch around regularly.
+
+Adding a regularly scheduled job in the QA environment whose only command is `dbt compile` gives you a more stable manifest to compare against.
+
+### 5. Test your process
+
+When the Release Manager is ready to cut a new release, they will manually open a PR from `qa` into `main` from their git provider (e.g. GitHub, GitLab, Azure DevOps). dbt Cloud will detect the new PR, at which point the existing check in the CI environment will trigger and run. When using the [baseline configuration](/guides/set-up-ci), it's possible to kick off PR creation from inside the dbt Cloud IDE; in this setup, that button will create PRs targeting your `qa` branch instead.
+
+To test your new flow, create a new branch in the dbt Cloud IDE, then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft) against your `qa` branch. You'll see the integration tests begin to run. Once they complete, manually create a PR against `main`, and within a few seconds you'll see the tests run again, this time incorporating every change that hasn't yet been merged to `main`.
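+
+If your team prefers the command line to the git provider's UI, both PRs in this flow can also be opened with the GitHub CLI (the branch names and titles below are placeholders):
+
+```bash
+# Feature PR into the long-lived QA branch
+gh pr create --base qa --head feature/my-change --title "Add my change" --body "Feature work for the next release"
+
+# Release PR cut by the Release Manager from qa into main
+gh pr create --base main --head qa --title "Promote qa to production" --body "Release train"
+```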
diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md b/website/docs/guides/set-up-your-databricks-dbt-project.md
similarity index 81%
rename from website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md
rename to website/docs/guides/set-up-your-databricks-dbt-project.md
index b0be39a4273..c17c6a1f99e 100644
--- a/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md
+++ b/website/docs/guides/set-up-your-databricks-dbt-project.md
@@ -1,5 +1,18 @@
-# How to set up your Databricks and dbt project
-
+---
+title: Set up your dbt project with Databricks
+id: set-up-your-databricks-dbt-project
+description: "Learn more about setting up your dbt project with Databricks."
+displayText: Setting up your dbt project with Databricks
+hoverSnippet: Learn how to set up your dbt project with Databricks.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'databricks'
+hide_table_of_contents: true
+tags: ['Databricks', 'dbt Core','dbt Cloud']
+level: 'Intermediate'
+recently_updated: true
+---
+
+## Introduction
Databricks and dbt Labs are partnering to help data teams think like software engineering teams and ship trusted data, faster. The dbt-databricks adapter enables dbt users to leverage the latest Databricks features in their dbt project. Hundreds of customers are now using dbt and Databricks to build expressive and reliable data pipelines on the Lakehouse, generating data assets that enable analytics, ML, and AI use cases throughout the business.
@@ -7,7 +20,7 @@ In this guide, we discuss how to set up your dbt project on the Databricks Lakeh
## Configuring the Databricks Environments
-To get started, we will use Databricks’s Unity Catalog. Without it, we would not be able to design separate [environments](https://docs.getdbt.com/docs/collaborate/environments) for development and production per our [best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). It also allows us to ensure the proper access controls have been applied using SQL. You will need to be using the dbt-databricks adapter to use it (as opposed to the dbt-spark adapter).
+To get started, we will use Databricks’s Unity Catalog. Without it, we would not be able to design separate [environments](https://docs.getdbt.com/docs/collaborate/environments) for development and production per our [best practices](https://docs.getdbt.com/best-practices/how-we-structure/1-guide-overview). It also allows us to ensure the proper access controls have been applied using SQL. You will need to be using the dbt-databricks adapter to use it (as opposed to the dbt-spark adapter).
We will set up two different *catalogs* in Unity Catalog: **dev** and **prod**. A catalog is a top-level container for *schemas* (previously known as databases in Databricks), which in turn contain tables and views.
@@ -33,7 +46,7 @@ Service principals are used to remove humans from deploying to production for co
[Let’s create a service principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account) in Databricks:
1. Have your Databricks Account admin [add a service principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account) to your account. The service principal’s name should differentiate itself from a user ID and make its purpose clear (eg dbt_prod_sp).
-2. Add the service principal added to any groups it needs to be a member of at this time. There are more details on permissions in our ["Unity Catalog best practices" guide](dbt-unity-catalog-best-practices).
+2. Add the service principal to any groups it needs to be a member of at this time. There are more details on permissions in our ["Unity Catalog best practices" guide](/best-practices/dbt-unity-catalog-best-practices).
3. [Add the service principal to your workspace](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace) and apply any [necessary entitlements](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace-using-the-admin-console), such as Databricks SQL access and Workspace access.
## Setting up Databricks Compute
@@ -55,13 +68,13 @@ We are not covering python in this post but if you want to learn more, check out
Now that the Databricks components are in place, we can configure our dbt project. This involves connecting dbt to our Databricks SQL warehouse to run SQL queries and using a version control system like GitHub to store our transformation code.
-If you are migrating an existing dbt project from the dbt-spark adapter to dbt-databricks, follow this [migration guide](https://docs.getdbt.com/guides/migration/tools/migrating-from-spark-to-databricks#migration) to switch adapters without needing to update developer credentials and other existing configs.
+If you are migrating an existing dbt project from the dbt-spark adapter to dbt-databricks, follow this [migration guide](/guides/migrate-from-spark-to-databricks) to switch adapters without needing to update developer credentials and other existing configs.
-If you’re starting a new dbt project, follow the steps below. For a more detailed setup flow, check out our [quickstart guide.](/quickstarts/databricks)
+If you’re starting a new dbt project, follow the steps below. For a more detailed setup flow, check out our [quickstart guide.](/guides/databricks)
### Connect dbt to Databricks
-First, you’ll need to connect your dbt project to Databricks so it can send transformation instructions and build objects in Unity Catalog. Follow the instructions for [dbt Cloud](/quickstarts/databricks?step=4) or [Core](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to configure your project’s connection credentials.
+First, you’ll need to connect your dbt project to Databricks so it can send transformation instructions and build objects in Unity Catalog. Follow the instructions for [dbt Cloud](/guides/databricks?step=4) or [Core](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to configure your project’s connection credentials.
Each developer must generate their Databricks PAT and use the token in their development credentials. They will also specify a unique developer schema that will store the tables and views generated by dbt runs executed from their IDE. This provides isolated developer environments and ensures data access is fit for purpose.
@@ -80,11 +93,11 @@ For your development credentials/profiles.yml:
During your first invocation of `dbt run`, dbt will create the developer schema if it doesn't already exist in the dev catalog.
-### Defining your dbt deployment environment
+## Defining your dbt deployment environment
-Last, we need to give dbt a way to deploy code outside of development environments. To do so, we’ll use dbt [environments](https://docs.getdbt.com/docs/collaborate/environments) to define the production targets that end users will interact with.
+We need to give dbt a way to deploy code outside of development environments. To do so, we’ll use dbt [environments](https://docs.getdbt.com/docs/collaborate/environments) to define the production targets that end users will interact with.
-Core projects can use [targets in profiles](https://docs.getdbt.com/docs/core/connection-profiles#understanding-targets-in-profiles) to separate environments. [dbt Cloud environments](https://docs.getdbt.com/docs/cloud/develop-in-the-cloud#set-up-and-access-the-cloud-ide) allow you to define environments via the UI and [schedule jobs](/quickstarts/databricks#create-and-run-a-job) for specific environments.
+Core projects can use [targets in profiles](https://docs.getdbt.com/docs/core/connection-profiles#understanding-targets-in-profiles) to separate environments. [dbt Cloud environments](https://docs.getdbt.com/docs/cloud/develop-in-the-cloud#set-up-and-access-the-cloud-ide) allow you to define environments via the UI and [schedule jobs](/guides/databricks#create-and-run-a-job) for specific environments.
Let’s set up our deployment environment:
@@ -94,10 +107,10 @@ Let’s set up our deployment environment:
4. Set the schema to the default for your prod environment. This can be overridden by [custom schemas](https://docs.getdbt.com/docs/build/custom-schemas#what-is-a-custom-schema) if you need to use more than one.
5. Provide your Service Principal token.
-### Connect dbt to your git repository
+## Connect dbt to your git repository
-Next, you’ll need somewhere to store and version control your code that allows you to collaborate with teammates. Connect your dbt project to a git repository with [dbt Cloud](/quickstarts/databricks#set-up-a-dbt-cloud-managed-repository). [Core](/quickstarts/manual-install#create-a-repository) projects will use the git CLI.
+Next, you’ll need somewhere to store and version control your code that allows you to collaborate with teammates. Connect your dbt project to a git repository with [dbt Cloud](/guides/databricks#set-up-a-dbt-cloud-managed-repository). [Core](/guides/manual-install#create-a-repository) projects will use the git CLI.
-## Next steps
+### Next steps
-Now that your project is configured, you can start transforming your Databricks data with dbt. To help you scale efficiently, we recommend you follow our best practices, starting with the ["Unity Catalog best practices" guide](dbt-unity-catalog-best-practices).
+Now that your project is configured, you can start transforming your Databricks data with dbt. To help you scale efficiently, we recommend you follow our best practices, starting with [Unity Catalog best practices](/best-practices/dbt-unity-catalog-best-practices). From there, you can [optimize dbt models on Databricks](/guides/optimize-dbt-models-on-databricks).
diff --git a/website/docs/guides/sl-migration.md b/website/docs/guides/sl-migration.md
new file mode 100644
index 00000000000..0cfde742af2
--- /dev/null
+++ b/website/docs/guides/sl-migration.md
@@ -0,0 +1,135 @@
+---
+title: "Legacy dbt Semantic Layer migration guide"
+id: "sl-migration"
+description: "Learn how to migrate from the legacy dbt Semantic Layer to the latest one."
+hoverSnippet: Migrate from the legacy dbt Semantic Layer to the latest one.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Semantic Layer','Migration']
+level: 'Intermediate'
+recently_updated: true
+---
+
+## Introduction
+
+The legacy Semantic Layer will be deprecated in H2 2023. Additionally, the `dbt_metrics` package will not be supported in dbt v1.6 and later. If you are using `dbt_metrics`, you'll need to upgrade your configurations before upgrading to v1.6. This guide is for people who have the legacy dbt Semantic Layer setup and would like to migrate to the new dbt Semantic Layer. The estimated migration time is two weeks.
+
+
+## Migrate metric configs to the new spec
+
+The metrics specification in dbt Core changed in v1.6 to support the integration of MetricFlow. We strongly recommend that you read [Build your metrics](/docs/build/build-metrics-intro) before getting started so you understand the core concepts of the Semantic Layer.
+
+dbt Labs recommends completing these steps in a local dev environment (such as the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation)) instead of the dbt Cloud IDE:
+
+1. Create new Semantic Model configs as YAML files in your dbt project.*
+1. Upgrade the metrics configs in your project to the new spec.*
+1. Delete your old metrics files or remove their `.yml` file extension so they're ignored at parse time. Remove the `dbt-metrics` package from your project. Remove any macros that reference `dbt-metrics`, like `metrics.calculate()` (a quick way to scan for leftovers is sketched after this list). Make sure that any packages you’re using don't have references to the old metrics spec.
+1. Install the CLI with `pip install "dbt-metricflow[your_adapter_name]"`. For example:
+
+ ```bash
+ pip install "dbt-metricflow[snowflake]"
+ ```
+ **Note** - The MetricFlow CLI is not available in the IDE at this time. Support is coming soon.
+
+1. Run `dbt parse`. This parses your project and creates a `semantic_manifest.json` file in your target directory. MetricFlow needs this file to query metrics. If you make changes to your configs, you will need to parse your project again.
+1. Run `mf list metrics` to view the metrics in your project.
+1. Test querying a metric by running `mf query --metrics <metric_name> --group-by <dimension_name>`. For example:
+ ```bash
+ mf query --metrics revenue --group-by metric_time
+ ```
+1. Run `mf validate-configs` to run semantic and warehouse validations. This ensures your configs are valid and the underlying objects exist in your warehouse.
+1. Push these changes to a new branch in your repo.
+
+**To make this process easier, dbt Labs provides a [custom migration tool](https://github.com/dbt-labs/dbt-converter) that automates these steps for you. You can find installation instructions in the [README](https://github.com/dbt-labs/dbt-converter/blob/master/README.md). Derived metrics aren’t supported in the migration tool, and will have to be migrated manually.*
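+
+As a quick sanity check while removing the old package (step 3 above), you can scan your project for leftover references to the `dbt_metrics` macros. This is only a sketch; adjust the paths and macro names for your project:
+
+```bash
+# Look for lingering calls to the deprecated dbt_metrics macros
+grep -rnE "metrics\.(calculate|develop)" models/
+```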
+
+## Audit metric values after the migration
+
+You might need to audit metric values during the migration to ensure that the historical values of key business metrics are the same.
+
+1. In the CLI, query the metric(s) and dimensions you want to test and include the `--explain` option. For example:
+ ```bash
+ mf query --metrics orders,revenue --group-by metric_time__month,customer_type --explain
+ ```
+1. Use the SQL generated by MetricFlow to create a temporary model in your project, such as `tmp_orders_revenue_audit.sql`. You will use this temporary model to compare against your legacy metrics.
+1. If you haven’t already done so, create a model using `metrics.calculate()` for the metrics you want to compare against. For example:
+
+    ```sql
+    select *
+    from {{ metrics.calculate(
+        [metric('orders'),
+         metric('revenue')],
+        grain='week',
+        dimensions=['metric_time', 'customer_type'],
+    ) }}
+    ```
+
+1. Run the [audit_helper](https://github.com/dbt-labs/dbt-audit-helper) package on both models to compare the metric values.
+
+## Set up the Semantic Layer in a new environment
+
+This step is only relevant to users who want the legacy and new Semantic Layers to run in parallel for a short time. This will let you recreate content in downstream tools like Hex and Mode with minimal downtime. If you don't need to recreate assets in these tools, skip to step 5.
+
+1. Create a new deployment environment in dbt Cloud and set the dbt version to 1.6 or higher.
+
+2. Select **Only run on a custom branch** and point to the branch that has the updated metric definition.
+
+3. Set the deployment schema to a temporary migration schema, such as `tmp_sl_migration`. Optionally, you can create a new database for the migration.
+
+4. Create a job to parse your project, such as `dbt parse`, and run it. Make sure this job succeeds. There needs to be a successful job in your environment in order to set up the semantic layer.
+
+5. Select **Account Settings** -> **Projects** -> **Project details** and choose **Configure the Semantic Layer**.
+
+6. Under **Environment**, select the deployment environment you created in the previous step. Save your configuration.
+
+7. In the **Project details** page, click **Generate service token** and grant it **Semantic Layer Only** and **Metadata Only** permissions. Save this token securely. You will need it to connect to the semantic layer.
+
+
+At this point, both the new semantic layer and the old semantic layer will be running. The new semantic layer will be pointing at your migration branch with the updated metrics definitions.
+
+## Update connection in downstream integrations
+
+Now that your Semantic Layer is set up, you will need to update any downstream integrations that used the legacy Semantic Layer.
+
+### Migration guide for Hex
+
+To learn more about integrating with Hex, check out their [documentation](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration#dbt-semantic-layer-integration). Additionally, refer to [dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex.
+
+1. Set up a new connection for the Semantic Layer for your account. Note that your old connection will still work. The following Loom video guides you in setting up your Semantic Layer with Hex:
+
+
+
+2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer.
+
+3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API.
+
+ * **Note** — You will need to update your connection to your production environment once you merge your changes to `main`. Currently, this connection will be pointing at the Semantic Layer migration environment.
+
+### Migration guide for Mode
+
+1. Set up a new connection for the Semantic Layer for your account. Follow [Mode's docs to set up your connection](https://mode.com/help/articles/supported-databases/#dbt-semantic-layer).
+
+2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer.
+
+3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API.
+
+## Merge your metrics migration branch to `main` and upgrade your production environment to 1.6
+
+1. Upgrade your production environment to 1.6 or higher.
+ * **Note** — The old metrics definitions are no longer valid, so your dbt jobs will not pass.
+
+2. Merge your updated metrics definitions to main. **At this point the legacy semantic layer will no longer work.**
+
+If you created a new environment in [Set up the Semantic Layer in a new environment](#set-up-the-semantic-layer-in-a-new-environment):
+
+3. Update your environment in **Account Settings** -> **Project Details** -> **Edit Semantic Layer Configuration** to point to your production environment.
+
+4. Delete your migration environment. Be sure to update your connection details in any downstream tools to account for the environment change.
+
+### Related docs
+
+- [MetricFlow quickstart guide](/docs/build/sl-getting-started)
+- [Example dbt project](https://github.com/dbt-labs/jaffle-sl-template)
+- [dbt metrics converter](https://github.com/dbt-labs/dbt-converter)
+- [Why we're deprecating the dbt_metrics package](/blog/deprecating-dbt-metrics) blog post
+- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata)
diff --git a/website/docs/guides/sl-partner-integration-guide.md b/website/docs/guides/sl-partner-integration-guide.md
new file mode 100644
index 00000000000..04f58f525bd
--- /dev/null
+++ b/website/docs/guides/sl-partner-integration-guide.md
@@ -0,0 +1,165 @@
+---
+title: "Integrate with dbt Semantic Layer using best practices"
+id: "sl-partner-integration-guide"
+description: Learn about partner integration guidelines, roadmap, and connectivity.
+hoverSnippet: Learn how to integrate with the Semantic Layer using best practices
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Semantic Layer','Best practices']
+level: 'Advanced'
+recently_updated: true
+---
+
+## Introduction
+
+To fit your tool within the world of the Semantic Layer, dbt Labs offers some best practice recommendations for how to expose metrics and allow users to interact with them seamlessly.
+
+:::note
+This is an evolving guide that is meant to provide recommendations based on our experience. If you have any feedback, we'd love to hear it!
+:::
+
+
+### Prerequisites
+
+To build a dbt Semantic Layer integration:
+
+- We offer a [JDBC](/docs/dbt-cloud-apis/sl-jdbc) API and [GraphQL API](/docs/dbt-cloud-apis/sl-graphql). Refer to the dedicated [dbt Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) for more technical integration details.
+
+- Familiarize yourself with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and [MetricFlow](/docs/build/about-metricflow)'s key concepts. There are two main objects:
+
+ - [Semantic models](/docs/build/semantic-models) — Nodes in your semantic graph, connected via entities as edges. MetricFlow takes semantic models defined in YAML configuration files as inputs and creates a semantic graph that you can use to query metrics.
+ - [Metrics](/docs/build/metrics-overview) — Can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo).
+
+### Connection parameters
+
+The dbt Semantic Layer APIs authenticate with `environmentId`, `SERVICE_TOKEN`, and `host`.
+
+We recommend you provide users with separate input fields for each of these authentication components (dbt Cloud will surface these parameters for the user).
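+
+For illustration only, an integration's connection form might collect the three values as separate fields and pass them to whichever API client it uses. Every name and value below is a placeholder:
+
+```bash
+# Placeholder connection inputs, collected as three separate fields
+export DBT_SL_HOST="<semantic-layer-host>"        # varies by dbt Cloud region and plan
+export DBT_SL_ENVIRONMENT_ID="<environment-id>"   # dbt Cloud environment ID
+export DBT_SL_SERVICE_TOKEN="<service-token>"     # dbt Cloud service token
+```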
+
+### Exposing metadata to dbt Labs
+
+When building an integration, we recommend you expose certain metadata in the request for analytics purposes. Among other items, it is helpful to have the following:
+
+- Your application's name (such as 'Tableau')
+- The email of the person querying your application
+- The version of dbt they are on.
+
+
+## Use best practices when exposing metrics
+
+Best practices for exposing metrics are summarized into five themes:
+
+- [Governance](#governance-and-traceability) — Recommendations on how to establish guardrails for governed data work.
+- [Discoverability](#discoverability) — Recommendations on how to make user-friendly data interactions.
+- [Organization](#organization) — Organize metrics and dimensions for all audiences.
+- [Query flexibility](#query-flexibility) — Allow users to query either one metric alone without dimensions or multiple metrics with dimensions.
+- [Context and interpretation](#context-and-interpretation) — Contextualize metrics for better analysis; expose definitions, metadata, lineage, and freshness.
+
+### Governance and traceability
+
+When working with more governed data, it's essential to establish clear guardrails. Here are some recommendations:
+
+- **Aggregations control** — Users shouldn't generally be allowed to modify aggregations unless they perform post-processing calculations on Semantic Layer data (such as year-over-year analysis).
+
+- **Time series alignment and using metric_time** — Make sure users view metrics across the correct time series. When displaying metric graphs, using a non-default time aggregation dimension might lead to misleading interpretations. While users can still group by other time dimensions, they should be careful not to create trend lines with incorrect time axes.
When looking at one or multiple metrics, users should use `metric_time` as the main time dimension to guarantee they are looking at the right time series for the metric(s).
As such, when building an application, we recommend exposing `metric_time` as a separate, "special" time dimension on its own. This dimension is always going to align with all metrics and be common across them. Other time dimensions can still be looked at and grouped by, but having a clear delineation between the `metric_time` dimension and the other time dimensions is clarifying so that people do not confuse how metrics should be plotted.
Also, when a user requests a time granularity change for the main time series, the query that your application runs should use `metric_time` as this will always give you the correct slice. Related to this, we also strongly recommend that you have a way to expose what dimension `metric_time` actually maps to for users who may not be familiar. Our APIs allow you to fetch the actual underlying time dimensions that makeup metric_time (such as `transaction_date`) so you can expose them to your users.
+
+- **Units consistency** — If units are supported, it's vital to avoid plotting data incorrectly with different units. Ensuring consistency in unit representation will prevent confusion and misinterpretation of the data.
+
+- **Traceability of metric and dimension changes** — When users change names of metrics and dimensions for reports, it's crucial to have a traceability mechanism in place to link back to the original source metric name.
+
+
+### Discoverability
+
+- Consider treating [metrics](/docs/build/metrics-overview) as first-class objects rather than measures. Metrics offer a higher-level and more contextual way to interact with data, reducing the burden on end-users to manually aggregate data.
+
+- Easy metric interactions: Provide users with an intuitive approach to:
+ * Search for Metrics — Users should be able to easily search and find relevant metrics. Metrics can serve as the starting point to lead users into exploring dimensions.
+ * Search for Dimensions — Users should be able to query metrics with associated dimensions, allowing them to gain deeper insights into the data.
+ * Filter by Dimension Values — Expose and enable users to filter metrics based on dimension values, encouraging data analysis and exploration.
+ * Filter additional metadata — Allow users to filter metrics based on other available metadata, such as metric type and default time granularity.
+
+- Suggested Metrics: Ideally, the system should intelligently suggest relevant metrics to users based on their team's activities. This approach encourages user exposure, facilitates learning, and supports collaboration among team members.
+
+By implementing these recommendations, the data interaction process becomes more user-friendly, empowering users to gain valuable insights without the need for extensive data manipulation.
+
+### Organization
+
+We recommend organizing metrics and dimensions in ways that a non-technical user can understand the data model, without needing much context:
+
+- **Organizing Dimensions** — To help non-technical users understand the data model better, we recommend organizing dimensions based on the entity they originated from. For example, consider dimensions like `user__country` and `product__category`.
You can create groups by extracting `user` and `product` and then nest the respective dimensions under each group. This way, dimensions align with the entity or semantic model they belong to and make them more user-friendly and accessible.
+
+- **Organizing Metrics** — The goal is to organize metrics into a hierarchy in our configurations, instead of presenting them in a long list.
This hierarchy helps you organize metrics based on specific criteria, such as business unit or team. By providing this structured organization, users can find and navigate metrics more efficiently, enhancing their overall data analysis experience.
+
+### Query flexibility
+
+Allow users to query either one metric alone without dimensions or multiple metrics with dimensions.
+
+- Allow toggling between metrics/dimensions seamlessly.
+
+- Be clear on exposing what dimensions are queryable with what metrics and hide things that don’t apply. (Our APIs provide calls for you to get relevant dimensions for metrics, and vice versa).
+
+- Only expose time granularities (monthly, daily, yearly) that match the available metrics.
+ * For example, if a dbt model and its resulting semantic model have a monthly granularity, make sure querying data with a 'daily' granularity isn't available to the user. Our APIs have functionality that will help you surface the correct granularities.
+
+- We recommend that time granularity is treated as a general time dimension-specific concept and that it can be applied to more than just the primary aggregation (or `metric_time`). Consider a situation where a user wants to look at `sales` over time by `customer signup month`; in this situation, having the ability to apply granularities to both time dimensions is crucial. Our APIs include information to fetch the granularities for the primary (metric_time) dimensions, as well as all time dimensions. You can treat each time dimension and granularity selection independently in your application. Note: Initially, as a starting point, it makes sense to only support `metric_time` or the primary time dimension, but we recommend expanding that as your solution evolves.
+
+- You should allow users to filter on date ranges and expose a calendar and nice presets for filtering these.
+ * For example, last 30 days, last week, and so on.
+
+### Context and interpretation
+
+For better analysis, it's best to have the context of the metrics close to where the analysis is happening. We recommend the following:
+
+- Expose business definitions of the metrics as well as logical definitions.
+
+- Expose additional metadata from the Semantic layer (measures, type parameters).
+
+- Use the [Discovery API](/docs/dbt-cloud-apis/discovery-api) to enhance the metric and build confidence in its accuracy:
+ * Check if the metric is fresh and when it was last updated.
+ * Include lineage information to understand the metric's origin.
+
+- Allow for creating other metadata that’s useful for the metric. We can provide some of this information in our configuration (Display name, Default Granularity for View, Default Time range), but there may be other metadata that your tool wants to provide to make the metric richer.
+
+### Transparency and using compile
+
+For transparency and additional context, we recommend you have an easy way for the user to obtain the SQL that MetricFlow generates. Depending on what API you are using, you can do this by using our `compile` parameter. This is incredibly powerful and emphasizes transparency and openness, particularly for technically inclined users.
+
+
+### Where filters and optimization
+
+In the cases where our APIs support either a string or a filter list for the `where` clause, we always recommend that your application utilizes the filter list in order to gain maximum pushdown benefits. The `where` string may be more intuitive for users writing queries during testing, but it will not have the performance benefits of the filter list in a production environment.
+
+## Understand stages of an integration
+
+These are recommendations on how to evolve a Semantic Layer integration and not a strict runbook.
+
+**Stage 1 - The basics**
+* Supporting and using [JDBC](/docs/dbt-cloud-apis/sl-jdbc) or [GraphQL](/docs/dbt-cloud-apis/sl-graphql) is the first step. Refer to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) for more technical details.
+
+**Stage 2 - More discoverability and basic querying**
+* Support listing metrics defined in the project
+* Listing available dimensions based on one or many metrics
+* Querying defined metric values on their own or grouping by available dimensions
+* Display metadata from [Discovery API](/docs/dbt-cloud-apis/discovery-api) and other context
+
+**Stage 3 - More querying flexibility and better user experience (UX)**
+* More advanced filtering
+ * Time filters with good presets/calendar UX
+ * Filtering metrics on a pre-populated set of dimension values
+* Make dimension values more user-friendly by organizing them effectively
+* Intelligent filtering of metrics based on available dimensions and vice versa
+
+**Stage 4 - More custom user interface (UI) / Collaboration**
+* A place where users can see all the relevant information about a given metric
+* Organize metrics by hierarchy and more advanced search features (such as filter on the type of metric or other metadata)
+* Use and expose more metadata
+* Querying dimensions without metrics and other more advanced querying functionality
+* Suggest metrics to users based on teams/identity, and so on.
+
+
+### Related docs
+
+- [Use the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) to learn about the product.
+- [Build your metrics](/docs/build/build-metrics-intro) for more info about MetricFlow and its components.
+- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations.
diff --git a/website/docs/quickstarts/snowflake-qs.md b/website/docs/guides/snowflake-qs.md
similarity index 98%
rename from website/docs/quickstarts/snowflake-qs.md
rename to website/docs/guides/snowflake-qs.md
index 6d03586e611..abb18276b97 100644
--- a/website/docs/quickstarts/snowflake-qs.md
+++ b/website/docs/guides/snowflake-qs.md
@@ -1,8 +1,9 @@
---
title: "Quickstart for dbt Cloud and Snowflake"
id: "snowflake"
-platform: 'dbt-cloud'
+level: 'Beginner'
icon: 'snowflake'
+tags: ['dbt Cloud','Quickstart','Snowflake']
hide_table_of_contents: true
---
## Introduction
@@ -35,8 +36,8 @@ You can also watch the [YouTube video on dbt and Snowflake](https://www.youtube.
- Learn more with [dbt Courses](https://courses.getdbt.com/collections)
- [How we configure Snowflake](https://blog.getdbt.com/how-we-configure-snowflake/)
-- [dbt Cloud CI job](/docs/deploy/continuous-integration)
-- [Job triggers](/docs/deploy/job-triggers)
+- [CI jobs](/docs/deploy/continuous-integration)
+- [Deploy jobs](/docs/deploy/deploy-jobs)
- [Job notifications](/docs/deploy/job-notifications)
- [Source freshness](/docs/deploy/source-freshness)
@@ -138,7 +139,7 @@ There are two ways to connect dbt Cloud to Snowflake. The first option is Partne
-Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials.
+Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials.
1. In the Snowflake UI, click on the home icon in the upper left corner. In the left sidebar, select **Admin**. Then, select **Partner Connect**. Find the dbt tile by scrolling or by searching for dbt in the search bar. Click the tile to connect to dbt.
diff --git a/website/docs/quickstarts/starburst-galaxy-qs.md b/website/docs/guides/starburst-galaxy-qs.md
similarity index 99%
rename from website/docs/quickstarts/starburst-galaxy-qs.md
rename to website/docs/guides/starburst-galaxy-qs.md
index 33228710509..1822c83fa90 100644
--- a/website/docs/quickstarts/starburst-galaxy-qs.md
+++ b/website/docs/guides/starburst-galaxy-qs.md
@@ -1,9 +1,10 @@
---
title: "Quickstart for dbt Cloud and Starburst Galaxy"
id: "starburst-galaxy"
-platform: 'dbt-cloud'
+level: 'Beginner'
icon: 'starburst'
hide_table_of_contents: true
+tags: ['dbt Cloud','Quickstart']
---
## Introduction
diff --git a/website/docs/guides/advanced/using-jinja.md b/website/docs/guides/using-jinja.md
similarity index 95%
rename from website/docs/guides/advanced/using-jinja.md
rename to website/docs/guides/using-jinja.md
index 40cfd2af298..9f098bb637f 100644
--- a/website/docs/guides/advanced/using-jinja.md
+++ b/website/docs/guides/using-jinja.md
@@ -1,15 +1,25 @@
---
-title: "Using Jinja"
+title: "Use Jinja to improve your SQL code"
id: "using-jinja"
+description: "Learn how to improve your SQL code using Jinja."
+hoverSnippet: "Improve your SQL code with Jinja"
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Jinja', 'dbt Core']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
+
In this guide, we're going to take a common pattern used in SQL, and then use Jinja to improve our code.
If you'd like to work through this query, add [this CSV](https://github.com/dbt-labs/jaffle_shop/blob/core-v1.0.0/seeds/raw_payments.csv) to the `seeds/` folder of your dbt project, and then execute `dbt seed`.
While working through the steps of this model, we recommend that you have your compiled SQL open as well, to check what your Jinja compiles to. To do this:
* **Using dbt Cloud:** Click the compile button to see the compiled SQL in the right hand pane
-* **Using the dbt CLI:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once.
+* **Using dbt Core:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once (see the example below).
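+
+For example, a minimal dbt Core workflow might look like this (the project and model names are placeholders for your own):
+
+```bash
+# Compile the project, then inspect what the Jinja in a model renders to
+dbt compile
+cat target/compiled/jaffle_shop/models/order_payment_method_amounts.sql
+```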
## Write the SQL without Jinja
Consider a data model in which an `order` can have many `payments`. Each `payment` may have a `payment_method` of `bank_transfer`, `credit_card` or `gift_card`, and therefore each `order` can have multiple `payment_methods`
diff --git a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md b/website/docs/guides/zapier-ms-teams.md
similarity index 90%
rename from website/docs/guides/orchestration/webhooks/zapier-ms-teams.md
rename to website/docs/guides/zapier-ms-teams.md
index aa95b999d4c..66596d590e0 100644
--- a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md
+++ b/website/docs/guides/zapier-ms-teams.md
@@ -1,11 +1,18 @@
---
title: "Post to Microsoft Teams when a job finishes"
-id: webhooks-guide-zapier-ms-teams
-slug: zapier-ms-teams
-description: Use Zapier and the dbt Cloud API to post to Microsoft Teams
+id: zapier-ms-teams
+description: Use Zapier and dbt Cloud webhooks to post to Microsoft Teams when a job finishes running.
+hoverSnippet: Learn how to use Zapier with dbt Cloud webhooks to post in Microsoft Teams when a job finishes running.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
-This guide will show you how to set up an integration between dbt Cloud jobs and Microsoft Teams using [dbt Cloud Webhooks](/docs/deploy/webhooks) and Zapier, similar to the [native Slack integration](/faqs/accounts/slack).
+This guide will show you how to set up an integration between dbt Cloud jobs and Microsoft Teams using [dbt Cloud Webhooks](/docs/deploy/webhooks) and Zapier, similar to the [native Slack integration](/docs/deploy/job-notifications#slack-notifications).
When a dbt Cloud job finishes running, the integration will:
@@ -14,26 +21,28 @@ When a dbt Cloud job finishes running, the integration will:
- Post a summary to a Microsoft Teams channel.
![Screenshot of a message in MS Teams showing a summary of a dbt Cloud run which failed](/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-ui.png)
-## Prerequisites
+
+### Prerequisites
In order to set up the integration, you should have familiarity with:
- [dbt Cloud Webhooks](/docs/deploy/webhooks)
- Zapier
-## Integration steps
-### 1. Set up the connection between Zapier and Microsoft Teams
+
+## Set up the connection between Zapier and Microsoft Teams
* Install the [Zapier app in Microsoft Teams](https://appsource.microsoft.com/en-us/product/office/WA200002044) and [grant Zapier access to your account](https://zapier.com/blog/how-to-automate-microsoft-teams/).
**Note**: To receive the message, add the Zapier app to the team's channel during installation.
-### 2. Create a new Zap in Zapier
-Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
+## Create a new Zap in Zapier
+Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
Press **Continue**, then copy the webhook URL.
![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png)
### 3. Configure a new webhook in dbt Cloud
+
See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Choose either **Run completed** or **Run errored**, but not both, or you'll get double messages when a run fails.
Make note of the Webhook Secret Key for later.
@@ -42,14 +51,15 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test
The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development.
-### 4. Store secrets
+## Store secrets
+
In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens).
Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient).
-### 5. Add a code action
+## Add a code action
Select **Code by Zapier** as the App, and **Run Python** as the Event.
In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above.
@@ -141,19 +151,21 @@ for step in run_data_results['run_steps']:
output = {'outcome_message': outcome_message}
```
-### 6. Add the Microsoft Teams action
+## Add the Microsoft Teams action
+
Select **Microsoft Teams** as the App, and **Send Channel Message** as the Action.
In the **Set up action** area, choose the team and channel. Set the **Message Text Format** to **markdown**, then put **2. Outcome Message** from the Run Python in Code by Zapier output into the **Message Text** field.
![Screenshot of the Zapier UI, showing the mappings of prior steps to an MS Teams message](/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-zap-config.png)
-### 7. Test and deploy
+## Test and deploy
+
As you have gone through each step, you should have tested the outputs, so you can now try posting a message into your Teams channel.
When you're happy with it, remember to ensure that your `run_id` and `account_id` are no longer hardcoded, then publish your Zap.
-## Other notes
+### Other notes
- If you post to a chat instead of a team channel, you don't need to add the Zapier app to Microsoft Teams.
- If you post to a chat instead of a team channel, note that markdown is not supported and you will need to remove the markdown formatting.
- If you chose the **Catch Hook** trigger instead of **Catch Raw Hook**, you will need to pass each required property from the webhook as an input instead of running `json.loads()` against the raw body. You will also need to remove the validation code.
diff --git a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md b/website/docs/guides/zapier-new-cloud-job.md
similarity index 87%
rename from website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md
rename to website/docs/guides/zapier-new-cloud-job.md
index 49b01d0db7e..b16fa94bc21 100644
--- a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md
+++ b/website/docs/guides/zapier-new-cloud-job.md
@@ -1,28 +1,34 @@
---
title: "Trigger a dbt Cloud job after a run finishes"
-id: webhooks-guide-zapier-new-cloud-job
-slug: zapier-new-cloud-job
-description: Use Zapier to interact with the dbt Cloud API
+id: zapier-new-cloud-job
+description: Use Zapier to trigger a dbt Cloud job once a run completes.
+hoverSnippet: Learn how to use Zapier to trigger a dbt Cloud job once a run completes.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
+
+This guide will show you how to trigger a dbt Cloud job based on the successful completion of a different job. This can be useful when you need to trigger a job in a different project. Remember that dbt works best when it understands the whole context of the DAG it has been asked to run, so use this ability judiciously.
-## Prerequisites
+### Prerequisites
In order to set up the integration, you should have familiarity with:
- [dbt Cloud Webhooks](/docs/deploy/webhooks)
- Zapier
-## Integration steps
-
-### 1. Create a new Zap in Zapier
-Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
+## Create a new Zap in Zapier
+Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
Press **Continue**, then copy the webhook URL.
![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png)
-### 2. Configure a new webhook in dbt Cloud
+## Configure a new webhook in dbt Cloud
See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**, and you need to change the **Jobs** list to only contain the job you want to trigger the next run.
Make note of the Webhook Secret Key for later.
@@ -31,14 +37,14 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test
The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development.
-### 3. Store secrets
+## Store secrets
In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens).
Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient).
-### 4. Add a code action
+## Add a code action
Select **Code by Zapier** as the App, and **Run Python** as the Event.
In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above.
@@ -87,5 +93,6 @@ if hook_data['runStatus'] == "Success":
return
```
-### 5. Test and deploy
+## Test and deploy
+
When you're happy with it, remember to ensure that your `account_id` is no longer hardcoded, then publish your Zap.
diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md b/website/docs/guides/zapier-refresh-mode-report.md
similarity index 89%
rename from website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md
rename to website/docs/guides/zapier-refresh-mode-report.md
index 99680c432b3..5bab165b11d 100644
--- a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md
+++ b/website/docs/guides/zapier-refresh-mode-report.md
@@ -1,10 +1,18 @@
---
title: "Refresh a Mode dashboard when a job completes"
-id: webhooks-guide-zapier-refresh-mode-report
-slug: zapier-refresh-mode-report
-description: Use Zapier to trigger a Mode dashboard refresh
+id: zapier-refresh-mode-report
+description: Use Zapier to trigger a Mode dashboard refresh when a dbt Cloud job completes.
+hoverSnippet: Learn how to use Zapier to trigger a Mode dashboard refresh when a dbt Cloud job completes.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
+
This guide will teach you how to refresh a Mode dashboard when a dbt Cloud job has completed successfully and there is fresh data available. The integration will:
- Receive a webhook notification in Zapier
@@ -12,23 +20,21 @@ This guide will teach you how to refresh a Mode dashboard when a dbt Cloud job h
Although we are using the Mode API for a concrete example, the principles are readily transferrable to your [tool](https://learn.hex.tech/docs/develop-logic/hex-api/api-reference#operation/RunProject) [of](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/refresh-dataset) [choice](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref.htm#update_workbook_now).
-## Prerequisites
+### Prerequisites
In order to set up the integration, you should have familiarity with:
- [dbt Cloud Webhooks](/docs/deploy/webhooks)
- Zapier
- The [Mode API](https://mode.com/developer/api-reference/introduction/)
-## Integration steps
-
-### 1. Create a new Zap in Zapier
-Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
+## Create a new Zap in Zapier
+Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
Press **Continue**, then copy the webhook URL.
![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png)
-### 2. Configure a new webhook in dbt Cloud
+## Configure a new webhook in dbt Cloud
See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**, and you need to change the **Jobs** list to only contain any jobs whose completion should trigger a report refresh.
Make note of the Webhook Secret Key for later.
@@ -37,20 +43,19 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test
The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development.
-### 3. Store secrets
+## Store secrets
In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens), as well as a [Mode API token and secret](https://mode.com/developer/api-reference/authentication/).
Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient).
-
This guide assumes the names for the secret keys are: `DBT_WEBHOOK_KEY`, `MODE_API_TOKEN`, and `MODE_API_SECRET`. If you are using different names, make sure you update all references to them in the sample code.
This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3).
-#### a. Create a Storage by Zapier connection
+### a. Create a Storage by Zapier connection
If you haven't already got one, go to https://zapier.com/app/connections/storage and create a new connection. Remember the UUID secret you generate for later.
-#### b. Add a temporary code step
+### b. Add a temporary code step
Choose **Run Python** as the Event. Run the following code:
```python
store = StoreClient('abc123') #replace with your UUID secret
@@ -60,7 +65,7 @@ store.set('MODE_API_SECRET', 'abc123') #replace with your Mode API Secret
```
Test the step. You can delete this Action when the test succeeds. The key will remain stored as long as it is accessed at least once every three months.
-### 4. Add a code action
+## Add a code action
Select **Code by Zapier** as the App, and **Run Python** as the Event.
In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above.
@@ -124,5 +129,5 @@ if hook_data['runStatus'] == "Success":
return
```
-### 5. Test and deploy
-You can iterate on the Code step by modifying the code and then running the test again. When you're happy with it, you can publish your Zap.
\ No newline at end of file
+## Test and deploy
+You can iterate on the Code step by modifying the code and then running the test again. When you're happy with it, you can publish your Zap.
diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md b/website/docs/guides/zapier-refresh-tableau-workbook.md
similarity index 90%
rename from website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md
rename to website/docs/guides/zapier-refresh-tableau-workbook.md
index 8751528565c..f614b64eaa2 100644
--- a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md
+++ b/website/docs/guides/zapier-refresh-tableau-workbook.md
@@ -1,16 +1,24 @@
---
title: "Refresh Tableau workbook with extracts after a job finishes"
-id: webhooks-guide-zapier-refresh-tableau-workbook
-slug: zapier-refresh-tableau-workbook
-description: Use Zapier to trigger a Tableau workbook refresh
+id: zapier-refresh-tableau-workbook
+description: Use Zapier to trigger a Tableau workbook refresh once a dbt Cloud job completes successfully.
+hoverSnippet: Learn how to use Zapier to trigger a Tableau workbook refresh once a dbt Cloud job completes successfully.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
+## Introduction
+
This guide will teach you how to refresh a Tableau workbook that leverages [extracts](https://help.tableau.com/current/pro/desktop/en-us/extracting_data.htm) when a dbt Cloud job has completed successfully and there is fresh data available. The integration will:
- Receive a webhook notification in Zapier
- Trigger a refresh of a Tableau workbook
-## Prerequisites
+### Prerequisites
To set up the integration, you need to be familiar with:
@@ -19,19 +27,18 @@ To set up the integration, you need to be familiar with:
- The [Tableau API](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api.htm)
- The [version](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_concepts_versions.htm#rest_api_versioning) of Tableau's REST API that is compatible with your server
-## Integration steps
-### 1. Obtain authentication credentials from Tableau
+## Obtain authentication credentials from Tableau
To authenticate with the Tableau API, obtain a [Personal Access Token](https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm) from your Tableau Server/Cloud instance. In addition, make sure your Tableau workbook uses data sources that allow refresh access, which is usually set when publishing.
-### 2. Create a new Zap in Zapier
-To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead.
+## Create a new Zap in Zapier
+To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead.
Press **Continue**, then copy the webhook URL.
![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png)
-### 3. Configure a new webhook in dbt Cloud
+## Configure a new webhook in dbt Cloud
To set up a webhook subscription for dbt Cloud, follow the instructions in [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription). For the event, choose **Run completed** and modify the **Jobs** list to include only the jobs that should trigger a report refresh.
Remember to save the Webhook Secret Key for later. Paste in the webhook URL obtained from Zapier in step 2 into the **Endpoint** field and test the endpoint.
@@ -40,7 +47,7 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test
The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development.
-### 4. Store secrets
+## Store secrets
In the next step, you will need the Webhook Secret Key from the prior step, and your Tableau authentication credentials and details. Specifically, you'll need your Tableau server/site URL, server/site name, PAT name, and PAT secret.
Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient).
@@ -49,11 +56,11 @@ This guide assumes the names for the secret keys are: `DBT_WEBHOOK_KEY`, `TABLEA
This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3).
-#### a. Create a Storage by Zapier connection
+### a. Create a Storage by Zapier connection
Create a new connection at https://zapier.com/app/connections/storage if you don't already have one and remember the UUID secret you generate for later.
-#### b. Add a temporary code step
+### b. Add a temporary code step
Choose **Run Python** as the Event and input the following code:
@@ -68,7 +75,7 @@ store.set('TABLEAU_API_TOKEN_SECRET', 'abc123') #replace with your Tableau API S
Test the step to run the code. You can delete this action when the test succeeds. The keys will remain stored as long as they are accessed at least once every three months.
-### 5. Add a code action
+## Add a code action
Select **Code by Zapier** as the App, and **Run Python** as the Event.
In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above.
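+
+After validating the webhook (the same HMAC check used in the other guides in this section), the core of this action is signing in to the Tableau REST API with your PAT and then triggering the refresh. The snippet below is a simplified sketch, not the full code: the secret names and the API version are assumptions, so adjust them to match what you stored in the previous step and the version your server supports.
+
+```python
+import json
+import requests
+
+secret_store = StoreClient('abc123')  # replace with your UUID secret
+server_url = secret_store.get('TABLEAU_SITE_URL')          # for example, https://10ax.online.tableau.com
+site_name = secret_store.get('TABLEAU_SITE_NAME')          # the site's content URL
+pat_name = secret_store.get('TABLEAU_API_TOKEN_NAME')
+pat_secret = secret_store.get('TABLEAU_API_TOKEN_SECRET')
+api_version = '3.19'  # assumption: use a version compatible with your server
+
+# Sign in with the Personal Access Token to obtain a session auth token
+signin_url = f"{server_url}/api/{api_version}/auth/signin"
+signin_body = {
+    "credentials": {
+        "personalAccessTokenName": pat_name,
+        "personalAccessTokenSecret": pat_secret,
+        "site": {"contentUrl": site_name},
+    }
+}
+headers = {"Content-Type": "application/json", "Accept": "application/json"}
+signin_response = requests.post(signin_url, data=json.dumps(signin_body), headers=headers)
+auth_token = signin_response.json()["credentials"]["token"]
+
+# Subsequent calls, such as the workbook refresh request, pass the token in this header
+headers["X-Tableau-Auth"] = auth_token
+```
+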
@@ -161,5 +168,5 @@ refresh_trigger = requests.post(refresh_url, data=json.dumps(refresh_data), head
return {"message": "Workbook refresh has been queued"}
```
-### 6. Test and deploy
+## Test and deploy
To make changes to your code, you can modify it and test it again. When you're happy with it, you can publish your Zap.
diff --git a/website/docs/guides/orchestration/webhooks/zapier-slack.md b/website/docs/guides/zapier-slack.md
similarity index 93%
rename from website/docs/guides/orchestration/webhooks/zapier-slack.md
rename to website/docs/guides/zapier-slack.md
index d3b0473502b..61b96658f95 100644
--- a/website/docs/guides/orchestration/webhooks/zapier-slack.md
+++ b/website/docs/guides/zapier-slack.md
@@ -1,11 +1,19 @@
---
title: "Post to Slack with error context when a job fails"
-id: webhooks-guide-zapier-slack
-slug: zapier-slack
-description: Use Zapier and the dbt Cloud API to post error context to Slack
+id: zapier-slack
+description: Use a webhook or Slack message to trigger Zapier and post error context in Slack when a job fails.
+hoverSnippet: Learn how to use a webhook or Slack message to trigger Zapier to post error context in Slack when a job fails.
+# time_to_complete: '30 minutes' commenting out until we test
+icon: 'guides'
+hide_table_of_contents: true
+tags: ['Webhooks']
+level: 'Advanced'
+recently_updated: true
---
-This guide will show you how to set up an integration between dbt Cloud jobs and Slack using [dbt Cloud webhooks](/docs/deploy/webhooks) and Zapier. It builds on the native [native Slack integration](/faqs/accounts/slack) by attaching error message details of models and tests in a thread.
+## Introduction
+
+This guide will show you how to set up an integration between dbt Cloud jobs and Slack using [dbt Cloud webhooks](/docs/deploy/webhooks) and Zapier. It builds on the [native Slack integration](/docs/deploy/job-notifications#slack-notifications) by attaching error message details for models and tests in a thread.
Note: Because there is not a webhook for Run Cancelled, you may want to keep the standard Slack integration installed to receive those notifications. You could also use the [alternative integration](#alternate-approach) that augments the native integration without replacing it.
@@ -17,21 +25,20 @@ When a dbt Cloud job finishes running, the integration will:
- Create a threaded message attached to that post which contains any reasons that the job failed
![Screenshot of a message in Slack showing a summary of a dbt Cloud run which failed](/img/guides/orchestration/webhooks/zapier-slack/slack-thread-example.png)
-## Prerequisites
+
+### Prerequisites
In order to set up the integration, you should have familiarity with:
- [dbt Cloud webhooks](/docs/deploy/webhooks)
- Zapier
-## Integration steps
-
-### 1. Create a new Zap in Zapier
-Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
-Click **Continue**, then copy the webhook URL.
+## Create a new Zap in Zapier
+1. Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead.
+2. Click **Continue**, then copy the webhook URL.
![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png)
-### 2. Configure a new webhook in dbt Cloud
+## Configure a new webhook in dbt Cloud
See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Choose **Run completed** as the Event. You can alternatively choose **Run errored**, but you will need to account for the fact that the necessary metadata [might not be available immediately](/docs/deploy/webhooks#completed-errored-event-difference).
Remember the Webhook Secret Key for later.
@@ -40,7 +47,7 @@ Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test
The sample body's values are hardcoded and not reflective of your project, but they give Zapier a correctly-shaped object during development.
-### 3. Store secrets
+## Store secrets
In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens).
Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps). This prevents your keys from being displayed as plaintext in the Zap code. You can access them with the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient).
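+
+As in the other guides in this section, you can store these values with a Storage by Zapier connection and a short-lived **Run Python** code step. The sketch below follows that pattern; the secret names (such as `DBT_CLOUD_SERVICE_TOKEN`) are only illustrative, so use whatever names your code action expects.
+
+```python
+store = StoreClient('abc123')  # replace with your UUID secret
+store.set('DBT_WEBHOOK_KEY', 'abc123')          # replace with your Webhook Secret Key
+store.set('DBT_CLOUD_SERVICE_TOKEN', 'abc123')  # replace with your dbt Cloud API token
+```
+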
@@ -48,7 +55,7 @@ Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8
-### 4. Add a code action
+## Add a code action
Select **Code by Zapier** as the App, and **Run Python** as the Event.
In the **Set up action** section, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the previous **Catch Raw Hook** step.
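+
+In addition to validating the webhook (see the sketch in the earlier guides), this action needs the run's step details so it can pull out error messages for the thread. A compressed sketch of that part is shown below; it assumes the secret names from the previous step, the multi-tenant `cloud.getdbt.com` access URL, and the v2 Administrative API `runs` endpoint, so adapt it to your account.
+
+```python
+import json
+import requests
+
+full_body = json.loads(input_data['raw_body'])
+account_id = full_body['accountId']
+run_id = full_body['data']['runId']
+
+secret_store = StoreClient('abc123')  # replace with your UUID secret
+api_token = secret_store.get('DBT_CLOUD_SERVICE_TOKEN')  # illustrative secret name
+
+# Fetch the run, including each step's details, from the dbt Cloud Administrative API
+url = f'https://cloud.getdbt.com/api/v2/accounts/{account_id}/runs/{run_id}/?include_related=["run_steps"]'
+run_data = requests.get(url, headers={'Authorization': f'Token {api_token}'}).json()['data']
+
+# Collect the names of any steps that did not succeed, to post in the Slack thread
+failed_steps = [step['name'] for step in run_data['run_steps'] if step['status_humanized'] != 'Success']
+```
+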
@@ -153,7 +160,7 @@ send_error_thread = len(threaded_errors_post) > 0
output = {'step_summary_post': step_summary_post, 'send_error_thread': send_error_thread, 'threaded_errors_post': threaded_errors_post}
```
-### 5. Add Slack actions in Zapier
+## Add Slack actions in Zapier
Select **Slack** as the App, and **Send Channel Message** as the Action.
In the **Action** section, choose which **Channel** to post to. Set the **Message Text** field to **2. Step Summary Post** from the Run Python in Code by Zapier output.
@@ -170,11 +177,11 @@ Add another **Send Channel Message in Slack** action. In the **Action** section,
![Screenshot of the Zapier UI, showing the mappings of prior steps to a Slack message](/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config.png)
-### 7. Test and deploy
+## Test and deploy
When you're done testing your Zap, make sure that your `run_id` and `account_id` are no longer hardcoded in the Code step, then publish your Zap.
-## Alternate approach
+## Alternatively, use a dbt Cloud app Slack message to trigger Zapier
Instead of using a webhook as your trigger, you can keep the existing dbt Cloud app installed in your Slack workspace and use the messages it posts to your channel as the trigger. In this case, you can skip validating the webhook and only need to load the context from the thread.
diff --git a/website/docs/quickstarts/manual-install-qs.md b/website/docs/quickstarts/manual-install-qs.md
deleted file mode 100644
index ea3c6c7ec84..00000000000
--- a/website/docs/quickstarts/manual-install-qs.md
+++ /dev/null
@@ -1,460 +0,0 @@
----
-title: "Quickstart for dbt Core from a manual install"
-id: manual-install
-description: "Connecting your warehouse to dbt Core using the CLI."
-sidebar_label: "Manual install quickstart"
-platform: 'dbt-core'
-icon: 'fa-light fa-square-terminal'
-hide_table_of_contents: true
----
-## Introduction
-
-When you use dbt Core to work with dbt, you will be editing files locally using a code editor, and running projects using the dbt command line interface (dbt CLI). If you'd rather edit files and run projects using the web-based Integrated Development Environment (IDE), you should refer to the [dbt Cloud quickstarts](/quickstarts).
-
-### Prerequisites
-
-* To use the dbt CLI, it's important that you know some basics of the Terminal. In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily.
-* Install dbt Core using the [installation instructions](/docs/core/installation) for your operating system.
-* Complete [Setting up (in BigQuery)](/quickstarts/bigquery?step=2) and [Loading data (BigQuery)](/quickstarts/bigquery?step=3).
-* [Create a GitHub account](https://github.com/join) if you don't already have one.
-
-## Create a starter project
-
-After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models.
-
-### Create a repository
-
-The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join).
-
-1. [Create a new GitHub repository](https://github.com/new) named `dbt-tutorial`.
-2. Select **Public** so the repository can be shared with others. You can always make it private later.
-3. Leave the default values for all other settings.
-4. Click **Create repository**.
-5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes).
-
-### Create a project
-
-Learn how to use a series of commands using the command line of the Terminal to create your project. dbt Core includes an `init` command that helps scaffold a dbt project.
-
-To create your dbt project:
-
-1. Make sure you have dbt Core installed and check the version using the `dbt --version` command:
-
- ```terminal
- dbt --version
- ```
-
-2. Initiate the `jaffle_shop` project using the `init` command:
-
- ```terminal
- dbt init jaffle_shop
- ```
-
-3. Navigate into your project's directory:
-
- ```terminal
- cd jaffle_shop
- ```
-
-4. Use `pwd` to confirm that you are in the right spot:
-
- ```terminal
- $ pwd
- > Users/BBaggins/dbt-tutorial/jaffle_shop
- ```
-
-5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command.
-
-
-
-
-
-6. Update the following values in the `dbt_project.yml` file:
-
-
-
- ```yaml
- name: jaffle_shop # Change from the default, `my_new_project`
-
- ...
-
- profile: jaffle_shop # Change from the default profile name, `default`
-
- ...
-
- models:
- jaffle_shop: # Change from `my_new_project` to match the previous value for `name:`
- ...
- ```
-
-
-
-### Connect to BigQuery
-
-When developing locally, dbt connects to your using a [profile](/docs/core/connect-data-platform/connection-profiles), which is a YAML file with all the connection details to your warehouse.
-
-1. Create a file in the `~/.dbt/` directory named `profiles.yml`.
-2. Move your BigQuery keyfile into this directory.
-3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted.
-
-
-
- ```yaml
- jaffle_shop: # this needs to match the profile in your dbt_project.yml file
- target: dev
- outputs:
- dev:
- type: bigquery
- method: service-account
- keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile
- project: grand-highway-265418 # Replace this with your project id
- dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo
- threads: 1
- timeout_seconds: 300
- location: US
- priority: interactive
- ```
-
-
-
-4. Run the `debug` command from your project to confirm that you can successfully connect:
-
- ```terminal
- $ dbt debug
- > Connection test: OK connection ok
- ```
-
-
-
-
-
-#### FAQs
-
-
-
-
-
-
-
-### Perform your first dbt run
-
-Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order.
-
-1. Enter the `run` command to build example models:
-
- ```terminal
- dbt run
- ```
-
-You should have an output that looks like this:
-
-
-
-
-### Commit your changes
-
-Commit your changes so that the repository contains the latest code.
-
-1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository).
-
- ```terminal
- git init
- git branch -M main
- git add .
- git commit -m "Create a dbt project"
- git remote add origin https://github.com/USERNAME/dbt-tutorial.git
- git push -u origin main
- ```
-
-2. Return to your GitHub repository to verify your new files have been added.
-
-## Build your first models
-
-Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! You will take a sample query and turn it into a model in your dbt project.
-
-### Checkout a new git branch
-
-Check out a new git branch to work on new code:
-
-1. Create a new branch by using the `checkout` command and passing the `-b` flag:
-
- ```terminal
- $ git checkout -b add-customers-model
- > Switched to a new branch `add-customer-model`
- ```
-
-### Build your first model
-
-1. Open your project in your favorite code editor.
-2. Create a new SQL file in the `models` directory, named `models/customers.sql`.
-3. Paste the following query into the `models/customers.sql` file.
-
-
-
-4. From the command line, enter `dbt run`.
-
-
-
-
-When you return to the BigQuery console, you can `select` from this model.
-
-#### FAQs
-
-
-
-
-
-
-
-### Change the way your model is materialized
-
-
-
-
-
-### Delete the example models
-
-
-
-### Build models on top of other models
-
-
-
-1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query.
-2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query.
-
-
-
-
-
-
-
- ```sql
- select
- id as customer_id,
- first_name,
- last_name
-
- from `dbt-tutorial`.jaffle_shop.customers
- ```
-
-
-
-
-
- ```sql
- select
- id as order_id,
- user_id as customer_id,
- order_date,
- status
-
- from `dbt-tutorial`.jaffle_shop.orders
- ```
-
-
-
-
-
-
-
-
-
- ```sql
- select
- id as customer_id,
- first_name,
- last_name
-
- from jaffle_shop_customers
- ```
-
-
-
-
-
- ```sql
- select
- id as order_id,
- user_id as customer_id,
- order_date,
- status
-
- from jaffle_shop_orders
- ```
-
-
-
-
-
-
-
-
-
- ```sql
- select
- id as customer_id,
- first_name,
- last_name
-
- from jaffle_shop.customers
- ```
-
-
-
-
-
- ```sql
- select
- id as order_id,
- user_id as customer_id,
- order_date,
- status
-
- from jaffle_shop.orders
- ```
-
-
-
-
-
-
-
-
-
- ```sql
- select
- id as customer_id,
- first_name,
- last_name
-
- from raw.jaffle_shop.customers
- ```
-
-
-
-
-
- ```sql
- select
- id as order_id,
- user_id as customer_id,
- order_date,
- status
-
- from raw.jaffle_shop.orders
- ```
-
-
-
-
-
-
-
-3. Edit the SQL in your `models/customers.sql` file as follows:
-
-
-
- ```sql
- with customers as (
-
- select * from {{ ref('stg_customers') }}
-
- ),
-
- orders as (
-
- select * from {{ ref('stg_orders') }}
-
- ),
-
- customer_orders as (
-
- select
- customer_id,
-
- min(order_date) as first_order_date,
- max(order_date) as most_recent_order_date,
- count(order_id) as number_of_orders
-
- from orders
-
- group by 1
-
- ),
-
- final as (
-
- select
- customers.customer_id,
- customers.first_name,
- customers.last_name,
- customer_orders.first_order_date,
- customer_orders.most_recent_order_date,
- coalesce(customer_orders.number_of_orders, 0) as number_of_orders
-
- from customers
-
- left join customer_orders using (customer_id)
-
- )
-
- select * from final
-
- ```
-
-
-
-4. Execute `dbt run`.
-
- This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies.
-
-#### FAQs {#faq-2}
-
-
-
-
-
-### Next steps
-
-
-
-You can also explore:
-
-* The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL.
-* The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the select statements that are running and the python logging happening when dbt runs.
-
-## Test and document your project
-
-### Add tests to your models
-
-
-
-### Document your models
-
-
-
-3. Run `dbt docs serve` command to launch the documentation in a local website.
-
-#### FAQs
-
-
-
-
-
-#### Next steps
-
-
-
-### Commit updated changes
-
-You need to commit the changes you made to the project so that the repository has your latest code.
-
-1. Add all your changes to git: `git add -A`
-2. Commit your changes: `git commit -m "Add customers model, tests, docs"`
-3. Push your changes to your repository: `git push`
-4. Navigate to your repository, and open a pull request to merge the code into your master branch.
-
-## Schedule a job
-
-We recommend using dbt Cloud to schedule a job. For more information about using dbt Core to schedule a job, see [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post or [deployments](/docs/deploy/deployments).
diff --git a/website/docs/reference/analysis-properties.md b/website/docs/reference/analysis-properties.md
index 008da70f9db..880aeddbb0d 100644
--- a/website/docs/reference/analysis-properties.md
+++ b/website/docs/reference/analysis-properties.md
@@ -2,7 +2,9 @@
title: Analysis properties
---
-We recommend you define analysis properties in your `analyses/` directory, which is illustrated in the [`analysis-paths`](/reference/project-configs/analysis-paths) configuration.
+import PropsCallout from '/snippets/_config-prop-callout.md';
+
+We recommend you define analysis properties in your `analyses/` directory, which is illustrated in the [`analysis-paths`](/reference/project-configs/analysis-paths) configuration.
You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `analyses/` or `models/` directory.
@@ -28,10 +30,3 @@ analyses:
```
-
-
-
-
-* `v0.16.0`: The ability to declare analysis properties was introduced.
-
-
diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md
index b20c1548d99..859fde7c908 100644
--- a/website/docs/reference/artifacts/dbt-artifacts.md
+++ b/website/docs/reference/artifacts/dbt-artifacts.md
@@ -3,12 +3,15 @@ title: "About dbt artifacts"
sidebar_label: "About dbt artifacts"
---
-With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power:
+With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`semantic_manifest.json`, `manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power:
+
- [documentation](/docs/collaborate/documentation)
- [state](/reference/node-selection/syntax#about-node-selection)
- [visualizing source freshness](/docs/build/sources#snapshotting-source-data-freshness)
They could also be used to:
+
+- gain insights into your [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl)
- calculate project-level test coverage
- perform longitudinal analysis of run timing
- identify historical changes in structure
@@ -19,6 +22,7 @@ dbt has produced artifacts since the release of dbt-docs in v0.11.0. Starting in
## When are artifacts produced?
Most dbt commands (and corresponding RPC methods) produce artifacts:
+- [semantic manifest](/docs/dbt-cloud-apis/sl-manifest): Lives in the `/target` directory of your dbt project and contains the information about your semantic models and metrics that powers the dbt Semantic Layer.
- [manifest](/reference/artifacts/manifest-json): produced by commands that read and understand your project
- [run results](/reference/artifacts/run-results-json): produced by commands that run, compile, or catalog nodes in your DAG
- [catalog](catalog-json): produced by `docs generate`
@@ -26,8 +30,6 @@ Most dbt commands (and corresponding RPC methods) produce artifacts:
## Common metadata
-New in v0.19.0
-
All artifacts produced by dbt include a `metadata` dictionary with these properties:
- `dbt_version`: Version of dbt that produced this artifact.
diff --git a/website/docs/reference/artifacts/manifest-json.md b/website/docs/reference/artifacts/manifest-json.md
index 3a916ed6d4c..47a9849eda5 100644
--- a/website/docs/reference/artifacts/manifest-json.md
+++ b/website/docs/reference/artifacts/manifest-json.md
@@ -3,15 +3,9 @@ title: "Manifest JSON file"
sidebar_label: "Manifest"
---
-| dbt Core version | Manifest version |
-|------------------|---------------------------------------------------------------|
-| v1.6 | [v10](https://schemas.getdbt.com/dbt/manifest/v10/index.html) |
-| v1.5 | [v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html) |
-| v1.4 | [v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html) |
-| v1.3 | [v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html) |
-| v1.2 | [v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html) |
-| v1.1 | [v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html) |
-| v1.0 | [v4](https://schemas.getdbt.com/dbt/manifest/v4/index.html) |
+import ManifestVersions from '/snippets/_manifest-versions.md';
+
+
**Produced by:** Any command that parses your project. This includes all commands **except** [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), [`init`](/reference/commands/init)
@@ -53,12 +47,4 @@ You can refer to [dbt JSON Schema](https://schemas.getdbt.com/) for info on desc
**Note**: The `manifest.json` version number is related to (but not _equal_ to) your dbt version, so you _must_ use the correct `manifest.json` version for your dbt version. To find the correct `manifest.json` version, select the dbt version on the top navigation (such as `v1.5`).
-Use the following table to understand how the versioning pattern works and match the Manifest version with the dbt version:
-
-| dbt version | Manifest version |
-| ----------- | ---------------- |
-| `v1.5` | [Manifest v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html)
-| `v1.4` | [Manifest v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html)
-| `v1.3` | [Manifest v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html)
-| `v1.2` | [Manifest v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html)
-| `v1.1` | [Manifest v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html)
+Refer to the table at the beginning of [this page](/reference/artifacts/manifest-json) to understand how the Manifest version matches the dbt version.
diff --git a/website/docs/reference/artifacts/other-artifacts.md b/website/docs/reference/artifacts/other-artifacts.md
index d776bc8a099..205bdfc1a14 100644
--- a/website/docs/reference/artifacts/other-artifacts.md
+++ b/website/docs/reference/artifacts/other-artifacts.md
@@ -39,4 +39,8 @@ This file is useful for investigating performance issues in dbt Core's graph alg
It is more anonymized and compact than [`manifest.json`](/reference/artifacts/manifest-json) and [`graph.gpickle`](#graph.gpickle).
-It contains only the `name` and `type` of each node along with IDs of its child nodes (`succ`). It includes that information at two separate points in time: immediately after the graph is linked together (`linked`), and after test edges have been added (`with_test_edges`).
+It includes that information at two separate points in time:
+1. `linked` — immediately after the graph is linked together, and
+2. `with_test_edges` — after test edges have been added.
+
+At each of those points in time, the file records the `name` and `type` of each node, and `succ` contains the keys of its child nodes.
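+
+If you want to poke at the file, a small script like the following works; treat the shape as a sketch rather than a schema reference, since the exact layout can vary between dbt versions.
+
+```python
+import json
+
+with open("target/graph_summary.json") as f:
+    summary = json.load(f)
+
+# `linked` and `with_test_edges` both map node keys to entries with
+# "name", "type", and "succ" (the keys of the node's children)
+for key, node in summary["with_test_edges"].items():
+    children = node.get("succ", [])
+    print(f'{node.get("type")}: {node.get("name")} ({len(children)} children)')
+```
+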
diff --git a/website/docs/reference/artifacts/run-results-json.md b/website/docs/reference/artifacts/run-results-json.md
index dd92a9c4e53..5b3549db55b 100644
--- a/website/docs/reference/artifacts/run-results-json.md
+++ b/website/docs/reference/artifacts/run-results-json.md
@@ -3,7 +3,7 @@ title: "Run results JSON file"
sidebar_label: "Run results"
---
-**Current schema**: [`v4`](https://schemas.getdbt.com/dbt/run-results/v4/index.html)
+**Current schema**: [`v5`](https://schemas.getdbt.com/dbt/run-results/v5/index.html)
**Produced by:**
[`build`](/reference/commands/build)
diff --git a/website/docs/reference/commands/clean.md b/website/docs/reference/commands/clean.md
index 0185b701740..23a3f6080ce 100644
--- a/website/docs/reference/commands/clean.md
+++ b/website/docs/reference/commands/clean.md
@@ -4,12 +4,6 @@ sidebar_label: "clean"
id: "clean"
---
-
-
-- **v1.0.0:** `dbt_modules` has been replaced by `dbt_packages` by default for the [clean-target](/reference/project-configs/clean-targets) for packages.
-
-
-
`dbt clean` is a utility function that deletes all folders specified in the [`clean-targets`](/reference/project-configs/clean-targets) list specified in `dbt_project.yml`. You can use this to delete the `dbt_packages` and `target` directories.
To avoid complex permissions issues and potentially deleting crucial aspects of the remote file system without access to fix them, this command does not work when interfacing with the RPC server that powers the dbt Cloud IDE. Instead, when working in dbt Cloud, the `dbt deps` command cleans before it installs packages automatically. The `target` folder can be manually deleted from the sidebar file tree if needed.
diff --git a/website/docs/reference/commands/clone.md b/website/docs/reference/commands/clone.md
index 32c8a89be04..6bdc2c02e07 100644
--- a/website/docs/reference/commands/clone.md
+++ b/website/docs/reference/commands/clone.md
@@ -13,15 +13,16 @@ The `dbt clone` command clones selected nodes from the [specified state](/refere
The `clone` command is useful for:
- blue/green continuous deployment (on data warehouses that support zero-copy cloning tables)
- cloning current production state into development schema(s)
-- handling incremental models in Slim CI dbt Cloud jobs (on data warehouses that support zero-copy cloning tables)
+- handling incremental models in dbt Cloud CI jobs (on data warehouses that support zero-copy cloning tables)
- testing code changes on downstream dependencies in your BI tool
+
```bash
# clone all of my models from specified state to my target schema(s)
dbt clone --state path/to/artifacts
# clone one_specific_model of my models from specified state to my target schema(s)
-dbt clone --select one_specific_model --state path/to/artifacts
+dbt clone --select "one_specific_model" --state path/to/artifacts
# clone all of my models from specified state to my target schema(s) and recreate all pre-existing relations in the current target
dbt clone --state path/to/artifacts --full-refresh
@@ -37,3 +38,19 @@ Unlike deferral, `dbt clone` requires some compute and creation of additional ob
For example, by creating actual data warehouse objects, `dbt clone` allows you to test out your code changes on downstream dependencies _outside of dbt_ (such as a BI tool).
As another example, you could `clone` your modified incremental models as the first step of your dbt Cloud CI job to prevent costly `full-refresh` builds for warehouses that support zero-copy cloning.
+
+## Cloning in dbt Cloud
+
+You can clone nodes between states in dbt Cloud using the `dbt clone` command. This is available in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) and the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) and relies on the [`--defer`](/reference/node-selection/defer) feature. For more details on defer in dbt Cloud, read [Using defer in dbt Cloud](/docs/cloud/about-cloud-develop-defer).
+
+- **Using dbt Cloud CLI** — The `dbt clone` command in the dbt Cloud CLI automatically includes the `--defer` flag. This means you can use the `dbt clone` command without any additional setup.
+
+- **Using dbt Cloud IDE** — To use the `dbt clone` command in the dbt Cloud IDE, follow these steps before running the `dbt clone` command:
+
+ - Set up your **Production environment** and have a successful job run.
+ - Enable **Defer to production** by toggling the switch in the lower-right corner of the command bar.
+
+ - Run the `dbt clone` command from the command bar.
+
+
+Check out [this Developer blog post](https://docs.getdbt.com/blog/to-defer-or-to-clone) for more details on best practices when to use `dbt clone` vs. deferral.
diff --git a/website/docs/reference/commands/cmd-docs.md b/website/docs/reference/commands/cmd-docs.md
index 754c5e93baf..bc4840464b8 100644
--- a/website/docs/reference/commands/cmd-docs.md
+++ b/website/docs/reference/commands/cmd-docs.md
@@ -19,6 +19,18 @@ The command is responsible for generating your project's documentation website b
dbt docs generate
```
+
+
+Use the `--select` argument to limit the nodes included within `catalog.json`. When this flag is provided, step (3) will be restricted to the selected nodes. All other nodes will be excluded. Step (2) is unaffected.
+
+**Example**:
+```shell
+dbt docs generate --select +orders
+```
+
+
+
+
Use the `--no-compile` argument to skip re-compilation. When this flag is provided, `dbt docs generate` will skip step (2) described above.
**Example**:
diff --git a/website/docs/reference/commands/compile.md b/website/docs/reference/commands/compile.md
index a9821c0ff12..cde65b7c6b6 100644
--- a/website/docs/reference/commands/compile.md
+++ b/website/docs/reference/commands/compile.md
@@ -29,7 +29,7 @@ This will log the compiled SQL to the terminal, in addition to writing to the `t
For example:
```bash
-dbt compile --select stg_payments
+dbt compile --select "stg_payments"
dbt compile --inline "select * from {{ ref('raw_orders') }}"
```
@@ -37,7 +37,7 @@ returns the following:
```bash
-dbt compile --select stg_orders
+dbt compile --select "stg_orders"
21:17:09 Running with dbt=1.5.0-b5
21:17:09 Found 5 models, 20 tests, 0 snapshots, 0 analyses, 425 macros, 0 operations, 3 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups
21:17:09
@@ -67,8 +67,8 @@ select * from renamed
The command accesses the data platform to cache-related metadata, and to run introspective queries. Use the flags:
-- `--no-populate-cache` to disable the initial cache population. If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query.
-- `--no-introspect` to disable [introspective queries](/faqs/warehouse/db-connection-dbt-compile#introspective-queries). dbt will raise an error if a model's definition requires running one.
+- `--no-populate-cache` to disable the initial cache population. If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query. This is a `dbt` flag, which means you place it directly after `dbt`. For example: `dbt --no-populate-cache compile`.
+- `--no-introspect` to disable [introspective queries](/faqs/warehouse/db-connection-dbt-compile#introspective-queries). dbt will raise an error if a model's definition requires running one. This is a `dbt compile` flag, which means you place it after `dbt compile`. For example: `dbt compile --no-introspect`.
### FAQs
diff --git a/website/docs/reference/commands/deps.md b/website/docs/reference/commands/deps.md
index 4c7a36606e2..60ccd091ad7 100644
--- a/website/docs/reference/commands/deps.md
+++ b/website/docs/reference/commands/deps.md
@@ -57,3 +57,31 @@ Installing calogica/dbt_date@0.4.0
Updates available for packages: ['tailsdotcom/dbt_artifacts', 'dbt-labs/snowplow']
Update your versions in packages.yml, then run dbt deps
```
+
+
+
+The first time you run `dbt deps`, dbt generates a `package-lock.yml` file in the _project root_ (the same directory as `packages.yml`) that records all of the resolved packages. Each subsequent run records the installed packages in this file. If a subsequent `dbt deps` run finds no updated packages in `dependencies.yml` or `packages.yml`, dbt-core installs from `package-lock.yml`.
+
+When you update the package spec and run `dbt deps` again, the package-lock and package files update accordingly. You can run `dbt deps --lock` to update the `package-lock.yml` with the most recent dependencies from `packages`.
+
+The `--add-package` flag allows you to add a package to `packages.yml` with configurable `--version` and `--source` information. The `--dry-run` flag, when set to `False` (the default), recompiles the `package-lock.yml` file after a new package is added to `packages.yml`. Set the flag to `True` if you don't want the changes to persist.
+
+Examples of the `--add-package` flag:
+```shell
+# add package from hub (--source arg defaults to "hub")
+dbt deps --add-package dbt-labs/dbt_utils@1.0.0
+
+# add package from hub with semantic version range
+dbt deps --add-package dbt-labs/snowplow@">=0.7.0,<0.8.0"
+
+# add package from git
+dbt deps --add-package https://github.com/fivetran/dbt_amplitude@v0.3.0 --source git
+
+# add package from local
+dbt deps --add-package /opt/dbt/redshift --source local
+
+# add package to packages.yml and package-lock.yml WITHOUT actually installing dependencies
+dbt deps --add-package dbt-labs/dbt_utils@1.0.0 --dry-run
+
+```
+
diff --git a/website/docs/reference/commands/init.md b/website/docs/reference/commands/init.md
index 468bee5ff60..e9cc2ccba4e 100644
--- a/website/docs/reference/commands/init.md
+++ b/website/docs/reference/commands/init.md
@@ -17,46 +17,28 @@ Then, it will:
- Create a new folder with your project name and sample files, enough to get you started with dbt
- Create a connection profile on your local machine. The default location is `~/.dbt/profiles.yml`. Read more in [configuring your profile](/docs/core/connect-data-platform/connection-profiles).
-## Existing project
+
-If you've just cloned or downloaded an existing dbt project, `dbt init` can still help you set up your connection profile so that you can start working quickly. It will prompt you for connection information, as above, and add a profile (using the `profile` name from the project) to your local `profiles.yml`, or create the file if it doesn't already exist.
+When using `dbt init` to initialize your project, include the `--profile` flag to specify an existing profile in `profiles.yml` to use for the project's `profile:` key instead of creating a new one. For example, `dbt init --profile <profile_name>`.
-## profile_template.yml
-`dbt init` knows how to prompt for connection information by looking for a file named `profile_template.yml`. It will look for this file in two places:
-- **Adapter plugin:** What's the bare minumum Postgres profile? What's the type of each field, what are its defaults? This information is stored in a file called [`dbt/include/postgres/profile_template.yml`](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml). If you're the maintainer of an adapter plugin, we highly recommend that you add a `profile_template.yml` to your plugin, too. See more details in [building-a-new-adapter](/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter).
+If the profile does not exist in `profiles.yml` or the command is run inside an existing project, the command raises an error.
-- **Existing project:** If you're the maintainer of an existing project, and you want to help new users get connected to your database quickly and easily, you can include your own custom `profile_template.yml` in the root of your project, alongside `dbt_project.yml`. For common connection attributes, set the values in `fixed`; leave user-specific attributes in `prompts`, but with custom hints and defaults as you'd like.
+
-
+## Existing project
-
+If you've just cloned or downloaded an existing dbt project, `dbt init` can still help you set up your connection profile so that you can start working quickly. It will prompt you for connection information, as above, and add a profile (using the `profile` name from the project) to your local `profiles.yml`, or create the file if it doesn't already exist.
-```yml
-fixed:
- account: abc123
- authenticator: externalbrowser
- database: analytics
- role: transformer
- type: snowflake
- warehouse: transforming
-prompts:
- user:
- type: string
- hint: yourname@jaffleshop.com
- schema:
- type: string
- hint: usually dbt_
- threads:
- hint: "your favorite number, 1-10"
- type: int
- default: 8
-```
-
+## profile_template.yml
-
+`dbt init` knows how to prompt for connection information by looking for a file named `profile_template.yml`. It will look for this file in two places:
+
+- **Adapter plugin:** What's the bare minimum Postgres profile? What's the type of each field, what are its defaults? This information is stored in a file called [`dbt/include/postgres/profile_template.yml`](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml). If you're the maintainer of an adapter plugin, we highly recommend that you add a `profile_template.yml` to your plugin, too. Refer to the [Build, test, document, and promote adapters](/guides/adapter-creation) guide for more information.
+
+- **Existing project:** If you're the maintainer of an existing project, and you want to help new users get connected to your database quickly and easily, you can include your own custom `profile_template.yml` in the root of your project, alongside `dbt_project.yml`. For common connection attributes, set the values in `fixed`; leave user-specific attributes in `prompts`, but with custom hints and defaults as you'd like.
diff --git a/website/docs/reference/commands/list.md b/website/docs/reference/commands/list.md
index 6084b3dec70..5caabdc2b2e 100644
--- a/website/docs/reference/commands/list.md
+++ b/website/docs/reference/commands/list.md
@@ -8,9 +8,10 @@ id: "list"
The `dbt ls` command lists resources in your dbt project. It accepts selector arguments that are similar to those provided in [dbt run](/reference/commands/run). `dbt list` is an alias for `dbt ls`. While `dbt ls` will read your [connection profile](/docs/core/connect-data-platform/connection-profiles) to resolve [`target`](/reference/dbt-jinja-functions/target)-specific logic, this command will not connect to your database or run any queries.
### Usage
+
```
dbt ls
- [--resource-type {model,source,seed,snapshot,metric,test,exposure,analysis,default,all}]
+ [--resource-type {model,semantic_model,source,seed,snapshot,metric,test,exposure,analysis,default,all}]
[--select SELECTION_ARG [SELECTION_ARG ...]]
[--models SELECTOR [SELECTOR ...]]
[--exclude SELECTOR [SELECTOR ...]]
@@ -85,7 +86,7 @@ $ dbt ls --select snowplow.* --output json --output-keys "name resource_type des
```
-$ dbt ls --select snowplow.* --output json --output-keys name resource_type description
+$ dbt ls --select snowplow.* --output json --output-keys "name resource_type description"
{"name": "snowplow_events", "description": "This is a pretty cool model", ...}
{"name": "snowplow_page_views", "description": "This model is even cooler", ...}
...
@@ -93,6 +94,16 @@ $ dbt ls --select snowplow.* --output json --output-keys name resource_type desc
+
+
+**Listing Semantic models**
+
+List all resources upstream of your orders semantic model:
+```
+dbt ls -s +semantic_model:orders
+```
+
+
**Listing file paths**
```
diff --git a/website/docs/reference/commands/retry.md b/website/docs/reference/commands/retry.md
index 0c010ede2c1..8da5d5a77a6 100644
--- a/website/docs/reference/commands/retry.md
+++ b/website/docs/reference/commands/retry.md
@@ -20,3 +20,80 @@ Retry works with the following commands:
`dbt retry` reuses the [selectors](/reference/node-selection/yaml-selectors) from the previously executed command.
+
+Example results of executing `dbt retry` after a successful `dbt run`:
+
+```shell
+Running with dbt=1.6.1
+Registered adapter: duckdb=1.6.0
+Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models
+
+Nothing to do. Try checking your model configs and model specification args
+```
+
+Example of when `dbt run` encounters a syntax error in a model:
+
+```shell
+Running with dbt=1.6.1
+Registered adapter: duckdb=1.6.0
+Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models
+
+Concurrency: 24 threads (target='dev')
+
+1 of 5 START sql view model main.stg_customers ................................. [RUN]
+2 of 5 START sql view model main.stg_orders .................................... [RUN]
+3 of 5 START sql view model main.stg_payments .................................. [RUN]
+1 of 5 OK created sql view model main.stg_customers ............................ [OK in 0.06s]
+2 of 5 OK created sql view model main.stg_orders ............................... [OK in 0.06s]
+3 of 5 OK created sql view model main.stg_payments ............................. [OK in 0.07s]
+4 of 5 START sql table model main.customers .................................... [RUN]
+5 of 5 START sql table model main.orders ....................................... [RUN]
+4 of 5 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s]
+5 of 5 OK created sql table model main.orders .................................. [OK in 0.04s]
+
+Finished running 3 view models, 2 table models in 0 hours 0 minutes and 0.15 seconds (0.15s).
+
+Completed with 1 error and 0 warnings:
+
+Runtime Error in model customers (models/customers.sql)
+ Parser Error: syntax error at or near "selct"
+
+Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5
+```
+
+
+Example of a subsequent failed `dbt retry` run without fixing the error(s):
+
+```shell
+Running with dbt=1.6.1
+Registered adapter: duckdb=1.6.0
+Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models
+
+Concurrency: 24 threads (target='dev')
+
+1 of 1 START sql table model main.customers .................................... [RUN]
+1 of 1 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s]
+
+Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5
+```
+
+Example of a successful `dbt retry` run after fixing error(s):
+
+```shell
+Running with dbt=1.6.1
+Registered adapter: duckdb=1.6.0
+Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models
+
+Concurrency: 24 threads (target='dev')
+
+1 of 1 START sql table model main.customers .................................... [RUN]
+1 of 1 OK created sql table model main.customers ............................... [OK in 0.05s]
+
+Finished running 1 table model in 0 hours 0 minutes and 0.09 seconds (0.09s).
+
+Completed successfully
+
+Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1
+```
+
+In each scenario `dbt retry` picks up from the error rather than running all of the upstream dependencies again.
diff --git a/website/docs/reference/commands/rpc.md b/website/docs/reference/commands/rpc.md
index a98799356ee..809eadee639 100644
--- a/website/docs/reference/commands/rpc.md
+++ b/website/docs/reference/commands/rpc.md
@@ -5,22 +5,18 @@ id: "rpc"
description: "Remote Procedure Call (rpc) dbt server compiles and runs queries, and provides methods that enable you to list and terminate running processes. "
---
-
+:::caution The dbt-rpc plugin is deprecated
- - **v0.14**: The `dbt rpc` command was introduced to dbt Core
- - **v1.0**: We now distribute and package the Remote Procedure Call (rpc) server functionality separately from `dbt-core`. You can find the code in a dedicated [`dbt-rpc` repository](https://github.com/dbt-labs/dbt-rpc).
-
+dbt Labs actively maintained `dbt-rpc` for compatibility with dbt-core versions up to v1.5. Starting with dbt-core v1.6 (released in July 2023), `dbt-rpc` is no longer supported for ongoing compatibility.
-### Overview
+In the meantime, dbt Labs will be performing critical maintenance only for `dbt-rpc`, until the last compatible version of dbt-core has reached the [end of official support](/docs/dbt-versions/core#latest-releases). At that point, dbt Labs will archive this repository to be read-only.
-You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context.
+:::
-:::caution Deprecation
-**The dbt-rpc plugin will be fully deprecated by the second half of 2023.**
+### Overview
-dbt Labs is actively maintaining `dbt-rpc` up to dbt v1.4. Starting in v1.5, we intend to break `dbt-rpc` compatibility in favor of [the new dbt Server](https://github.com/dbt-labs/dbt-server). dbt Labs will perform critical maintenance only on `dbt-rpc`, until the last compatible version of dbt has reached the end of official support (thus 12 months after release of v1.4; [see Core version policies](/docs/dbt-versions/core)).
-:::
+You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context.
:::caution Running on Windows
We do not recommend running the rpc server on Windows because of reliability issues. A Docker container may provide a useful workaround, if required.
diff --git a/website/docs/reference/commands/run.md b/website/docs/reference/commands/run.md
index fbc1a513cb1..557d0d71338 100644
--- a/website/docs/reference/commands/run.md
+++ b/website/docs/reference/commands/run.md
@@ -71,32 +71,12 @@ For more information on running parents or children of specific models, see the
## Treat warnings as errors
-
-
-- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0
-
-
-
-See [global configs](/reference/global-configs/failing-fast)
+See [global configs](/reference/global-configs/warnings)
## Failing fast
-
-
-- The `--fail-fast` flag is new in dbt v0.17.0
-- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0
-
-
-
See [global configs](/reference/global-configs/failing-fast)
## Enable or Disable Colorized Logs
-
-
-- The `--use-colors` and `--no-use-colors` flags are new in dbt v0.18.0
-- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0
-
-
-
See [global configs](/reference/global-configs/print-output#print-color)
diff --git a/website/docs/reference/commands/seed.md b/website/docs/reference/commands/seed.md
index 272a2a7f2a9..d0cd199ea12 100644
--- a/website/docs/reference/commands/seed.md
+++ b/website/docs/reference/commands/seed.md
@@ -4,24 +4,15 @@ sidebar_label: "seed"
id: "seed"
---
-
-
-- **v1.0.0:** The default config for this command will now be `seed-paths` instead of `data-paths`.
-
-
-
-
The `dbt seed` command will load `csv` files located in the `seed-paths` directory of your dbt project into your .
### Selecting seeds to run
- Added in v0.16.0
-
Specific seeds can be run using the `--select` flag to `dbt seed`. Example:
```
-$ dbt seed --select country_codes
+$ dbt seed --select "country_codes"
Found 2 models, 3 tests, 0 archives, 0 analyses, 53 macros, 0 operations, 2 seed files
14:46:15 | Concurrency: 1 threads (target='dev')
diff --git a/website/docs/reference/commands/show.md b/website/docs/reference/commands/show.md
index 5bdcfacc1e8..a0e5d68c83f 100644
--- a/website/docs/reference/commands/show.md
+++ b/website/docs/reference/commands/show.md
@@ -16,7 +16,7 @@ The results of the preview query are not materialized in the data warehouse, or
Example:
```
-dbt show --select model_name.sql
+dbt show --select "model_name.sql"
```
or
```
@@ -26,7 +26,7 @@ dbt show --inline "select * from {{ ref('model_name') }}"
The following is an example of `dbt show` output for a model named `stg_orders`:
```bash
-dbt show --select stg_orders
+dbt show --select "stg_orders"
21:17:38 Running with dbt=1.5.0-b5
21:17:38 Found 5 models, 20 tests, 0 snapshots, 0 analyses, 425 macros, 0 operations, 3 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups
21:17:38
@@ -46,7 +46,7 @@ dbt show --select stg_orders
For example, if you've just built a model that has a failing test, you can quickly preview the test failures right in the terminal, to find values of `id` that are duplicated:
```bash
-$ dbt build -s my_model_with_duplicates
+$ dbt build -s "my_model_with_duplicates"
13:22:47 Running with dbt=1.5.0
...
13:22:48 Completed with 1 error and 0 warnings:
@@ -58,7 +58,7 @@ $ dbt build -s my_model_with_duplicates
13:22:48
13:22:48 Done. PASS=1 WARN=0 ERROR=1 SKIP=0 TOTAL=2
-$ dbt show -s unique_my_model_with_duplicates_id
+$ dbt show -s "unique_my_model_with_duplicates_id"
13:22:53 Running with dbt=1.5.0
13:22:53 Found 4 models, 2 tests, 0 snapshots, 0 analyses, 309 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups
13:22:53
diff --git a/website/docs/reference/commands/source.md b/website/docs/reference/commands/source.md
index b29bf7dadc6..697ae2b5fcc 100644
--- a/website/docs/reference/commands/source.md
+++ b/website/docs/reference/commands/source.md
@@ -20,10 +20,10 @@ By default, `dbt source freshness` will calculate freshness information for all
```bash
# Snapshot freshness for all Snowplow tables:
-$ dbt source freshness --select source:snowplow
+$ dbt source freshness --select "source:snowplow"
# Snapshot freshness for a particular source table:
-$ dbt source freshness --select source:snowplow.event
+$ dbt source freshness --select "source:snowplow.event"
```
### Configuring source freshness output
diff --git a/website/docs/reference/commands/test.md b/website/docs/reference/commands/test.md
index a1a63729568..c050d82a0ab 100644
--- a/website/docs/reference/commands/test.md
+++ b/website/docs/reference/commands/test.md
@@ -10,22 +10,22 @@ The tests to run can be selected using the `--select` flag discussed [here](/ref
```bash
# run tests for one_specific_model
-dbt test --select one_specific_model
+dbt test --select "one_specific_model"
# run tests for all models in package
-dbt test --select some_package.*
+dbt test --select "some_package.*"
# run only tests defined singularly
-dbt test --select test_type:singular
+dbt test --select "test_type:singular"
# run only tests defined generically
-dbt test --select test_type:generic
+dbt test --select "test_type:generic"
# run singular tests limited to one_specific_model
-dbt test --select one_specific_model,test_type:singular
+dbt test --select "one_specific_model,test_type:singular"
# run generic tests limited to one_specific_model
-dbt test --select one_specific_model,test_type:generic
+dbt test --select "one_specific_model,test_type:generic"
```
For more information on writing tests, see the [Testing Documentation](/docs/build/tests).
diff --git a/website/docs/reference/configs-and-properties.md b/website/docs/reference/configs-and-properties.md
index c2ad5b77629..c6458babeaa 100644
--- a/website/docs/reference/configs-and-properties.md
+++ b/website/docs/reference/configs-and-properties.md
@@ -11,7 +11,7 @@ A rule of thumb: properties declare things _about_ your project resources; confi
For example, you can use resource **properties** to:
* Describe models, snapshots, seed files, and their columns
-- Assert "truths" about a model, in the form of [tests](/docs/build/tests), e.g. "this `id` column is unique"
+* Assert "truths" about a model, in the form of [tests](/docs/build/tests), e.g. "this `id` column is unique"
* Define pointers to existing tables that contain raw data, in the form of [sources](/docs/build/sources), and assert the expected "freshness" of this raw data
* Define official downstream uses of your data models, in the form of [exposures](/docs/build/exposures)
@@ -35,11 +35,11 @@ dbt prioritizes configurations in order of specificity, from most specificity to
Note - Generic tests work a little differently when it comes to specificity. See [test configs](/reference/test-configs).
-Within the project file, configurations are also applied hierarchically. The most-specific config always "wins": In the project file, configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project. To apply a configuration to a model, or directory of models, define the resource path as nested dictionary keys.
+Within the project file, configurations are also applied hierarchically. The most specific config always "wins": In the project file, configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project. To apply a configuration to a model, or directory of models, define the resource path as nested dictionary keys.
### Combining configs
-Most configurations are "clobbered" when applied hierarchically. Whenever a more-specific value is available, it will completely replace the less-specific value. Note that a few configs have different merge behavior:
+Most configurations are "clobbered" when applied hierarchically. Whenever a more specific value is available, it will completely replace the less specific value. Note that a few configs have different merge behavior:
- [`tags`](tags) are additive. If a model has some tags configured in `dbt_project.yml`, and more tags applied in its `.sql` file, the final set of tags will include all of them.
- [`meta`](/reference/resource-configs/meta) dictionaries are merged (a more specific key-value pair replaces a less specific value with the same key)
- [`pre-hook` and `post-hook`](/reference/resource-configs/pre-hook-post-hook) are also additive.
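+
+For illustration, a minimal sketch of this merge behavior in `dbt_project.yml`, reusing the `jaffle_shop` project and `marketing` subdirectory from the example above:
+
+```yml
+models:
+  jaffle_shop:
+    +tags: ["pii"]
+    +meta:
+      owner: "data-team"
+    marketing:
+      +tags: ["marketing"]       # additive: models in marketing/ carry both "pii" and "marketing"
+      +meta:
+        owner: "marketing-team"  # merged: the more specific value for the "owner" key wins
+```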
@@ -67,12 +67,14 @@ Previous versions of the docs referred to these as `schema.yml` files — we've
dbt has the ability to define node configs in `.yml` files, in addition to `config()` blocks and `dbt_project.yml`. But the reverse isn't always true: there are some things in `.yml` files that can _only_ be defined there.
Certain properties are special, because:
+
- They have a unique Jinja rendering context
- They create new project resources
- They don't make sense as hierarchical configuration
- They're older properties that haven't yet been redefined as configs
These properties are:
+
- [`description`](/reference/resource-properties/description)
- [`tests`](/reference/resource-properties/tests)
- [`docs`](/reference/resource-configs/docs)
@@ -155,9 +157,9 @@ You can find an exhaustive list of each supported property and config, broken do
* Model [properties](/reference/model-properties) and [configs](/reference/model-configs)
* Source [properties](/reference/source-properties) and [configs](source-configs)
* Seed [properties](/reference/seed-properties) and [configs](/reference/seed-configs)
-* [Snapshot Properties](snapshot-properties)
+* Snapshot [properties](snapshot-properties)
* Analysis [properties](analysis-properties)
-* [Macro Properties](/reference/macro-properties)
+* Macro [properties](/reference/macro-properties)
* Exposure [properties](/reference/exposure-properties)
## FAQs
@@ -202,3 +204,4 @@ Runtime Error
```
This error occurred because a semicolon (`;`) was accidentally used instead of a colon (`:`) after the `description` field. To resolve issues like this, find the `.yml` file referenced in the error message and fix any syntax errors present in the file. There are online YAML validators that can be helpful here, but please be mindful of submitting sensitive information to third-party applications!
+
diff --git a/website/docs/reference/database-permissions/about-database-permissions.md b/website/docs/reference/database-permissions/about-database-permissions.md
new file mode 100644
index 00000000000..76fff517646
--- /dev/null
+++ b/website/docs/reference/database-permissions/about-database-permissions.md
@@ -0,0 +1,36 @@
+---
+title: "Database permissions"
+id: about-database-permissions
+description: "Database permissions are access rights and privileges granted to users or roles within a database management system."
+sidebar_label: "About database permissions"
+pagination_next: "reference/database-permissions/databricks-permissions"
+pagination_prev: null
+---
+
+Database permissions are access rights and privileges granted to users or roles within a database or data platform. They help you specify what actions users or roles can perform on various database objects, like tables, views, schemas, or even the entire database.
+
+
+### Why are they useful
+
+- Database permissions are essential for security and data access control.
+- They ensure that only authorized users can perform specific actions.
+- They help maintain data integrity, prevent unauthorized changes, and limit exposure to sensitive data.
+- Permissions also support compliance with data privacy regulations and auditing.
+
+### How to use them
+
+- Users and administrators can grant and manage permissions at various levels (such as table, schema, and so on) using SQL statements or through the database system's interface.
+- Assign permissions to individual users or roles (groups of users) based on their responsibilities.
+ - Typical permissions include "SELECT" (read), "INSERT" (add data), "UPDATE" (modify data), "DELETE" (remove data), and administrative rights like "CREATE" and "DROP."
+- Users should be assigned permissions that ensure they have the necessary access to perform their tasks without overextending privileges.
+
+Note that each data platform provider might have different approaches and names for privileges. Refer to their documentation for more details.
+
+### Examples
+
+Refer to the following database permission pages for more info on examples and how to set up database permissions:
+
+- [Databricks](/reference/database-permissions/databricks-permissions)
+- [Postgres](/reference/database-permissions/postgres-permissions)
+- [Redshift](/reference/database-permissions/redshift-permissions)
+- [Snowflake](/reference/database-permissions/snowflake-permissions)
diff --git a/website/docs/reference/database-permissions/databricks-permissions.md b/website/docs/reference/database-permissions/databricks-permissions.md
new file mode 100644
index 00000000000..12e24652ae3
--- /dev/null
+++ b/website/docs/reference/database-permissions/databricks-permissions.md
@@ -0,0 +1,20 @@
+---
+title: "Databricks permissions"
+---
+
+In Databricks, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Databricks database.
+
+## Example Databricks permissions
+
+The following example provides you with the SQL statements you can use to manage permissions.
+
+**Note** that you can grant permissions on `securable_objects` to `principals` (which can be a user, service principal, or group). For example, `grant privilege_type on securable_object to principal`.
+
+```
+
+grant all privileges on schema schema_name to principal;
+grant create table on schema schema_name to principal;
+grant create view on schema schema_name to principal;
+```
+
+Check out the [official documentation](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/privileges.html#privilege-types-by-securable-object-in-unity-catalog) for more information.
diff --git a/website/docs/reference/database-permissions/postgres-permissions.md b/website/docs/reference/database-permissions/postgres-permissions.md
new file mode 100644
index 00000000000..da56e9b45f2
--- /dev/null
+++ b/website/docs/reference/database-permissions/postgres-permissions.md
@@ -0,0 +1,25 @@
+---
+title: "Postgres Permissions"
+---
+
+
+In Postgres, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Postgres database.
+
+## Example Postgres permissions
+
+The following example provides you with the SQL statements you can use to manage permissions. These examples allow you to run dbt smoothly without encountering permission issues when creating schemas, reading existing data, or accessing the information schema.
+
+**Note** that `database_name`, `database.schema_name`, and `user_name` are placeholders and you can replace them as needed for your organization's naming convention.
+
+```
+grant usage on database database_name to user_name;
+grant create schema on database database_name to user_name;
+grant usage on schema database.schema_name to user_name;
+grant create table on schema database.schema_name to user_name;
+grant create view on schema database.schema_name to user_name;
+grant usage on all schemas in database database_name to user_name;
+grant select on all tables in database database_name to user_name;
+grant select on all views in database database_name to user_name;
+```
+
+Check out the [official documentation](https://www.postgresql.org/docs/current/sql-grant.html) for more information.
diff --git a/website/docs/reference/database-permissions/redshift-permissions.md b/website/docs/reference/database-permissions/redshift-permissions.md
new file mode 100644
index 00000000000..5f0949a3528
--- /dev/null
+++ b/website/docs/reference/database-permissions/redshift-permissions.md
@@ -0,0 +1,25 @@
+---
+title: "Redshift permissions"
+---
+
+In Redshift, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Redshift database.
+
+## Example Redshift permissions
+
+The following example provides you with the SQL statements you can use to manage permissions.
+
+**Note** that `database_name`, `database.schema_name`, and `user_name` are placeholders and you can replace them as needed for your organization's naming convention.
+
+
+```
+grant usage on database database_name to user_name;
+grant create schema on database database_name to user_name;
+grant usage on schema database.schema_name to user_name;
+grant create table on schema database.schema_name to user_name;
+grant create view on schema database.schema_name to user_name;
+grant usage on all schemas in database database_name to user_name;
+grant select on all tables in database database_name to user_name;
+grant select on all views in database database_name to user_name;
+```
+
+Check out the [official documentation](https://docs.aws.amazon.com/redshift/latest/dg/r_GRANT.html) for more information.
diff --git a/website/docs/reference/database-permissions/snowflake-permissions.md b/website/docs/reference/database-permissions/snowflake-permissions.md
new file mode 100644
index 00000000000..3f474242834
--- /dev/null
+++ b/website/docs/reference/database-permissions/snowflake-permissions.md
@@ -0,0 +1,154 @@
+---
+title: "Snowflake permissions"
+---
+
+In Snowflake, permissions are used to control who can perform certain actions on different database objects. Use SQL statements to manage permissions in a Snowflake database.
+
+## Set up Snowflake account
+
+This section explains how to set up permissions and roles within Snowflake. In Snowflake, you would perform these actions using SQL commands and set up your data warehouse and access control within Snowflake's ecosystem.
+
+1. Set up databases
+```
+use role sysadmin;
+create database raw;
+create database analytics;
+```
+2. Set up warehouses
+```
+create warehouse loading
+ warehouse_size = xsmall
+ auto_suspend = 3600
+ auto_resume = false
+ initially_suspended = true;
+
+create warehouse transforming
+ warehouse_size = xsmall
+ auto_suspend = 60
+ auto_resume = true
+ initially_suspended = true;
+
+create warehouse reporting
+ warehouse_size = xsmall
+ auto_suspend = 60
+ auto_resume = true
+ initially_suspended = true;
+```
+
+3. Set up roles and warehouse permissions
+```
+use role securityadmin;
+
+create role loader;
+grant all on warehouse loading to role loader;
+
+create role transformer;
+grant all on warehouse transforming to role transformer;
+
+create role reporter;
+grant all on warehouse reporting to role reporter;
+```
+
+4. Create users, assigning them to their roles
+
+Every person and application gets a separate user and is assigned to the correct role.
+
+```
+create user stitch_user -- or fivetran_user
+ password = '_generate_this_'
+ default_warehouse = loading
+ default_role = loader;
+
+create user claire -- or amy, jeremy, etc.
+ password = '_generate_this_'
+ default_warehouse = transforming
+ default_role = transformer
+ must_change_password = true;
+
+create user dbt_cloud_user
+ password = '_generate_this_'
+ default_warehouse = transforming
+ default_role = transformer;
+
+create user looker_user -- or mode_user etc.
+ password = '_generate_this_'
+ default_warehouse = reporting
+ default_role = reporter;
+
+-- then grant these roles to each user
+grant role loader to user stitch_user; -- or fivetran_user
+grant role transformer to user dbt_cloud_user;
+grant role transformer to user claire; -- or amy, jeremy
+grant role reporter to user looker_user; -- or mode_user, periscope_user
+```
+
+5. Let loader load data
+Give the loader role unilateral permission to operate on the raw database:
+```
+use role sysadmin;
+grant all on database raw to role loader;
+```
+
+6. Let transformer transform data
+The transformer role needs to be able to read raw data.
+
+If you do this before you have any data loaded, you can run:
+```
+grant usage on database raw to role transformer;
+grant usage on future schemas in database raw to role transformer;
+grant select on future tables in database raw to role transformer;
+grant select on future views in database raw to role transformer;
+```
+If you already have data loaded in the raw database, make sure you also run the following to update the permissions:
+```
+grant usage on all schemas in database raw to role transformer;
+grant select on all tables in database raw to role transformer;
+grant select on all views in database raw to role transformer;
+```
+The transformer role also needs to be able to create objects in the analytics database:
+```
+grant all on database analytics to role transformer;
+```
+7. Let reporter read the transformed data
+A previous version of this article recommended this be implemented through hooks in dbt, but this way lets you get away with a one-off statement.
+```
+grant usage on database analytics to role reporter;
+grant usage on future schemas in database analytics to role reporter;
+grant select on future tables in database analytics to role reporter;
+grant select on future views in database analytics to role reporter;
+```
+Again, if you already have data in your analytics database, make sure you run:
+```
+grant usage on all schemas in database analytics to role reporter;
+grant select on all tables in database analytics to role reporter;
+grant select on all views in database analytics to role reporter;
+```
+8. Maintain
+When new users are added, make sure you add them to the right role! Everything else should be inherited automatically thanks to those `future` grants.
+
+For more discussion and legacy information, refer to [this Discourse article](https://discourse.getdbt.com/t/setting-up-snowflake-the-exact-grant-statements-we-run/439).
+
+## Example Snowflake permissions
+
+The following example provides you with the SQL statements you can use to manage permissions.
+
+**Note** that `warehouse_name`, `database_name`, and `role_name` are placeholders and you can replace them as needed for your organization's naming convention.
+
+```
+
+grant all on warehouse warehouse_name to role role_name;
+grant usage on database database_name to role role_name;
+grant create schema on database database_name to role role_name;
+grant usage on schema database.an_existing_schema to role role_name;
+grant create table on schema database.an_existing_schema to role role_name;
+grant create view on schema database.an_existing_schema to role role_name;
+grant usage on future schemas in database database_name to role role_name;
+grant monitor on future schemas in database database_name to role role_name;
+grant select on future tables in database database_name to role role_name;
+grant select on future views in database database_name to role role_name;
+grant usage on all schemas in database database_name to role role_name;
+grant monitor on all schemas in database database_name to role role_name;
+grant select on all tables in database database_name to role role_name;
+grant select on all views in database database_name to role role_name;
+```
+
diff --git a/website/docs/reference/dbt-classes.md b/website/docs/reference/dbt-classes.md
index 18569fce3b0..13f9263e545 100644
--- a/website/docs/reference/dbt-classes.md
+++ b/website/docs/reference/dbt-classes.md
@@ -10,6 +10,7 @@ These classes are often useful when building advanced dbt models and macros.
The `Relation` object is used to interpolate schema and names into SQL code with appropriate quoting. This object should _always_ be used instead of interpolating values with `{{ schema }}.{{ table }}` directly. Quoting of the Relation object can be configured using the [`quoting` config](/reference/project-configs/quoting).
+
### Creating relations
A `Relation` can be created by calling the `create` class method on the `Relation` class.
@@ -32,6 +33,7 @@ class Relation:
### Using relations
+In addition to `api.Relation.create`, dbt returns a Relation when you use [`ref`](/reference/dbt-jinja-functions/ref), [`source`](/reference/dbt-jinja-functions/source) or [`this`](/reference/dbt-jinja-functions/this).
```jinja2
@@ -84,6 +86,7 @@ col = Column('name', 'varchar', 255)
col.is_string() # True
col.is_numeric() # False
col.is_number() # False
+col.is_integer() # False
col.is_float() # False
col.string_type() # character varying(255)
col.numeric_type('numeric', 12, 4) # numeric(12,4)
@@ -101,15 +104,10 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4)
### Instance methods
-
-
- The `is_number` and `is_float` instance methods were added dbt v0.16.0
-
-
-
- **is_string()**: Returns True if the column is a String type (eg. text, varchar), else False
- **is_numeric()**: Returns True if the column is a fixed-precision Numeric type (eg. `numeric`), else False
- **is_number()**: Returns True if the column is a number-y type (eg. `numeric`, `int`, `float`, or similar), else False
+- **is_integer()**: Returns True if the column is an integer (eg. `int`, `bigint`, `serial` or similar), else False
- **is_float()**: Returns True if the column is a float type (eg. `float`, `float64`, or similar), else False
- **string_size()**: Returns the width of the column if it is a string type, else, an exception is raised
@@ -134,6 +132,9 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4)
-- Return true if the column is a number
{{ string_column.is_number() }}
+-- Return true if the column is an integer
+{{ string_column.is_integer() }}
+
-- Return true if the column is a float
{{ string_column.is_float() }}
@@ -149,6 +150,9 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4)
-- Return true if the column is a number
{{ numeric_column.is_number() }}
+-- Return true if the column is an integer
+{{ numeric_column.is_integer() }}
+
-- Return true if the column is a float
{{ numeric_column.is_float() }}
@@ -184,12 +188,6 @@ will be expanded to:
## Result objects
-
-
-* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification.
-
-
-
The execution of a resource in dbt generates a `Result` object. This object contains information about the executed node, timing, status, and metadata returned by the adapter. At the end of an invocation, dbt records these objects in [`run_results.json`](/reference/artifacts/run-results-json).
- `node`: Full object representation of the dbt resource (model, seed, snapshot, test) executed, including its `unique_id`
@@ -197,7 +195,6 @@ The execution of a resource in dbt generates a `Result` object. This object cont
- `thread_id`: Which thread executed this node? E.g. `Thread-1`
- `execution_time`: Total time spent executing this node, measured in seconds.
- `timing`: Array that breaks down execution time into steps (often `compile` + `execute`)
-- `adapter_response`: Dictionary of metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc.
- `message`: How dbt will report this result on the CLI, based on information returned from the database
import RowsAffected from '/snippets/_run-result.md';
diff --git a/website/docs/reference/dbt-commands.md b/website/docs/reference/dbt-commands.md
index 5b37f13a3fb..d5f0bfcd2ad 100644
--- a/website/docs/reference/dbt-commands.md
+++ b/website/docs/reference/dbt-commands.md
@@ -2,29 +2,63 @@
title: "dbt Command reference"
---
-dbt is typically run one of two ways:
-* In [dbt Cloud](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud)
-* On the [command line interface](/docs/core/about-the-cli) (CLI)
+You can run dbt using the following tools:
-The following sections outline the commands supported by dbt and their relevant flags. Note that some commands are only supported when using the CLI.
+- In your browser with the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud)
+- On the command line interface using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) or open-source [dbt Core](/docs/core/about-dbt-core), both of which enable you to execute dbt commands. The key distinction is that the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features).
-For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax).
+The following sections outline the commands supported by dbt and their relevant flags. For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax).
### Available commands
-Select the tabs that are relevant to the your development workflow. For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**.
+
+
+All commands in the table are compatible with the dbt Cloud IDE, dbt Cloud CLI, or dbt Core, as noted in the **Compatible tools** column.
+
+You can run dbt commands in your specific tool by prefixing them with `dbt`. For example, to run the `test` command, type `dbt test`.
+
+| Command | Description | Compatible tools | Version |
+| ------- | ----------- | ---------------- | ------- |
+| [build](/reference/commands/build) | Build and test all selected resources (models, seeds, snapshots, tests) | All | All [supported versions](/docs/dbt-versions/core) |
+| cancel | Cancels the most recent invocation.| dbt Cloud CLI | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) |
+| [clean](/reference/commands/clean) | Deletes artifacts present in the dbt project | All | All [supported versions](/docs/dbt-versions/core) |
+| [clone](/reference/commands/clone) | Clone selected models from the specified state | All | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) |
+| [compile](/reference/commands/compile) | Compiles (but does not run) the models in a project | All | All [supported versions](/docs/dbt-versions/core) |
+| [debug](/reference/commands/debug) | Debugs dbt connections and projects | dbt Cloud IDE, dbt Core | All [supported versions](/docs/dbt-versions/core) |
+| [deps](/reference/commands/deps) | Downloads dependencies for a project | All | All [supported versions](/docs/dbt-versions/core) |
+| [docs](/reference/commands/cmd-docs) | Generates documentation for a project | All | All [supported versions](/docs/dbt-versions/core) |
+| help | Displays help information for any command | dbt Core, dbt Cloud CLI | All [supported versions](/docs/dbt-versions/core) |
+| [init](/reference/commands/init) | Initializes a new dbt project | dbt Core | All [supported versions](/docs/dbt-versions/core) |
+| [list](/reference/commands/list) | Lists resources defined in a dbt project | All | All [supported versions](/docs/dbt-versions/core) |
+| [parse](/reference/commands/parse) | Parses a project and writes detailed timing info | All | All [supported versions](/docs/dbt-versions/core) |
+| reattach | Reattaches to the most recent invocation to retrieve logs and artifacts. | dbt Cloud CLI | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) |
+| [retry](/reference/commands/retry) | Retry the last run `dbt` command from the point of failure | All | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) |
+| [run](/reference/commands/run) | Runs the models in a project | All | All [supported versions](/docs/dbt-versions/core) |
+| [run-operation](/reference/commands/run-operation) | Invoke a macro, including running arbitrary maintenance SQL against the database | All | All [supported versions](/docs/dbt-versions/core) |
+| [seed](/reference/commands/seed) | Loads CSV files into the database | All | All [supported versions](/docs/dbt-versions/core) |
+| [show](/reference/commands/show) | Preview table rows post-transformation | All | All [supported versions](/docs/dbt-versions/core) |
+| [snapshot](/reference/commands/snapshot) | Executes "snapshot" jobs defined in a project | All | All [supported versions](/docs/dbt-versions/core) |
+| [source](/reference/commands/source) | Provides tools for working with source data (including validating that sources are "fresh") | All | All [supported versions](/docs/dbt-versions/core) |
+| [test](/reference/commands/test) | Executes tests defined in a project | All | All [supported versions](/docs/dbt-versions/core) |
+
+
+
+
+
+
+Select the tabs that are relevant to your development workflow. For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**.
-
+
Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`.
- [build](/reference/commands/build): build and test all selected resources (models, seeds, snapshots, tests)
-- [clone](/reference/commands/clone): clone selected nodes from specified state (requires dbt 1.6 or higher)
+- [clone](/reference/commands/clone): clone selected nodes from the specified state (requires dbt 1.6 or higher)
- [compile](/reference/commands/compile): compiles (but does not run) the models in a project
- [deps](/reference/commands/deps): downloads dependencies for a project
- [docs](/reference/commands/cmd-docs) : generates documentation for a project
-- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or higher)
+- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or higher)
- [run](/reference/commands/run): runs the models in a project
- [run-operation](/reference/commands/run-operation): invoke a macro, including running arbitrary maintenance SQL against the database
- [seed](/reference/commands/seed): loads CSV files into the database
@@ -35,13 +69,13 @@ Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/
-
+
-Use the following dbt commands in the [CLI](/docs/core/about-the-cli) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`.
+Use the following dbt commands in [dbt Core](/docs/core/about-dbt-core) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`.
- [build](/reference/commands/build): build and test all selected resources (models, seeds, snapshots, tests)
- [clean](/reference/commands/clean): deletes artifacts present in the dbt project
-- [clone](/reference/commands/clone): clone selected models from specified state (requires dbt 1.6 or higher)
+- [clone](/reference/commands/clone): clone selected models from the specified state (requires dbt 1.6 or higher)
- [compile](/reference/commands/compile): compiles (but does not run) the models in a project
- [debug](/reference/commands/debug): debugs dbt connections and projects
- [deps](/reference/commands/deps): downloads dependencies for a project
@@ -62,27 +96,4 @@ Use the following dbt commands in the [CLI](/docs/core/about-the-cli) and use th
-
-
-
+
diff --git a/website/docs/reference/dbt-jinja-functions/as_bool.md b/website/docs/reference/dbt-jinja-functions/as_bool.md
index e0700032212..d4c2bbf1743 100644
--- a/website/docs/reference/dbt-jinja-functions/as_bool.md
+++ b/website/docs/reference/dbt-jinja-functions/as_bool.md
@@ -24,10 +24,3 @@ models:
```
-
-
-
-* `v0.17.1`: Native rendering is disabled by default. The `as_bool` filter was
-introduced.
-
-
diff --git a/website/docs/reference/dbt-jinja-functions/as_native.md b/website/docs/reference/dbt-jinja-functions/as_native.md
index fca25249dca..1de9ad45bf9 100644
--- a/website/docs/reference/dbt-jinja-functions/as_native.md
+++ b/website/docs/reference/dbt-jinja-functions/as_native.md
@@ -16,10 +16,3 @@ and [`as_number`](/reference/dbt-jinja-functions/as_number) instead.
Unlike `as_bool` and `as_number`, `as_native` will return a rendered value
regardless of the input type. Ensure that your inputs match expectations.
:::
-
-
-
-* `v0.17.1`: Native rendering is disabled by default. The `as_native` filter was
-introduced.
-
-
diff --git a/website/docs/reference/dbt-jinja-functions/as_number.md b/website/docs/reference/dbt-jinja-functions/as_number.md
index 057d7ec8d20..29b35094880 100644
--- a/website/docs/reference/dbt-jinja-functions/as_number.md
+++ b/website/docs/reference/dbt-jinja-functions/as_number.md
@@ -25,10 +25,3 @@ my_profile:
```
-
-
-
-* `v0.17.1`: Native rendering is disabled by default. The `as_number` filter was
-introduced.
-
-
diff --git a/website/docs/reference/dbt-jinja-functions/as_text.md b/website/docs/reference/dbt-jinja-functions/as_text.md
index 5e19e5bc9bc..6b26cfa327d 100644
--- a/website/docs/reference/dbt-jinja-functions/as_text.md
+++ b/website/docs/reference/dbt-jinja-functions/as_text.md
@@ -56,12 +56,3 @@ models:
```
-
-
-
-* `v0.17.0`: Native rendering is enabled by default. The `as_text` filter was
-introduced.
-* `v0.17.1`: Native rendering is disabled by default. The `as_text` filter works
-as before, with no functional effect.
-
-
diff --git a/website/docs/reference/dbt-jinja-functions/builtins.md b/website/docs/reference/dbt-jinja-functions/builtins.md
index 40848705dc4..edc5f34ffda 100644
--- a/website/docs/reference/dbt-jinja-functions/builtins.md
+++ b/website/docs/reference/dbt-jinja-functions/builtins.md
@@ -1,10 +1,11 @@
---
-title: "About builtins Jinja function"
+title: "About builtins Jinja variable"
sidebar_label: "builtins"
id: "builtins"
-description: "Read this guide to understand the builtins Jinja function in dbt."
+description: "Read this guide to understand the builtins Jinja variable in dbt."
---
+
The `builtins` variable exists to provide references to builtin dbt context methods. This allows macros to be created with names that _mask_ dbt builtin context methods, while still making those methods accessible in the dbt compilation context.
The `builtins` variable is a dictionary containing the following keys:
@@ -15,9 +16,51 @@ The `builtins` variable is a dictionary containing the following keys:
## Usage
-The following macro overrides the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overriden to `dev`.
+:::important
+
+Using the `builtins` variable in this way is an advanced development workflow. Users should be ready to maintain and update these overrides when upgrading in the future.
+:::
+
+
+
+From dbt v1.5 and higher, use the following macro to extract user-provided arguments, including `version`, and call the `builtins.ref()` function with either a single `modelname` argument or both `packagename` and `modelname` arguments, based on the number of positional arguments in `varargs`:
+
+
+
```
+{% macro ref() %}
+-- extract user-provided positional and keyword arguments
+ {% set version = kwargs.get('version') %}
+ {% set packagename = none %}
+ {%- if (varargs | length) == 1 -%}
+ {% set modelname = varargs[0] %}
+{%- else -%}
+ {% set packagename = varargs[0] %}
+ {% set modelname = varargs[1] %}
+{% endif %}
+-- call builtins.ref based on provided positional arguments
+{% set rel = None %}
+{% if packagename is not none %}
+ {% set rel = return(builtins.ref(packagename, modelname, version=version)) %}
+{% else %}
+ {% set rel = return(builtins.ref(modelname, version=version)) %}
+{% endif %}
+
+-- finally, override the database name with "dev"
+{% set newrel = rel.replace_path(database="dev") %}
+{% do return(newrel) %}
+
+{% endmacro %}
+```
+
+
+
+
+From dbt v1.4 and lower, use the following macro to override the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overridden to `dev`:
+
+```
+
{% macro ref(model_name) %}
{% set rel = builtins.ref(model_name) %}
@@ -26,6 +69,7 @@ The following macro overrides the `ref` method available in the model compilatio
{% endmacro %}
```
+
The ref macro can also be used to control which elements of the model path are rendered when run, for example the following macro overrides the `ref` method to render only the schema and object identifier, but not the database reference i.e. `my_schema.my_model` rather than `my_database.my_schema.my_model`. This is especially useful when using snowflake as a warehouse, if you intend to change the name of the database post-build and wish the references to remain accurate.
diff --git a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md
index e0701e5d091..0d377d29cef 100644
--- a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md
+++ b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md
@@ -1,22 +1,23 @@
---
-title: " About dbt_project.yml context variables"
+title: " About dbt_project.yml context"
sidebar_label: "dbt_project.yml context"
id: "dbt-project-yml-context"
-description: "The context variables and methods are available when configuring resources in the dbt_project.yml file."
+description: "The context methods and variables available when configuring resources in the dbt_project.yml file."
---
-The following context variables and methods are available when configuring
+The following context methods and variables are available when configuring
resources in the `dbt_project.yml` file. This applies to the `models:`, `seeds:`,
and `snapshots:` keys in the `dbt_project.yml` file.
+**Available context methods:**
+- [env_var](/reference/dbt-jinja-functions/env_var)
+- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_)
+
**Available context variables:**
- [target](/reference/dbt-jinja-functions/target)
-- [env_var](/reference/dbt-jinja-functions/env_var)
-- [vars](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_)
- [builtins](/reference/dbt-jinja-functions/builtins)
- [dbt_version](/reference/dbt-jinja-functions/dbt_version)
-
### Example configuration
diff --git a/website/docs/reference/dbt-jinja-functions/dispatch.md b/website/docs/reference/dbt-jinja-functions/dispatch.md
index a165ae59eb0..5dff787219f 100644
--- a/website/docs/reference/dbt-jinja-functions/dispatch.md
+++ b/website/docs/reference/dbt-jinja-functions/dispatch.md
@@ -5,12 +5,6 @@ id: "dispatch"
description: "dbt extends functionality across data platforms using multiple dispatch."
---
-
-
-- **v1.0.0:** The 'packages' argument is fully deprecated. Use `macro_namespace` and project-level `dispatch` config instead.
-
-
-
dbt can extend functionality across [Supported Data Platforms](/docs/supported-data-platforms) through a system of [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch). Because SQL syntax, data types, and DDL/DML support vary across adapters, dbt can define and call generic functional macros, and then "dispatch" that macro to the appropriate implementation for the current adapter.
## Syntax
diff --git a/website/docs/reference/dbt-jinja-functions/env_var.md b/website/docs/reference/dbt-jinja-functions/env_var.md
index a5e9df82415..f4cc05cec0f 100644
--- a/website/docs/reference/dbt-jinja-functions/env_var.md
+++ b/website/docs/reference/dbt-jinja-functions/env_var.md
@@ -58,12 +58,6 @@ models:
### Secrets
-
-
- **v1.0.0:** Restricted use of secret env vars to `profiles.yml` and `packages.yml`
-
-
-
For certain configurations, you can use "secret" env vars. Any env var named with the prefix `DBT_ENV_SECRET_` will be:
- Available for use in `profiles.yml` + `packages.yml`, via the same `env_var()` function
- Disallowed everywhere else, including `dbt_project.yml` and model SQL, to prevent accidentally writing these secret values to the data warehouse or metadata artifacts
@@ -82,12 +76,6 @@ host: "www.{{ env_var('DBT_ENV_SECRET_HOST_DOMAIN') }}.com/{{ env_var('DBT_ENV_S
### Custom metadata
-
-
- - **v0.19.0:** Introduced `DBT_ENV_CUSTOM_ENV_` prefix and artifact `metadata.env`
-
-
-
Any env var named with the prefix `DBT_ENV_CUSTOM_ENV_` will be included in two places, with its prefix-stripped name as the key:
- [dbt artifacts](/reference/artifacts/dbt-artifacts#common-metadata): `metadata` -> `env`
- [events and structured logs](/reference/events-logging#info-fields): `info` -> `extra`
diff --git a/website/docs/reference/dbt-jinja-functions/graph.md b/website/docs/reference/dbt-jinja-functions/graph.md
index 3b3b4d1cb88..491b7836f45 100644
--- a/website/docs/reference/dbt-jinja-functions/graph.md
+++ b/website/docs/reference/dbt-jinja-functions/graph.md
@@ -99,7 +99,7 @@ representations of those nodes. A simplified example might look like:
},
"exposures": {
"exposure.my_project.traffic_dashboard": {
- "unique_id": "source.my_project.traffic_dashboard",
+ "unique_id": "exposure.my_project.traffic_dashboard",
"type": "dashboard",
"maturity": "high",
"path": "models/path/to/schema.yml",
diff --git a/website/docs/reference/dbt-jinja-functions/log.md b/website/docs/reference/dbt-jinja-functions/log.md
index ec4533ea621..30e68f8c21d 100644
--- a/website/docs/reference/dbt-jinja-functions/log.md
+++ b/website/docs/reference/dbt-jinja-functions/log.md
@@ -12,7 +12,34 @@ __Args__:
Logs a line to either the log file or stdout.
-([Source on GitHub](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/context/base.py#L432))
+
+ Code source
+ Refer to [GitHub](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/context/base.py#L432) or the following code as a source:
+
+```python
+ def log(msg: str, info: bool = False) -> str:
+ """Logs a line to either the log file or stdout.
+
+ :param msg: The message to log
+ :param info: If `False`, write to the log file. If `True`, write to
+ both the log file and stdout.
+
+ > macros/my_log_macro.sql
+
+ {% macro some_macro(arg1, arg2) %}
+ {{ log("Running some_macro: " ~ arg1 ~ ", " ~ arg2) }}
+ {% endmacro %}"
+ """
+ if info:
+ fire_event(JinjaLogInfo(msg=msg, node_info=get_node_info()))
+ else:
+ fire_event(JinjaLogDebug(msg=msg, node_info=get_node_info()))
+ return ""
+```
+
+
+
+
```sql
diff --git a/website/docs/reference/dbt-jinja-functions/model.md b/website/docs/reference/dbt-jinja-functions/model.md
index e967debd01f..9ccf0759470 100644
--- a/website/docs/reference/dbt-jinja-functions/model.md
+++ b/website/docs/reference/dbt-jinja-functions/model.md
@@ -52,15 +52,9 @@ To view the structure of `models` and their definitions:
Use the following table to understand how the versioning pattern works and match the Manifest version with the dbt version:
-| dbt version | Manifest version |
-| ----------- | ---------------- |
-| `v1.5` | [Manifest v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html)
-| `v1.4` | [Manifest v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html)
-| `v1.3` | [Manifest v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html)
-| `v1.2` | [Manifest v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html)
-| `v1.1` | [Manifest v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html)
-
+import ManifestVersions from '/snippets/_manifest-versions.md';
+
## Related docs
diff --git a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md
index ff0f7c1ef33..32cd8ca10ff 100644
--- a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md
+++ b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md
@@ -100,12 +100,6 @@ on-run-end:
## Results
-
-
-* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification.
-
-
-
The `results` variable contains a list of [Result objects](/reference/dbt-classes#result-objects) with one element per resource that executed in the dbt job. The Result object provides access within the Jinja on-run-end context to the information that will populate the [run results JSON artifact](/reference/artifacts/run-results-json).
Example usage:
diff --git a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md
index 037a129476e..2a6390c3d12 100644
--- a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md
+++ b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md
@@ -1,16 +1,16 @@
---
-title: "About profiles.yml context variable"
+title: "About profiles.yml context"
sidebar_label: "profiles.yml context"
id: "profiles-yml-context"
-description: "Use these context variables to configure resources in `profiles.yml` file."
+description: "Use these context methods to configure resources in `profiles.yml` file."
---
-The following context variables and methods are available when configuring
+The following context methods are available when configuring
resources in the `profiles.yml` file.
-**Available context variables:**
+**Available context methods:**
- [env_var](/reference/dbt-jinja-functions/env_var)
-- [vars](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_)
+- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_)
### Example usage
diff --git a/website/docs/reference/dbt-jinja-functions/project_name.md b/website/docs/reference/dbt-jinja-functions/project_name.md
index 38717aa16c3..7f76c5a4800 100644
--- a/website/docs/reference/dbt-jinja-functions/project_name.md
+++ b/website/docs/reference/dbt-jinja-functions/project_name.md
@@ -5,8 +5,6 @@ id: "project_name"
description: "Read this guide to understand the project_name Jinja function in dbt."
---
-New in 0.16.0
-
The `project_name` context variable returns the `name` for the root-level project
which is being run by dbt. This variable can be used to defer execution to a
root-level project macro if one exists.
diff --git a/website/docs/reference/dbt-jinja-functions/ref.md b/website/docs/reference/dbt-jinja-functions/ref.md
index c500bb934ab..fda5992e234 100644
--- a/website/docs/reference/dbt-jinja-functions/ref.md
+++ b/website/docs/reference/dbt-jinja-functions/ref.md
@@ -29,11 +29,8 @@ from {{ref('model_a')}}
`ref()` is, under the hood, actually doing two important things. First, it is interpolating the schema into your model file to allow you to change your deployment schema via configuration. Second, it is using these references between models to automatically build the dependency graph. This will enable dbt to deploy models in the correct order when using `dbt run`.
-:::info New in 0.9.0
-
-The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes at the [{{ this }}](/reference/dbt-jinja-functions/this) variable.
-
-:::
+The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes as the [{{ this }} variable](/reference/dbt-jinja-functions/this).
+ - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`.
## Advanced ref usage
@@ -73,7 +70,7 @@ select * from {{ ref('model_name') }}
### Two-argument variant
-There is also a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity.
+You can also use a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity. When using two arguments with projects (not packages), you also need to set up [cross-project dependencies](/docs/collaborate/govern/project-dependencies).
```sql
select * from {{ ref('project_or_package', 'model_name') }}
diff --git a/website/docs/reference/dbt-jinja-functions/run_query.md b/website/docs/reference/dbt-jinja-functions/run_query.md
index cdd65a7b4dc..87970e024ed 100644
--- a/website/docs/reference/dbt-jinja-functions/run_query.md
+++ b/website/docs/reference/dbt-jinja-functions/run_query.md
@@ -15,7 +15,7 @@ Returns a [Table](https://agate.readthedocs.io/page/api/table.html) object with
**Note:** The `run_query` macro will not begin a transaction automatically - if you wish to run your query inside of a transaction, please use `begin` and `commit ` statements as appropriate.
:::info Using run_query for the first time?
-Check out the section of the Getting Started guide on [using Jinja](/guides/advanced/using-jinja#dynamically-retrieve-the-list-of-payment-methods) for an example of working with the results of the `run_query` macro!
+Check out the section of the Getting Started guide on [using Jinja](/guides/using-jinja#dynamically-retrieve-the-list-of-payment-methods) for an example of working with the results of the `run_query` macro!
:::
**Example Usage:**
diff --git a/website/docs/reference/dbt-jinja-functions/selected_resources.md b/website/docs/reference/dbt-jinja-functions/selected_resources.md
index 80c4250b8d5..a927ec317ae 100644
--- a/website/docs/reference/dbt-jinja-functions/selected_resources.md
+++ b/website/docs/reference/dbt-jinja-functions/selected_resources.md
@@ -30,6 +30,8 @@ For a given run it will look like:
["model.my_project.model1", "model.my_project.model2", "snapshot.my_project.my_snapshot"]
```
+Each value corresponds to a key in the `nodes` object within the [graph](/reference/dbt-jinja-functions/graph) context variable.
+
It can be used in macros in a `pre-hook`, `post-hook`, `on-run-start` or `on-run-end`
to evaluate what nodes are selected and trigger different logic whether a particular node
is selected or not.
diff --git a/website/docs/reference/dbt-jinja-functions/source.md b/website/docs/reference/dbt-jinja-functions/source.md
index 2d73e79f09c..59317a79e3d 100644
--- a/website/docs/reference/dbt-jinja-functions/source.md
+++ b/website/docs/reference/dbt-jinja-functions/source.md
@@ -16,6 +16,7 @@ This function:
- Creates dependencies between a source and the current model, which is useful for documentation and model selection
- Compiles to the full object name in the database
+
## Related guides
- [Using sources](/docs/build/sources)
diff --git a/website/docs/reference/dbt-jinja-functions/statement-blocks.md b/website/docs/reference/dbt-jinja-functions/statement-blocks.md
index 1ad4f099aa3..2829ad3fe14 100644
--- a/website/docs/reference/dbt-jinja-functions/statement-blocks.md
+++ b/website/docs/reference/dbt-jinja-functions/statement-blocks.md
@@ -41,12 +41,6 @@ Once the statement block has executed, the result set is accessible via the `loa
- `data`: Pythonic representation of data returned by query (arrays, tuples, dictionaries).
- `table`: [Agate](https://agate.readthedocs.io/page/api/table.html) table representation of data returned by query.
-
-
-* `v0.19.0`: The `response` structured object replaced a `status` string that contained similar information.
-
-
-
For the above statement, that could look like:
diff --git a/website/docs/reference/dbt-jinja-functions/target.md b/website/docs/reference/dbt-jinja-functions/target.md
index 7d6627c5a4b..e7d08db592f 100644
--- a/website/docs/reference/dbt-jinja-functions/target.md
+++ b/website/docs/reference/dbt-jinja-functions/target.md
@@ -7,7 +7,7 @@ description: "Contains information about your connection to the warehouse."
`target` contains information about your connection to the warehouse.
-* **dbt CLI:** These values are based on the target defined in your [`profiles.yml` file](/docs/core/connect-data-platform/profiles.yml)
+* **dbt Core:** These values are based on the target defined in your [`profiles.yml` file](/docs/core/connect-data-platform/profiles.yml)
* **dbt Cloud Scheduler:**
* `target.name` is defined per job as described [here](/docs/build/custom-target-names).
* For all other attributes, the values are defined by the deployment connection. To check these values, click **Deploy** from the upper left and select **Environments**. Then, select the relevant deployment environment, and click **Settings**.
diff --git a/website/docs/reference/dbt-jinja-functions/this.md b/website/docs/reference/dbt-jinja-functions/this.md
index 9065c660cb0..f9f2961b08f 100644
--- a/website/docs/reference/dbt-jinja-functions/this.md
+++ b/website/docs/reference/dbt-jinja-functions/this.md
@@ -3,13 +3,18 @@ title: "about this"
sidebar_label: "this"
id: "this"
description: "Represents the current model in the database."
+keywords:
+ - relation, relation object, this function, this jinja, this.database, this.schema, this.identifier
+meta:
+ label: 'this'
---
`this` is the database representation of the current model. It is useful when:
- Defining a `where` statement within [incremental models](/docs/build/incremental-models)
- Using [pre or post hooks](/reference/resource-configs/pre-hook-post-hook)
-`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected.
+`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected.
+ - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`.
`this` can be thought of as equivalent to `ref('')`, and is a neat way to avoid circular dependencies.
@@ -17,24 +22,6 @@ description: "Represents the current model in the database."
-
-
-### Grant permissions on a model in a post-hook
-
-
-
-```yaml
-models:
- project-name:
- +post-hook:
- - "grant select on {{ this }} to db_reader"
-```
-
-
-
-
-
-
### Configuring incremental models
@@ -54,3 +41,7 @@ from raw_app_data.events
```
+
+
+
+
\ No newline at end of file
diff --git a/website/docs/reference/dbt_project.yml.md b/website/docs/reference/dbt_project.yml.md
index 59541a81256..34af0f696c7 100644
--- a/website/docs/reference/dbt_project.yml.md
+++ b/website/docs/reference/dbt_project.yml.md
@@ -1,10 +1,5 @@
-
-- **v1.0.0:** The default config name for `data-paths` is now [`seed-paths`](/reference/project-configs/seed-paths), `source-paths` is now [`model-paths`](/reference/project-configs/model-paths) and `modules-path` is now [`packages-install-path`](/reference/project-configs/packages-install-path).
-
-
-
-Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate on your project.
+Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate on your project.
@@ -16,6 +11,8 @@ By default, dbt will look for `dbt_project.yml` in your current working director
By default, dbt will look for `dbt_project.yml` in your current working directory and its parents, but you can set a different directory using the `--project-dir` flag or the `DBT_PROJECT_DIR` environment variable.
+In dbt v1.5 and higher, you can specify your dbt Cloud project ID in the `dbt_project.yml` file using `project-id` under the `dbt-cloud` config. To find your project ID, check your dbt Cloud project URL, such as `https://cloud.getdbt.com/11/projects/123456`, where the project ID is `123456`.
+
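+For example, a minimal sketch of this config, using the example project ID `123456` from the URL above:
+
+```yml
+# dbt_project.yml
+dbt-cloud:
+  project-id: 123456
+```
+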
The following is a list of all available configurations in the `dbt_project.yml` file.
@@ -24,6 +21,9 @@ The following is a list of all available configurations in the `dbt_project.yml`
dbt uses YAML in a few different places. If you're new to YAML, it would be worth taking the time to learn how arrays, dictionaries and strings are represented.
:::
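+
+For illustration, a minimal sketch of those three shapes as they might appear in this file (using the hypothetical `jaffle_shop` project name):
+
+```yml
+name: jaffle_shop         # a string
+model-paths: ["models"]   # an array
+models:                   # a dictionary of nested keys
+  jaffle_shop:
+    +materialized: view
+```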
+
+
+
```yml
@@ -53,17 +53,27 @@ dbt uses YAML in a few different places. If you're new to YAML, it would be wort
[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range]
+[dbt-cloud](/docs/cloud/cloud-cli-installation):
+ [project-id](/docs/cloud/configure-cloud-cli#configure-the-dbt-cloud-cli): project_id # Required
+ [defer-env-id](/docs/cloud/about-cloud-develop-defer#defer-in-dbt-cloud-cli): environment_id # Optional
+
[quoting](/reference/project-configs/quoting):
database: true | false
schema: true | false
identifier: true | false
+metrics:
+
+
models:
[](/reference/model-configs)
seeds:
[](/reference/seed-configs)
+semantic-models:
+
+
snapshots:
[](/reference/snapshot-configs)
@@ -84,6 +94,150 @@ vars:
search_order: [packagename]
[restrict-access](/docs/collaborate/govern/model-access): true | false
+
```
+
+
+
+
+
+
+```yml
+[name](/reference/project-configs/name): string
+
+[config-version](/reference/project-configs/config-version): 2
+[version](/reference/project-configs/version): version
+
+[profile](/reference/project-configs/profile): profilename
+
+[model-paths](/reference/project-configs/model-paths): [directorypath]
+[seed-paths](/reference/project-configs/seed-paths): [directorypath]
+[test-paths](/reference/project-configs/test-paths): [directorypath]
+[analysis-paths](/reference/project-configs/analysis-paths): [directorypath]
+[macro-paths](/reference/project-configs/macro-paths): [directorypath]
+[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath]
+[docs-paths](/reference/project-configs/docs-paths): [directorypath]
+[asset-paths](/reference/project-configs/asset-paths): [directorypath]
+
+[target-path](/reference/project-configs/target-path): directorypath
+[log-path](/reference/project-configs/log-path): directorypath
+[packages-install-path](/reference/project-configs/packages-install-path): directorypath
+
+[clean-targets](/reference/project-configs/clean-targets): [directorypath]
+
+[query-comment](/reference/project-configs/query-comment): string
+
+[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range]
+
+[dbt-cloud](/docs/cloud/cloud-cli-installation):
+ [project-id](/docs/cloud/configure-cloud-cli#configure-the-dbt-cloud-cli): project_id # Required
+ [defer-env-id](/docs/cloud/about-cloud-develop-defer#defer-in-dbt-cloud-cli): environment_id # Optional
+
+[quoting](/reference/project-configs/quoting):
+ database: true | false
+ schema: true | false
+ identifier: true | false
+
+models:
+ [](/reference/model-configs)
+
+seeds:
+ [](/reference/seed-configs)
+
+snapshots:
+ [](/reference/snapshot-configs)
+
+sources:
+ [](source-configs)
+
+tests:
+ [](/reference/test-configs)
+
+vars:
+ [](/docs/build/project-variables)
+
+[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement]
+[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement]
+
+[dispatch](/reference/project-configs/dispatch-config):
+ - macro_namespace: packagename
+ search_order: [packagename]
+
+[restrict-access](/docs/collaborate/govern/model-access): true | false
+
+```
+
+
+
+
+
+
+
+
+
+```yml
+[name](/reference/project-configs/name): string
+
+[config-version](/reference/project-configs/config-version): 2
+[version](/reference/project-configs/version): version
+
+[profile](/reference/project-configs/profile): profilename
+
+[model-paths](/reference/project-configs/model-paths): [directorypath]
+[seed-paths](/reference/project-configs/seed-paths): [directorypath]
+[test-paths](/reference/project-configs/test-paths): [directorypath]
+[analysis-paths](/reference/project-configs/analysis-paths): [directorypath]
+[macro-paths](/reference/project-configs/macro-paths): [directorypath]
+[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath]
+[docs-paths](/reference/project-configs/docs-paths): [directorypath]
+[asset-paths](/reference/project-configs/asset-paths): [directorypath]
+
+[target-path](/reference/project-configs/target-path): directorypath
+[log-path](/reference/project-configs/log-path): directorypath
+[packages-install-path](/reference/project-configs/packages-install-path): directorypath
+
+[clean-targets](/reference/project-configs/clean-targets): [directorypath]
+
+[query-comment](/reference/project-configs/query-comment): string
+
+[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range]
+
+[quoting](/reference/project-configs/quoting):
+ database: true | false
+ schema: true | false
+ identifier: true | false
+
+models:
+ [<model-configs>](/reference/model-configs)
+
+seeds:
+ [<seed-configs>](/reference/seed-configs)
+
+snapshots:
+ [<snapshot-configs>](/reference/snapshot-configs)
+
+sources:
+ [<source-configs>](/reference/source-configs)
+
+tests:
+ [<test-configs>](/reference/test-configs)
+
+vars:
+ [<variables>](/docs/build/project-variables)
+
+[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement]
+[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement]
+
+[dispatch](/reference/project-configs/dispatch-config):
+ - macro_namespace: packagename
+ search_order: [packagename]
+
+[restrict-access](/docs/collaborate/govern/model-access): true | false
+
+```
+
+
+
+
diff --git a/website/docs/reference/events-logging.md b/website/docs/reference/events-logging.md
index dec1dafcb8e..ffdeb7bb752 100644
--- a/website/docs/reference/events-logging.md
+++ b/website/docs/reference/events-logging.md
@@ -4,7 +4,7 @@ title: "Events and logs"
As dbt runs, it generates events. The most common way to see those events is as log messages, written in real time to two places:
- The command line terminal (`stdout`), to provide interactive feedback while running dbt.
-- The debug log file (`logs/dbt.log`), to enable detailed [debugging of errors](/guides/best-practices/debugging-errors) when they occur. The text-formatted log messages in this file include all `DEBUG`-level events, as well as contextual information, such as log level and thread name. The location of this file can be configured via [the `log_path` config](/reference/project-configs/log-path).
+- The debug log file (`logs/dbt.log`), to enable detailed [debugging of errors](/guides/debug-errors) when they occur. The text-formatted log messages in this file include all `DEBUG`-level events, as well as contextual information, such as log level and thread name. The location of this file can be configured via [the `log_path` config](/reference/project-configs/log-path).
diff --git a/website/docs/reference/exposure-properties.md b/website/docs/reference/exposure-properties.md
index aaed2a20a09..0bd4cf771af 100644
--- a/website/docs/reference/exposure-properties.md
+++ b/website/docs/reference/exposure-properties.md
@@ -8,7 +8,11 @@ description: "Read this guide to understand exposure properties in dbt."
- [Declaring resource properties](/reference/configs-and-properties)
## Overview
-Exposures are defined in `.yml` files nested under an `exposures:` key. You may define `exposures` in YAML files that also define define `sources` or `models`.
+
+import PropsCallout from '/snippets/_config-prop-callout.md';
+
+Exposures are defined in `properties.yml` files nested under an `exposures:` key. You may define `exposures` in YAML files that also define `sources` or `models`.
+
You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `models/` directory.
diff --git a/website/docs/reference/global-configs/about-global-configs.md b/website/docs/reference/global-configs/about-global-configs.md
index 42819cdac8f..9d1691812b5 100644
--- a/website/docs/reference/global-configs/about-global-configs.md
+++ b/website/docs/reference/global-configs/about-global-configs.md
@@ -8,4 +8,11 @@ Global configs enable you to fine-tune _how_ dbt runs projects on your machine
Global configs control things like the visual output of logs, the manner in which dbt parses your project, and what to do when dbt finds a version mismatch or a failing model. These configs are "global" because they are available for all dbt commands, and because they can be set for all projects running on the same machine or in the same environment.
-Starting in v1.0, you can set global configs in three places. When all three are set, command line flags take precedence, then environment variables, and last yaml configs (usually `profiles.yml`).
\ No newline at end of file
+### Global config precedence
+
+Starting in v1.0, you can set global configs in three places. dbt will evaluate the configs in the following order:
+1. [user config](https://docs.getdbt.com/reference/global-configs/yaml-configurations)
+1. [environment variable](https://docs.getdbt.com/reference/global-configs/environment-variable-configs)
+1. [CLI flag](https://docs.getdbt.com/reference/global-configs/command-line-flags)
+
+Each config is prioritized over the previous one. For example, if all three are provided, then the CLI flag takes precedence.
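+
+As an illustrative sketch (the setting shown here is only an example, and assumes it is supported in all three places), the same config could be supplied as a YAML user config, an environment variable, or a CLI flag:
+
+```bash
+# 1. user config (lowest precedence), for example in profiles.yml:
+#    config:
+#      printer_width: 120
+
+# 2. an environment variable overrides the user config
+export DBT_PRINTER_WIDTH=100
+
+# 3. a CLI flag overrides both of the above for this invocation
+dbt run --printer-width=80
+```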
diff --git a/website/docs/reference/global-configs/cache.md b/website/docs/reference/global-configs/cache.md
index db4eabd14b7..a605e1e70f3 100644
--- a/website/docs/reference/global-configs/cache.md
+++ b/website/docs/reference/global-configs/cache.md
@@ -17,7 +17,7 @@ There are two ways to optionally modify this behavior:
For example, to quickly compile a model that requires no database metadata or introspective queries:
```text
-dbt --skip-populate-cache compile --select my_model_name
+dbt --no-populate-cache compile --select my_model_name
```
@@ -31,7 +31,7 @@ dbt --cache-selected-only run --select salesforce
-
+
### Cache database objects for selected resource
@@ -63,4 +63,4 @@ config:
-
\ No newline at end of file
+
diff --git a/website/docs/reference/global-configs/command-line-flags.md b/website/docs/reference/global-configs/command-line-flags.md
index 6496c92da6d..fbe89ce28f1 100644
--- a/website/docs/reference/global-configs/command-line-flags.md
+++ b/website/docs/reference/global-configs/command-line-flags.md
@@ -4,60 +4,95 @@ id: "command-line-flags"
sidebar: "Command line flags"
---
-Command line (CLI) flags immediately follow `dbt` and precede your subcommand. When set, CLI flags override environment variables and profile configs.
+For consistency, command-line interface (CLI) flags should come right after the `dbt` prefix and its subcommands. This includes "global" flags (supported for all commands). When set, CLI flags override environment variables and profile configs.
-Use this non-boolean config structure, replacing `<THIS-CONFIG>` with the config you are enabling or disabling, `<SETTING>` with the new setting for the config, and `<SUBCOMMAND>` with the command this config applies to:
+For example, instead of using:
+
+```bash
+dbt --no-populate-cache run
+```
+
+You should use:
+
+```bash
+dbt run --no-populate-cache
+```
+
+Historically, passing flags (such as "global flags") _before_ the subcommand is a legacy functionality that dbt Labs can remove at any time. We do not support using the same flag before and after the subcommand.
+
+## Using boolean and non-boolean flags
+
+You can construct your commands with boolean flags to enable or disable or with non-boolean flags that use specific values, such as strings.
+
+
+
+
+
+Use this non-boolean config structure:
+- Replacing `<command>` with the command this config applies to.
+- `<THIS-CONFIG>` with the config you are enabling or disabling, and
+- `<SETTING>` with the new setting for the config.
```text
-$ --<THIS-CONFIG>=<SETTING> <SUBCOMMAND>
+<command> --<THIS-CONFIG>=<SETTING>
```
-Non-boolean config examples:
+### Example
```text
-dbt --printer-width=80 run
-dbt --indirect-selection=eager test
+dbt run --printer-width=80
+dbt test --indirect-selection=eager
```
-To turn on boolean configs, you would use the `--<THIS-CONFIG>` CLI flag, and a `--no-<THIS-CONFIG>` CLI flag to turn off boolean configs, replacing `<THIS-CONFIG>` with the config you are enabling or disabling and `<SUBCOMMAND>` with the command this config applies to.
+
+
+
+
+To enable or disable boolean configs:
+- Use the `<SUBCOMMAND>` this config applies to.
+- Followed by `--<THIS-CONFIG>` to turn it on, or `--no-<THIS-CONFIG>` to turn it off.
+- Replace `<THIS-CONFIG>` with the config you are enabling or disabling.
-Boolean config structure:
```text
-dbt --<THIS-CONFIG>
-dbt --no-<THIS-CONFIG>
+dbt <SUBCOMMAND> --<THIS-CONFIG>
+dbt <SUBCOMMAND> --no-<THIS-CONFIG>
```
-Boolean config example:
+### Example
```text
-dbt --version-check run
-dbt --no-version-check run
+dbt run --version-check
+dbt run --no-version-check
```
-
\ No newline at end of file
+
+
+
+
+
diff --git a/website/docs/reference/global-configs/logs.md b/website/docs/reference/global-configs/logs.md
index f5f1b3f814b..8c819193fc6 100644
--- a/website/docs/reference/global-configs/logs.md
+++ b/website/docs/reference/global-configs/logs.md
@@ -14,6 +14,9 @@ The `LOG_FORMAT` config specifies how dbt's logs should be formatted. If the val
dbt --log-format json run
{"code": "A001", "data": {"v": "=1.0.0"}, "invocation_id": "1193e449-4b7a-4eb1-8e8e-047a8b3b7973", "level": "info", "log_version": 1, "msg": "Running with dbt=1.0.0", "node_info": {}, "pid": 35098, "thread_name": "MainThread", "ts": "2021-12-03T10:46:59.928217Z", "type": "log_line"}
```
+
+
+
To set the `LOG_FORMAT_FILE` output type for the log file without impacting the console log format, use the `log-format-file` flag.
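+
+For example, a minimal sketch (assuming a dbt version where the `--log-format-file` flag is available):
+
+```bash
+# write JSON-formatted logs to the log file while keeping the default console output
+dbt --log-format-file json run
+```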
@@ -37,8 +40,6 @@ See [structured logging](/reference/events-logging#structured-logging) for more
:::
-
-
### Log Level
@@ -124,7 +125,16 @@ dbt --quiet run
### Color
-You can set the color preferences for the file logs only using the `--use-colors-file / --no-use-colors-file` flags.
+You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags.
+
+
+
+```yaml
+config:
+ use_colors_file: False
+```
+
+
```text
dbt --use-colors-file run
diff --git a/website/docs/reference/global-configs/print-output.md b/website/docs/reference/global-configs/print-output.md
index 83280677229..112b92b546f 100644
--- a/website/docs/reference/global-configs/print-output.md
+++ b/website/docs/reference/global-configs/print-output.md
@@ -74,13 +74,24 @@ config:
use_colors: False
```
+
+
```text
dbt --use-colors run
dbt --no-use-colors run
```
-You can set the color preferences for the file logs only using the `--use-colors-file / --no-use-colors-file` flags.
+You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags.
+
+
+
+```yaml
+config:
+ use_colors_file: False
+```
+
+
```text
dbt --use-colors-file run
@@ -88,5 +99,3 @@ dbt --no-use-colors-file run
```
-
-
\ No newline at end of file
diff --git a/website/docs/reference/global-configs/usage-stats.md b/website/docs/reference/global-configs/usage-stats.md
index ea02fe0bb59..1f9492f4a43 100644
--- a/website/docs/reference/global-configs/usage-stats.md
+++ b/website/docs/reference/global-configs/usage-stats.md
@@ -8,14 +8,14 @@ We want to build the best version of dbt possible, and a crucial part of that is
Usage statistics are fired when dbt is invoked and when models are run. These events contain basic platform information (OS + python version) and metadata such as whether the invocation succeeded, how long it took, an anonymized hash key representing the raw model content, and number of nodes that were run. You can see all the event definitions in [`tracking.py`](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/tracking.py).
-By default this is turned on – you can opt out of event tracking at any time by adding the following to your `profiles.yml` file:
+By default, this is enabled. dbt Core users can opt out of event tracking at any time by adding the following to their `profiles.yml` file:
```yaml
config:
send_anonymous_usage_stats: False
```
-You can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables).
+dbt Core users can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables).
`DO_NOT_TRACK=1` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=False`
`DO_NOT_TRACK=0` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=True`
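+
+For example, a minimal sketch of disabling tracking for a single invocation from a POSIX shell:
+
+```bash
+DO_NOT_TRACK=1 dbt run
+```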
diff --git a/website/docs/reference/macro-properties.md b/website/docs/reference/macro-properties.md
index 91ba52de9ca..91a616ded0d 100644
--- a/website/docs/reference/macro-properties.md
+++ b/website/docs/reference/macro-properties.md
@@ -1,10 +1,13 @@
---
title: Macro properties
+id: macro-properties
---
-Macro properties can be declared in `.yml` files.
+import PropsCallout from '/snippets/_config-prop-callout.md';
-You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders.
+Macro properties can be declared in any `properties.yml` file.
+
+You can name these files `whatever_you_want.yml` and nest them arbitrarily deep in sub-folders.
@@ -27,9 +30,3 @@ macros:
```
-
-
-
-* `v0.16.0`: The ability to declare macro properties was introduced.
-
-
diff --git a/website/docs/reference/model-properties.md b/website/docs/reference/model-properties.md
index 730432c88af..63adc1f0d63 100644
--- a/website/docs/reference/model-properties.md
+++ b/website/docs/reference/model-properties.md
@@ -18,7 +18,7 @@ models:
show: true | false
[latest_version](/reference/resource-properties/latest_version):
[deprecation_date](/reference/resource-properties/deprecation_date):
- [access](/reference/resource-properties/access): private | protected | public
+ [access](/reference/resource-configs/access): private | protected | public
[config](/reference/resource-properties/config):
[](/reference/model-configs):
[constraints](/reference/resource-properties/constraints):
@@ -46,7 +46,7 @@ models:
[description](/reference/resource-properties/description):
[docs](/reference/resource-configs/docs):
show: true | false
- [access](/reference/resource-properties/access): private | protected | public
+ [access](/reference/resource-configs/access): private | protected | public
[constraints](/reference/resource-properties/constraints):
-
[config](/reference/resource-properties/config):
diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md
index 6079e53793a..03c3b2aac12 100644
--- a/website/docs/reference/node-selection/defer.md
+++ b/website/docs/reference/node-selection/defer.md
@@ -2,13 +2,6 @@
title: "Defer"
---
-
-
-- **v0.18.0**: Introduced `--defer` and `--state` flags as beta features.
-- **v0.19.0**: Changed `--defer` to use the current environment's resource, if it exists, and only fall back to the other environment's resource if the first does not. Also added support for `dbt test --defer`.
-
-
-
Defer is a powerful feature that makes it possible to run a subset of models or tests in a [sandbox environment](/docs/environments-in-dbt) without having to first build their upstream parents. This can save time and computational resources when you want to test a small number of models in a large project.
Defer requires that a manifest from a previous dbt invocation be passed to the `--state` flag or env var. Together with the `state:` selection method, these features enable "Slim CI". Read more about [state](/reference/node-selection/syntax#about-node-selection).
@@ -24,16 +17,16 @@ It is possible to use separate state for `state:modified` and `--defer`, by pass
### Usage
```shell
-$ dbt run --select [...] --defer --state path/to/artifacts
-$ dbt test --select [...] --defer --state path/to/artifacts
+dbt run --select [...] --defer --state path/to/artifacts
+dbt test --select [...] --defer --state path/to/artifacts
```
```shell
-$ dbt run --models [...] --defer --state path/to/artifacts
-$ dbt test --models [...] --defer --state path/to/artifacts
+dbt run --models [...] --defer --state path/to/artifacts
+dbt test --models [...] --defer --state path/to/artifacts
```
@@ -108,7 +101,7 @@ I want to test my changes. Nothing exists in my development schema, `dev_alice`.
```shell
-$ dbt run --select model_b
+dbt run --select "model_b"
```
@@ -135,7 +128,7 @@ Unless I had previously run `model_a` into this development environment, `dev_al
```shell
-$ dbt run --select model_b --defer --state prod-run-artifacts
+dbt run --select "model_b" --defer --state prod-run-artifacts
```
@@ -193,7 +186,7 @@ models:
```shell
-dbt test --select model_b
+dbt test --select "model_b"
```
@@ -218,7 +211,7 @@ The `relationships` test requires both `model_a` and `model_b`. Because I did no
```shell
-dbt test --select model_b --defer --state prod-run-artifacts
+dbt test --select "model_b" --defer --state prod-run-artifacts
```
diff --git a/website/docs/reference/node-selection/exclude.md b/website/docs/reference/node-selection/exclude.md
index 9ad4bd1cc0e..d2c140d1bb5 100644
--- a/website/docs/reference/node-selection/exclude.md
+++ b/website/docs/reference/node-selection/exclude.md
@@ -7,19 +7,19 @@ sidebar_label: "Exclude"
dbt provides an `--exclude` flag with the same semantics as `--select`. Models specified with the `--exclude` flag will be removed from the set of models selected with `--select`.
```bash
-$ dbt run --select my_package.*+ --exclude my_package.a_big_model+ # select all models in my_package and their children except a_big_model and its children
+dbt run --select "my_package".*+ --exclude "my_package.a_big_model+" # select all models in my_package and their children except a_big_model and its children
```
Exclude a specific resource by its name or lineage:
```bash
# test
-$ dbt test --exclude not_null_orders_order_id # test all models except the not_null_orders_order_id test
-$ dbt test --exclude orders # test all models except tests associated with the orders model
+dbt test --exclude "not_null_orders_order_id" # test all models except the not_null_orders_order_id test
+dbt test --exclude "orders" # test all models except tests associated with the orders model
# seed
-$ dbt seed --exclude account_parent_mappings # load all seeds except account_parent_mappings
+dbt seed --exclude "account_parent_mappings" # load all seeds except account_parent_mappings
# snapshot
-$ dbt snapshot --exclude snap_order_statuses # execute all snapshots except snap_order_statuses
+dbt snapshot --exclude "snap_order_statuses" # execute all snapshots except snap_order_statuses
```
diff --git a/website/docs/reference/node-selection/graph-operators.md b/website/docs/reference/node-selection/graph-operators.md
index 1e7c88fadfc..8cba43e1b52 100644
--- a/website/docs/reference/node-selection/graph-operators.md
+++ b/website/docs/reference/node-selection/graph-operators.md
@@ -7,9 +7,9 @@ If placed at the front of the model selector, `+` will select all parents of the
```bash
- $ dbt run --select my_model+ # select my_model and all children
- $ dbt run --select +my_model # select my_model and all parents
- $ dbt run --select +my_model+ # select my_model, and all of its parents and children
+dbt run --select "my_model+" # select my_model and all children
+dbt run --select "+my_model" # select my_model and all parents
+dbt run --select "+my_model+" # select my_model, and all of its parents and children
```
@@ -20,9 +20,9 @@ to step through.
```bash
- $ dbt run --select my_model+1 # select my_model and its first-degree children
- $ dbt run --select 2+my_model # select my_model, its first-degree parents, and its second-degree parents ("grandparents")
- $ dbt run --select 3+my_model+4 # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree
+dbt run --select "my_model+1" # select my_model and its first-degree children
+dbt run --select "2+my_model" # select my_model, its first-degree parents, and its second-degree parents ("grandparents")
+dbt run --select "3+my_model+4" # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree
```
@@ -32,14 +32,5 @@ The `@` operator is similar to `+`, but will also include _the parents of the ch
```bash
-$ dbt run --models @my_model # select my_model, its children, and the parents of its children
+dbt run --models @my_model # select my_model, its children, and the parents of its children
```
-
-### The "star" operator
-The `*` operator matches all models within a package or directory.
-
-
- ```bash
- $ dbt run --select snowplow.* # run all of the models in the snowplow package
- $ dbt run --select finance.base.* # run all of the models in models/finance/base
- ```
diff --git a/website/docs/reference/node-selection/methods.md b/website/docs/reference/node-selection/methods.md
index ff86d60c06a..e29612e3401 100644
--- a/website/docs/reference/node-selection/methods.md
+++ b/website/docs/reference/node-selection/methods.md
@@ -34,8 +34,8 @@ The `tag:` method is used to select models that match a specified [tag](/referen
```bash
- $ dbt run --select tag:nightly # run all models with the `nightly` tag
- ```
+dbt run --select "tag:nightly" # run all models with the `nightly` tag
+```
### The "source" method
@@ -43,16 +43,22 @@ The `source` method is used to select models that select from a specified [sourc
```bash
- $ dbt run --select source:snowplow+ # run all models that select from Snowplow sources
- ```
+dbt run --select "source:snowplow+" # run all models that select from Snowplow sources
+```
### The "resource_type" method
-Use the `resource_type` method to select nodes of a particular type (`model`, `source`, `exposure`, etc). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list).
+Use the `resource_type` method to select nodes of a particular type (`model`, `test`, `exposure`, and so on). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list).
```bash
- $ dbt build --select resource_type:exposure # build all resources upstream of exposures
- $ dbt list --select resource_type:test # list all tests in your project
- ```
+dbt build --select "resource_type:exposure" # build all resources upstream of exposures
+dbt list --select "resource_type:test" # list all tests in your project
+```
+
+Note: This method doesn't work for sources, so use the [`--resource-type`](/reference/commands/list) option of the list command instead:
+
+ ```bash
+dbt list --resource-type source
+```
### The "path" method
The `path` method is used to select models/sources defined at or under a specific path.
@@ -63,30 +69,39 @@ selectors unambiguous.
```bash
# These two selectors are equivalent
- dbt run --select path:models/staging/github
- dbt run --select models/staging/github
+ dbt run --select "path:models/staging/github"
+ dbt run --select "models/staging/github"
# These two selectors are equivalent
- dbt run --select path:models/staging/github/stg_issues.sql
- dbt run --select models/staging/github/stg_issues.sql
+ dbt run --select "path:models/staging/github/stg_issues.sql"
+ dbt run --select "models/staging/github/stg_issues.sql"
```
-### The "file" or "fqn" method
-The `file` or `fqn` method can be used to select a model by its filename, including the file extension (`.sql`).
+### The "file" method
+The `file` method can be used to select a model by its filename, including the file extension (`.sql`).
```bash
# These are equivalent
-dbt run --select file:some_model.sql
-dbt run --select some_model.sql
-dbt run --select some_model
-dbt run --select fqn:some_model # fqn is an abbreviation for "fully qualified name"
+dbt run --select "file:some_model.sql"
+dbt run --select "some_model.sql"
+dbt run --select "some_model"
```
+### The "fqn" method
+
+The `fqn` method is used to select nodes based off their "fully qualified names" (FQN) within the dbt graph. The default output of [`dbt list`](/reference/commands/list) is a listing of FQN.
+
+```bash
+dbt run --select "fqn:some_model"
+dbt run --select "fqn:your_project.some_model"
+dbt run --select "fqn:some_package.some_other_model"
+```
+
### The "package" method
The `package` method is used to select models defined within the root project
@@ -96,10 +111,10 @@ selectors unambiguous.
```bash
# These three selectors are equivalent
- dbt run --select package:snowplow
- dbt run --select snowplow
- dbt run --select snowplow.*
- ```
+ dbt run --select "package:snowplow"
+ dbt run --select "snowplow"
+ dbt run --select "snowplow.*"
+```
### The "config" method
@@ -109,10 +124,10 @@ The `config` method is used to select models that match a specified [node config
```bash
- $ dbt run --select config.materialized:incremental # run all models that are materialized incrementally
- $ dbt run --select config.schema:audit # run all models that are created in the `audit` schema
- $ dbt run --select config.cluster_by:geo_country # run all models clustered by `geo_country`
- ```
+dbt run --select "config.materialized:incremental" # run all models that are materialized incrementally
+dbt run --select "config.schema:audit" # run all models that are created in the `audit` schema
+dbt run --select "config.cluster_by:geo_country" # run all models clustered by `geo_country`
+```
@@ -120,7 +135,8 @@ The `config` method is used to select models that match a specified [node config
While most config values are strings, you can also use the `config` method to match boolean configs, dictionary keys, and values in lists.
For example, given a model with the following configurations:
-```
+
+```sql
{{ config(
materialized = 'incremental',
unique_key = ['column_a', 'column_b'],
@@ -133,27 +149,24 @@ select ...
You can select using any of the following:
```bash
-$ dbt ls -s config.materialized:incremental
-$ dbt ls -s config.unique_key:column_a
-$ dbt ls -s config.grants.select:reporter
-$ dbt ls -s config.transient:true
+dbt ls -s config.materialized:incremental
+dbt ls -s config.unique_key:column_a
+dbt ls -s config.grants.select:reporter
+dbt ls -s config.transient:true
```
### The "test_type" method
-
-In v1.0.0, test types were renamed: "singular" (instead of "data") and "generic" (instead of "schema")
-
The `test_type` method is used to select tests based on their type, `singular` or `generic`:
- ```bash
- $ dbt test --select test_type:generic # run all generic tests
- $ dbt test --select test_type:singular # run all singular tests
- ```
+```bash
+dbt test --select "test_type:generic" # run all generic tests
+dbt test --select "test_type:singular" # run all singular tests
+```
### The "test_name" method
@@ -164,10 +177,10 @@ that defines it. For more information about how generic tests are defined, read
```bash
- $ dbt test --select test_name:unique # run all instances of the `unique` test
- $ dbt test --select test_name:equality # run all instances of the `dbt_utils.equality` test
- $ dbt test --select test_name:range_min_max # run all instances of a custom schema test defined in the local project, `range_min_max`
- ```
+dbt test --select "test_name:unique" # run all instances of the `unique` test
+dbt test --select "test_name:equality" # run all instances of the `dbt_utils.equality` test
+dbt test --select "test_name:range_min_max" # run all instances of a custom schema test defined in the local project, `range_min_max`
+```
### The "state" method
@@ -192,9 +205,9 @@ The `state` method is used to select nodes by comparing them against a previous
```bash
- $ dbt test --select state:new # run all tests on new models + and new tests on old models
- $ dbt run --select state:modified # run all models that have been modified
- $ dbt ls --select state:modified # list all modified nodes (not just models)
+dbt test --select "state:new" # run all tests on new models + and new tests on old models
+dbt run --select "state:modified" # run all models that have been modified
+dbt ls --select "state:modified" # list all modified nodes (not just models)
```
@@ -224,41 +237,32 @@ The `exposure` method is used to select parent resources of a specified [exposur
```bash
- $ dbt run --select +exposure:weekly_kpis # run all models that feed into the weekly_kpis exposure
- $ dbt test --select +exposure:* # test all resources upstream of all exposures
- $ dbt ls --select +exposure:* --resource-type source # list all sources upstream of all exposures
- ```
+dbt run --select "+exposure:weekly_kpis" # run all models that feed into the weekly_kpis exposure
+dbt test --select "+exposure:*" # test all resources upstream of all exposures
+dbt ls --select "+exposure:*" --resource-type source # list all sources upstream of all exposures
+```
### The "metric" method
-New in v1.0.0
The `metric` method is used to select parent resources of a specified [metric](/docs/build/metrics). Use in conjunction with the `+` operator.
```bash
-$ dbt build --select +metric:weekly_active_users # build all resources upstream of weekly_active_users metric
-$ dbt ls --select +metric:* --resource-type source # list all source tables upstream of all metrics
+dbt build --select "+metric:weekly_active_users" # build all resources upstream of weekly_active_users metric
+dbt ls --select "+metric:*" --resource-type source # list all source tables upstream of all metrics
```
### The "result" method
-New in v1.0.0
The `result` method is related to the `state` method described above and can be used to select resources based on their result status from a prior run. Note that one of the dbt commands [`run`, `test`, `build`, `seed`] must have been performed in order to create the result on which a result selector operates. You can use `result` selectors in conjunction with the `+` operator.
```bash
-$ dbt run --select result:error --state path/to/artifacts # run all models that generated errors on the prior invocation of dbt run
-$ dbt test --select result:fail --state path/to/artifacts # run all tests that failed on the prior invocation of dbt test
-$ dbt build --select 1+result:fail --state path/to/artifacts # run all the models associated with failed tests from the prior invocation of dbt build
-$ dbt seed --select result:error --state path/to/artifacts # run all seeds that generated errors on the prior invocation of dbt seed.
+dbt run --select "result:error" --state path/to/artifacts # run all models that generated errors on the prior invocation of dbt run
+dbt test --select "result:fail" --state path/to/artifacts # run all tests that failed on the prior invocation of dbt test
+dbt build --select "1+result:fail" --state path/to/artifacts # run all the models associated with failed tests from the prior invocation of dbt build
+dbt seed --select "result:error" --state path/to/artifacts # run all seeds that generated errors on the prior invocation of dbt seed.
```
### The "source_status" method
-
-
-Supported in v1.1 or newer.
-
-
-
-
Supported in v1.1 or higher.
@@ -273,8 +277,8 @@ After issuing one of the above commands, you can reference the source freshness
```bash
# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag.
-$ dbt source freshness # must be run again to compare current to previous state
-$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts
+dbt source freshness # must be run again to compare current to previous state
+dbt build --select "source_status:fresher+" --state path/to/prod/artifacts
```
@@ -283,16 +287,13 @@ $ dbt build --select source_status:fresher+ --state path/to/prod/artifacts
```bash
# You can also set the DBT_STATE environment variable instead of the --state flag.
-$ dbt source freshness # must be run again to compare current to previous state
-$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts
+dbt source freshness # must be run again to compare current to previous state
+dbt build --select "source_status:fresher+" --state path/to/prod/artifacts
```
-
-
-
### The "group" method
@@ -305,9 +306,9 @@ Supported in v1.5 or newer.
The `group` method is used to select models defined within a [group](/reference/resource-configs/group).
- ```bash
- dbt run --select group:finance # run all models that belong to the finance group.
- ```
+```bash
+dbt run --select "group:finance" # run all models that belong to the finance group.
+```
@@ -321,12 +322,12 @@ Supported in v1.5 or newer.
-The `access` method selects models based on their [access](/reference/resource-properties/access) property.
+The `access` method selects models based on their [access](/reference/resource-configs/access) property.
```bash
-dbt list --select access:public # list all public models
-dbt list --select access:private # list all private models
-dbt list --select access:protected # list all protected models
+dbt list --select "access:public" # list all public models
+dbt list --select "access:private" # list all private models
+dbt list --select "access:protected" # list all protected models
```
@@ -344,11 +345,26 @@ Supported in v1.5 or newer.
The `version` method selects [versioned models](/docs/collaborate/govern/model-versions) based on their [version identifier](/reference/resource-properties/versions) and [latest version](/reference/resource-properties/latest_version).
```bash
-dbt list --select version:latest # only 'latest' versions
-dbt list --select version:prerelease # versions newer than the 'latest' version
+dbt list --select "version:latest" # only 'latest' versions
+dbt list --select "version:prerelease" # versions newer than the 'latest' version
dbt list --select "version:old" # versions older than the 'latest' version
-dbt list --select version:none # models that are *not* versioned
+dbt list --select "version:none" # models that are *not* versioned
```
+
+### The "semantic_model" method
+
+Supported in v1.6 or newer.
+
+
+
+The `semantic_model` method selects [semantic models](/docs/build/semantic-models).
+
+```bash
+dbt list --select "semantic_model:*" # list all semantic models
+dbt list --select "+semantic_model:orders" # list your semantic model named "orders" and all upstream resources
+```
+
+
\ No newline at end of file
diff --git a/website/docs/reference/node-selection/putting-it-together.md b/website/docs/reference/node-selection/putting-it-together.md
index 8faf02e6cc9..48fc5188b32 100644
--- a/website/docs/reference/node-selection/putting-it-together.md
+++ b/website/docs/reference/node-selection/putting-it-together.md
@@ -4,16 +4,16 @@ title: "Putting it together"
```bash
- $ dbt run --select my_package.*+ # select all models in my_package and their children
- $ dbt run --select +some_model+ # select some_model and all parents and children
+dbt run --select "my_package.*+" # select all models in my_package and their children
+dbt run --select "+some_model+" # select some_model and all parents and children
- $ dbt run --select tag:nightly+ # select "nightly" models and all children
- $ dbt run --select +tag:nightly+ # select "nightly" models and all parents and children
+dbt run --select "tag:nightly+" # select "nightly" models and all children
+dbt run --select "+tag:nightly+" # select "nightly" models and all parents and children
- $ dbt run --select @source:snowplow # build all models that select from snowplow sources, plus their parents
+dbt run --select "@source:snowplow" # build all models that select from snowplow sources, plus their parents
- $ dbt test --select config.incremental_strategy:insert_overwrite,test_name:unique # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy
- ```
+dbt test --select "config.incremental_strategy:insert_overwrite,test_name:unique" # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy
+```
@@ -22,8 +22,8 @@ and feed exports, while _excluding_ the biggest incremental models (and one othe
```bash
- $ dbt run --select @source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing
- ```
+dbt run --select "@source:snowplow,tag:nightly models/export" --exclude "package:snowplow,config.materialized:incremental export_performance_timing"
+```
This command selects all models that:
diff --git a/website/docs/reference/node-selection/set-operators.md b/website/docs/reference/node-selection/set-operators.md
index 7d6b6c2411c..af399b9cad5 100644
--- a/website/docs/reference/node-selection/set-operators.md
+++ b/website/docs/reference/node-selection/set-operators.md
@@ -11,7 +11,7 @@ Run snowplow_sessions, all ancestors of snowplow_sessions, fct_orders, and all a
```bash
- $ dbt run --select +snowplow_sessions +fct_orders
+dbt run --select "+snowplow_sessions +fct_orders"
```
### Intersections
@@ -22,15 +22,15 @@ Run all the common ancestors of snowplow_sessions and fct_orders:
```bash
- $ dbt run --select +snowplow_sessions,+fct_orders
- ```
+dbt run --select "+snowplow_sessions,+fct_orders"
+```
Run all the common descendents of stg_invoices and stg_accounts:
```bash
- $ dbt run --select stg_invoices+,stg_accounts+
+dbt run --select "stg_invoices+,stg_accounts+"
```
@@ -38,5 +38,5 @@ Run models that are in the marts/finance subdirectory *and* tagged nightly:
```bash
- $ dbt run --select marts.finance,tag:nightly
- ```
+dbt run --select "marts.finance,tag:nightly"
+```
diff --git a/website/docs/reference/node-selection/state-comparison-caveats.md b/website/docs/reference/node-selection/state-comparison-caveats.md
index 6ae156fddcf..73947c80a66 100644
--- a/website/docs/reference/node-selection/state-comparison-caveats.md
+++ b/website/docs/reference/node-selection/state-comparison-caveats.md
@@ -27,8 +27,8 @@ The command `dbt test -s state:modified` will include both:
As long as you're adding or changing tests at the same time that you're adding or changing the resources (models, seeds, snapshots) they select from, all should work the way you expect with "simple" state selection:
```shell
-$ dbt run -s state:modified
-$ dbt test -s state:modified
+dbt run -s "state:modified"
+dbt test -s "state:modified"
```
This can get complicated, however. If you add a new test without modifying its underlying model, or add a test that selects from a new model and an old unmodified one, you may need to test a model without having first run it.
@@ -36,8 +36,8 @@ This can get complicated, however. If you add a new test without modifying its u
In v0.18.0, you needed to handle this by building the unmodified models needed for modified tests:
```shell
-$ dbt run -s state:modified @state:modified,1+test_type:data
-$ dbt test -s state:modified
+dbt run -s "state:modified @state:modified,1+test_type:data"
+dbt test -s "state:modified"
```
In v0.19.0, dbt added support for deferring upstream references when testing. If a test selects from a model that doesn't exist as a database object in your current environment, dbt will look to the other environment instead—the one defined in your state manifest. This enables you to use "simple" state selection without risk of query failure, but it may have some surprising consequences for tests with multiple parents. For instance, if you have a `relationships` test that depends on one modified model and one unmodified model, the test query will select from data "across" two different environments. If you limit or sample your data in development and CI, it may not make much sense to test for referential integrity, knowing there's a good chance of mismatch.
@@ -45,8 +45,8 @@ In v0.19.0, dbt added support for deferring upstream references when testing. If
If you're a frequent user of `relationships` tests or data tests, or frequently find yourself adding tests without modifying their underlying models, consider tweaking the selection criteria of your CI job. For instance:
```shell
-$ dbt run -s state:modified
-$ dbt test -s state:modified --exclude test_name:relationships
+dbt run -s "state:modified"
+dbt test -s "state:modified" --exclude "test_name:relationships"
```
### False positives
@@ -58,14 +58,7 @@ State comparison works by identifying discrepancies between two manifests. Thos
dbt will do its best to capture *only* changes that are the result of modifications made in development. In projects with intricate env-aware logic, dbt will err on the side of running too many models (i.e. false positives). Over the next several versions of dbt, we're working on:
- iterative improvements to dbt's built-in detective abilities
-- better options for more complex projects, in the form of more-specific subselectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704))
-
-
-
-- v0.18.0: All env-aware logic results in false positives during state comparison, when comparing against a manifest generated with a different target.
-- v0.19.0: dbt stores and compares unrendered Jinja expressions for configurations, allowing it to see past env-aware logic in `dbt_project.yml`.
-
-
+- better options for more complex projects, in the form of more-specific sub-selectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704))
State comparison is now able to detect env-aware config in `dbt_project.yml`. For instance, this target-based config would register as a modification in v0.18.0, but in v0.19.0 it no longer will:
diff --git a/website/docs/reference/node-selection/syntax.md b/website/docs/reference/node-selection/syntax.md
index 1a43a32e2bc..d0ea4a9acd8 100644
--- a/website/docs/reference/node-selection/syntax.md
+++ b/website/docs/reference/node-selection/syntax.md
@@ -14,6 +14,7 @@ dbt's node selection syntax makes it possible to run only specific resources in
| [compile](/reference/commands/compile) | `--select`, `--exclude`, `--selector`, `--inline` |
| [freshness](/reference/commands/source) | `--select`, `--exclude`, `--selector` |
| [build](/reference/commands/build) | `--select`, `--exclude`, `--selector`, `--resource-type`, `--defer` |
+| [docs generate](/reference/commands/cmd-docs) | `--select`, `--exclude`, `--selector` |
:::info Nodes and resources
@@ -24,6 +25,8 @@ We use the terms "
By default, `dbt run` executes _all_ of the models in the dependency graph; `dbt seed` creates all seeds, `dbt snapshot` performs every snapshot. The `--select` flag is used to specify a subset of nodes to execute.
+To follow [POSIX standards](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap12.html) and make things easier to understand, we recommend CLI users use quotes when passing arguments to the `--select` or `--exclude` option (including single or multiple space-delimited, or comma-delimited arguments). Not using quotes might not work reliably on all operating systems, terminals, and user interfaces. For example, `dbt run --select "my_dbt_project_name"` runs all models in your project.
+
### How does selection work?
1. dbt gathers all the resources that are matched by one or more of the `--select` criteria, in the order of selection methods (e.g. `tag:`), then graph operators (e.g. `+`), then finally set operators ([unions](/reference/node-selection/set-operators#unions), [intersections](/reference/node-selection/set-operators#intersections), [exclusions](/reference/node-selection/exclude)).
@@ -51,28 +54,28 @@ Examples:
```bash
- $ dbt run --select my_dbt_project_name # runs all models in your project
- $ dbt run --select my_dbt_model # runs a specific model
- $ dbt run --select path.to.my.models # runs all models in a specific directory
- $ dbt run --select my_package.some_model # run a specific model in a specific package
- $ dbt run --select tag:nightly # run models with the "nightly" tag
- $ dbt run --select path/to/models # run models contained in path/to/models
- $ dbt run --select path/to/my_model.sql # run a specific model by its path
+dbt run --select "my_dbt_project_name" # runs all models in your project
+dbt run --select "my_dbt_model" # runs a specific model
+dbt run --select "path.to.my.models" # runs all models in a specific directory
+dbt run --select "my_package.some_model" # run a specific model in a specific package
+dbt run --select "tag:nightly" # run models with the "nightly" tag
+dbt run --select "path/to/models" # run models contained in path/to/models
+dbt run --select "path/to/my_model.sql" # run a specific model by its path
```
dbt supports a shorthand language for defining subsets of nodes. This language uses the characters `+`, `@`, `*`, and `,`.
```bash
- # multiple arguments can be provided to --select
- $ dbt run --select my_first_model my_second_model
+# multiple arguments can be provided to --select
+ dbt run --select "my_first_model my_second_model"
- # these arguments can be projects, models, directory paths, tags, or sources
- $ dbt run --select tag:nightly my_model finance.base.*
+# these arguments can be projects, models, directory paths, tags, or sources
+dbt run --select "tag:nightly my_model finance.base.*"
- # use methods and intersections for more complex selectors
- $ dbt run --select path:marts/finance,tag:nightly,config.materialized:table
- ```
+# use methods and intersections for more complex selectors
+dbt run --select "path:marts/finance,tag:nightly,config.materialized:table"
+```
As your selection logic gets more complex, and becomes unwieldy to type out as command-line arguments,
consider using a [yaml selector](/reference/node-selection/yaml-selectors). You can use a predefined definition with the `--selector` flag.
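+
+For example, a sketch of invoking a predefined selector (`nightly_models` is a hypothetical selector defined in `selectors.yml`):
+
+```bash
+dbt run --selector nightly_models
+```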
@@ -93,7 +96,7 @@ by comparing code in the current project against the state manifest.
- [Deferring](/reference/node-selection/defer) to another environment, whereby dbt can identify upstream, unselected resources that don't exist in your current environment and instead "defer" their references to the environment provided by the state manifest.
- The [`dbt clone` command](/reference/commands/clone), whereby dbt can clone nodes based on their location in the manifest provided to the `--state` flag.
-Together, the `state:` selector and deferral enable ["slim CI"](/guides/legacy/best-practices#run-only-modified-models-to-test-changes-slim-ci). We expect to add more features in future releases that can leverage artifacts passed to the `--state` flag.
+Together, the `state:` selector and deferral enable ["slim CI"](/best-practices/best-practice-workflows#run-only-modified-models-to-test-changes-slim-ci). We expect to add more features in future releases that can leverage artifacts passed to the `--state` flag.
### Establishing state
@@ -150,7 +153,7 @@ After issuing one of the above commands, you can reference the results by adding
```bash
# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag.
-$ dbt run --select result:<status> --defer --state path/to/prod/artifacts
+dbt run --select "result:<status>" --defer --state path/to/prod/artifacts
```
The available options depend on the resource (node) type:
@@ -169,19 +172,11 @@ The available options depend on the resource (node) type:
The state and result selectors can also be combined in a single invocation of dbt to capture errors from a previous run OR any new or modified models.
```bash
-$ dbt run --select result:<status>+ state:modified+ --defer --state ./
+dbt run --select "result:<status>+ state:modified+" --defer --state ./
```
### Fresh rebuilds
-
-
-Only supported by v1.1 or newer.
-
-
-
-
-
Only supported by v1.1 or newer.
When a job is selected, dbt Cloud will surface the artifacts from that job's most recent successful run. dbt will then use those artifacts to determine the set of fresh sources. In your job commands, you can signal to dbt to run and test only on these fresher sources and their children by including the `source_status:fresher+` argument. This requires both previous and current state to have the `sources.json` artifact be available. Or plainly said, both job states need to run `dbt source freshness`.
@@ -191,21 +186,13 @@ As example:
```bash
# Command step order
dbt source freshness
-dbt build --select source_status:fresher+
+dbt build --select "source_status:fresher+"
```
-
-For more example commands, refer to [Pro-tips for workflows](/guides/legacy/best-practices.md#pro-tips-for-workflows).
+For more example commands, refer to [Pro-tips for workflows](/best-practices/best-practice-workflows#pro-tips-for-workflows).
### The "source_status" status
-
-
-Only supported by v1.1 or newer.
-
-
-
-
Only supported by v1.1 or newer.
@@ -218,7 +205,6 @@ After issuing one of the above commands, you can reference the source freshness
```bash
# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag.
-$ dbt source freshness # must be run again to compare current to previous state
-$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts
+dbt source freshness # must be run again to compare current to previous state
+dbt build --select "source_status:fresher+" --state path/to/prod/artifacts
```
-
diff --git a/website/docs/reference/node-selection/test-selection-examples.md b/website/docs/reference/node-selection/test-selection-examples.md
index 85141c8cd01..feb3898c230 100644
--- a/website/docs/reference/node-selection/test-selection-examples.md
+++ b/website/docs/reference/node-selection/test-selection-examples.md
@@ -11,34 +11,22 @@ Like all resource types, tests can be selected **directly**, by methods and oper
Unlike other resource types, tests can also be selected **indirectly**. If a selection method or operator includes a test's parent(s), the test will also be selected. [See below](#indirect-selection) for more details.
-
-
- `v1.0.0`: Renamed the `--greedy` flag/property to `indirect_selection`, and set its default back to "eager" (pre-v0.20). You can achieve the "cautious" behavior introduced in v0.20 by setting the flag/property to `cautious`.
-
-
-
Test selection is powerful, and we know it can be tricky. To that end, we've included lots of examples below:
### Direct selection
-
-
-`v1.0.0`: Renamed test types: "generic" (formerly "schema") and "singular" (formerly "data"). Removed support for the `--schema` and `--data` flags.
-
-
-
Run generic tests only:
```bash
- $ dbt test --select test_type:generic
+ dbt test --select "test_type:generic"
```
Run singular tests only:
```bash
- $ dbt test --select test_type:singular
+ dbt test --select "test_type:singular"
```
In both cases, `test_type` checks a property of the test itself. These are forms of "direct" test selection.
@@ -99,8 +87,8 @@ By default, a test will run when ANY parent is selected; we call this "eager" in
In this mode, any test that depends on unbuilt resources will raise an error.
```shell
-$ dbt test --select orders
-$ dbt build --select orders
+dbt test --select "orders"
+dbt build --select "orders"
```
@@ -114,8 +102,10 @@ It will only include tests whose references are each within the selected nodes.
Put another way, it will prevent tests from running if one or more of its parents is unselected.
```shell
-$ dbt test --select orders --indirect-selection=cautious
-$ dbt build --select orders --indirect-selection=cautious
+
+dbt test --select "orders" --indirect-selection=cautious
+dbt build --select "orders" --indirect-selection=cautious
+
```
@@ -134,8 +124,8 @@ By default, a test will run when ANY parent is selected; we call this "eager" in
In this mode, any test that depends on unbuilt resources will raise an error.
```shell
-$ dbt test --select orders
-$ dbt build --select orders
+dbt test --select "orders"
+dbt build --select "orders"
```
@@ -149,8 +139,10 @@ It will only include tests whose references are each within the selected nodes.
Put another way, it will prevent tests from running if one or more of its parents is unselected.
```shell
-$ dbt test --select orders --indirect-selection=cautious
-$ dbt build --select orders --indirect-selection=cautious
+
+dbt test --select "orders" --indirect-selection=cautious
+dbt build --select "orders" --indirect-selection=cautious
+
```
@@ -164,8 +156,9 @@ It will only include tests whose references are each within the selected nodes (
This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input).
```shell
-$ dbt test --select orders --indirect-selection=buildable
-$ dbt build --select orders --indirect-selection=buildable
+dbt test --select "orders" --indirect-selection=buildable
+dbt build --select "orders" --indirect-selection=buildable
+
```
@@ -184,8 +177,8 @@ By default, a test will run when ANY parent is selected; we call this "eager" in
In this mode, any test that depends on unbuilt resources will raise an error.
```shell
-$ dbt test --select orders
-$ dbt build --select orders
+dbt test --select "orders"
+dbt build --select "orders"
```
@@ -199,8 +192,9 @@ It will only include tests whose references are each within the selected nodes.
Put another way, it will prevent tests from running if one or more of its parents is unselected.
```shell
-$ dbt test --select orders --indirect-selection=cautious
-$ dbt build --select orders --indirect-selection=cautious
+dbt test --select "orders" --indirect-selection=cautious
+dbt build --select "orders" --indirect-selection=cautious
+
```
@@ -214,8 +208,8 @@ It will only include tests whose references are each within the selected nodes (
This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input).
```shell
-$ dbt test --select orders --indirect-selection=buildable
-$ dbt build --select orders --indirect-selection=buildable
+dbt test --select "orders" --indirect-selection=buildable
+dbt build --select "orders" --indirect-selection=buildable
```
@@ -225,8 +219,10 @@ $ dbt build --select orders --indirect-selection=buildable
This mode will only include tests whose references are each within the selected nodes and will ignore all tests from attached nodes.
```shell
-$ dbt test --select orders --indirect-selection=empty
-$ dbt build --select orders --indirect-selection=empty
+
+dbt test --select "orders" --indirect-selection=empty
+dbt build --select "orders" --indirect-selection=empty
+
```
@@ -246,22 +242,25 @@ The following examples should feel somewhat familiar if you're used to executing
```bash
# Run tests on a model (indirect selection)
- $ dbt test --select customers
+ dbt test --select "customers"
+
+ # Run tests on two or more specific models (indirect selection)
+ dbt test --select "customers orders"
# Run tests on all models in the models/staging/jaffle_shop directory (indirect selection)
- $ dbt test --select staging.jaffle_shop
+ dbt test --select "staging.jaffle_shop"
# Run tests downstream of a model (note this will select those tests directly!)
- $ dbt test --select stg_customers+
+ dbt test --select "stg_customers+"
# Run tests upstream of a model (indirect selection)
- $ dbt test --select +stg_customers
+ dbt test --select "+stg_customers"
# Run tests on all models with a particular tag (direct + indirect)
- $ dbt test --select tag:my_model_tag
+ dbt test --select "tag:my_model_tag"
# Run tests on all models with a particular materialization (indirect selection)
- $ dbt test --select config.materialized:table
+ dbt test --select "config.materialized:table"
```
@@ -270,16 +269,20 @@ The following examples should feel somewhat familiar if you're used to executing
```bash
# tests on all sources
- $ dbt test --select source:*
+
+ dbt test --select "source:*"
# tests on one source
- $ dbt test --select source:jaffle_shop
+ dbt test --select "source:jaffle_shop"
+
+ # tests on two or more specific sources
+ dbt test --select "source:jaffle_shop source:raffle_bakery"
# tests on one source table
- $ dbt test --select source:jaffle_shop.customers
+ dbt test --select "source:jaffle_shop.customers"
# tests on everything _except_ sources
- $ dbt test --exclude source:*
+ dbt test --exclude "source:*"
```
### More complex selection
@@ -288,10 +291,12 @@ Through the combination of direct and indirect selection, there are many ways to
```bash
- $ dbt test --select assert_total_payment_amount_is_positive # directly select the test by name
- $ dbt test --select payments,test_type:singular # indirect selection, v1.2
- $ dbt test --select payments,test_type:data # indirect selection, v0.18.0
- $ dbt test --select payments --data # indirect selection, earlier versions
+
+ dbt test --select "assert_total_payment_amount_is_positive" # directly select the test by name
+ dbt test --select "payments,test_type:singular" # indirect selection, v1.2
+ dbt test --select "payments,test_type:data" # indirect selection, v0.18.0
+ dbt test --select "payments" --data # indirect selection, earlier versions
+
```
@@ -300,13 +305,14 @@ Through the combination of direct and indirect selection, there are many ways to
```bash
# Run tests on all models with a particular materialization
- $ dbt test --select config.materialized:table
+ dbt test --select "config.materialized:table"
# Run tests on all seeds, which use the 'seed' materialization
- $ dbt test --select config.materialized:seed
+ dbt test --select "config.materialized:seed"
# Run tests on all snapshots, which use the 'snapshot' materialization
- $ dbt test --select config.materialized:snapshot
+ dbt test --select "config.materialized:snapshot"
+
```
Note that this functionality may change in future versions of dbt.
@@ -324,8 +330,8 @@ models:
- name: orders
columns:
- name: order_id
- tests:
tags: [my_column_tag]
+ tests:
- unique
```
@@ -334,7 +340,8 @@ models:
```bash
- $ dbt test --select tag:my_column_tag
+ dbt test --select "tag:my_column_tag"
+
```
Currently, tests "inherit" tags applied to columns, sources, and source tables. They do _not_ inherit tags applied to models, seeds, or snapshots. In all likelihood, those tests would still be selected indirectly, because the tag selects its parent. This is a subtle distinction, and it may change in future versions of dbt.
@@ -362,5 +369,6 @@ models:
```bash
- $ dbt test --select tag:my_test_tag
+ dbt test --select "tag:my_test_tag"
+
```
diff --git a/website/docs/reference/node-selection/yaml-selectors.md b/website/docs/reference/node-selection/yaml-selectors.md
index 78342e32779..1e3f8d8d1e2 100644
--- a/website/docs/reference/node-selection/yaml-selectors.md
+++ b/website/docs/reference/node-selection/yaml-selectors.md
@@ -34,6 +34,7 @@ Each `definition` is comprised of one or more arguments, which can be one of the
Use the `union` and `intersection` operator-equivalent keywords to organize multiple arguments.
### CLI-style
+
```yml
definition:
'tag:nightly'
@@ -42,6 +43,7 @@ definition:
This simple syntax supports use of the `+`, `@`, and `*` [graph](/reference/node-selection/graph-operators) operators, but it does not support [set](/reference/node-selection/set-operators) operators or `exclude`.
### Key-value
+
```yml
definition:
tag: nightly
@@ -317,7 +319,7 @@ selectors:
Then in our job definition:
```bash
-$ dbt run --selector nightly_diet_snowplow
+dbt run --selector nightly_diet_snowplow
```
## Default
@@ -325,6 +327,7 @@ $ dbt run --selector nightly_diet_snowplow
Selectors may define a boolean `default` property. If a selector has `default: true`, dbt will use this selector's criteria when tasks do not define their own selection criteria.
Let's say we define a default selector that only selects resources defined in our root project:
+
```yml
selectors:
- name: root_project_only
@@ -338,16 +341,18 @@ selectors:
```
If I run an "unqualified" command, dbt will use the selection criteria defined in `root_project_only`—that is, dbt will only build / freshness check / generate compiled SQL for resources defined in my root project.
+
```
-$ dbt build
-$ dbt source freshness
-$ dbt docs generate
+dbt build
+dbt source freshness
+dbt docs generate
```
If I run a command that defines its own selection criteria (via `--select`, `--exclude`, or `--selector`), dbt will ignore the default selector and use the flag criteria instead. It will not try to combine the two.
-```
-$ dbt run --select model_a
-$ dbt run --exclude model_a
+
+```bash
+dbt run --select "model_a"
+dbt run --exclude "model_a"
```
Only one selector may set `default: true` for a given invocation; otherwise, dbt will return an error. You may use a Jinja expression to adjust the value of `default` depending on the environment, however:
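
A sketch of what that could look like (the selector names are illustrative; this assumes dbt's `as_bool` Jinja filter is available when rendering `selectors.yml`):

```yml
selectors:
  - name: default_for_dev
    default: "{{ target.name == 'dev' | as_bool }}"
    definition:
      method: tag
      value: nightly

  - name: default_for_prod
    default: "{{ target.name == 'prod' | as_bool }}"
    definition:
      method: tag
      value: nightly
```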
diff --git a/website/docs/reference/programmatic-invocations.md b/website/docs/reference/programmatic-invocations.md
index 8bd9bf84047..dfd5bae09e6 100644
--- a/website/docs/reference/programmatic-invocations.md
+++ b/website/docs/reference/programmatic-invocations.md
@@ -2,7 +2,7 @@
title: "Programmatic invocations"
---
-In v1.5, dbt-core added support for programmatic invocations. The intent is to expose the existing dbt CLI via a Python entry point, such that top-level commands are callable from within a Python script or application.
+In v1.5, dbt-core added support for programmatic invocations. The intent is to expose the existing dbt Core CLI via a Python entry point, such that top-level commands are callable from within a Python script or application.
The entry point is a `dbtRunner` class, which allows you to `invoke` the same commands as on the CLI.
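
For illustration, a minimal sketch of a programmatic invocation (mirroring the CLI command `dbt run --select tag:my_tag`; the tag name is arbitrary):

```python
from dbt.cli.main import dbtRunner, dbtRunnerResult

# initialize the runner
dbt = dbtRunner()

# create CLI-style args as a list of strings
cli_args = ["run", "--select", "tag:my_tag"]

# run the command
res: dbtRunnerResult = dbt.invoke(cli_args)

# inspect the results
if res.success:
    for r in res.result:
        print(f"{r.node.name}: {r.status}")
else:
    print(res.exception)
```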
@@ -30,7 +30,7 @@ Each command returns a `dbtRunnerResult` object, which has three attributes:
- `result`: If the command completed (successfully or with handled errors), its result(s). Return type varies by command.
- `exception`: If the dbt invocation encountered an unhandled error and did not complete, the exception it encountered.
-There is a 1:1 correspondence between [CLI exit codes](reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation:
+There is a 1:1 correspondence between [CLI exit codes](/reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation:
| Scenario | CLI Exit Code | `success` | `result` | `exception` |
|---------------------------------------------------------------------------------------------|--------------:|-----------|-------------------|-------------|
diff --git a/website/docs/reference/project-configs/asset-paths.md b/website/docs/reference/project-configs/asset-paths.md
index 97204923cb9..1fb3cf9f260 100644
--- a/website/docs/reference/project-configs/asset-paths.md
+++ b/website/docs/reference/project-configs/asset-paths.md
@@ -15,12 +15,6 @@ asset-paths: [directorypath]
## Definition
Optionally specify a custom list of directories to copy to the `target` directory as part of the `docs generate` command. This is useful for rendering images in your repository in your project documentation.
-
-
-* `v0.18.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details.
-
-
-
## Default
By default, dbt will not copy any additional files as part of docs generate, i.e. `asset-paths: []`
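
For instance, to copy an `assets` directory (an illustrative name) into `target` during `docs generate`, a config along these lines could be used:

```yml
asset-paths: ["assets"]
```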
diff --git a/website/docs/reference/project-configs/clean-targets.md b/website/docs/reference/project-configs/clean-targets.md
index 119630b00b1..9b464840723 100644
--- a/website/docs/reference/project-configs/clean-targets.md
+++ b/website/docs/reference/project-configs/clean-targets.md
@@ -3,12 +3,6 @@ datatype: [directorypath]
default_value: [target_path]
---
-
-
-- **v1.0.0:** The `modules-path` has been updated to be [`packages-install-path`](/reference/project-configs/packages-install-path). The default value has also been updated to be `dbt-packages` from `dbt-modules`.
-
-
-
```yml
diff --git a/website/docs/reference/project-configs/config-version.md b/website/docs/reference/project-configs/config-version.md
index 20947c03d62..804caf1328f 100644
--- a/website/docs/reference/project-configs/config-version.md
+++ b/website/docs/reference/project-configs/config-version.md
@@ -20,12 +20,7 @@ config-version: 2
## Definition
Specify your `dbt_project.yml` as using the v2 structure.
-
-
-* `v0.17.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details.
-* `v1.5.0`: This configuration was made optional.
-
-
+ This configuration is optional.
## Default
Without this configuration, dbt will assume your `dbt_project.yml` uses the version 1 syntax, which was deprecated in dbt v0.19.0.
diff --git a/website/docs/reference/project-configs/log-path.md b/website/docs/reference/project-configs/log-path.md
index daab17c5f10..29cad35d120 100644
--- a/website/docs/reference/project-configs/log-path.md
+++ b/website/docs/reference/project-configs/log-path.md
@@ -47,12 +47,21 @@ The precedence order is: CLI flag > env var > `dbt_project.yml`
## Examples
-### Write logs to a subdirectory named `dbt_logs` instead of `logs`
+### Specify subdirectory using the project config file
```yml
log-path: dbt_logs
```
-
+
+
+
+### Specify subdirectory from the command line
+
+```bash
+dbt --log-path dbt_logs run
+```
+
+
diff --git a/website/docs/reference/project-configs/model-paths.md b/website/docs/reference/project-configs/model-paths.md
index 2129747af27..a0652432787 100644
--- a/website/docs/reference/project-configs/model-paths.md
+++ b/website/docs/reference/project-configs/model-paths.md
@@ -2,11 +2,6 @@
datatype: [directorypath]
default_value: [models]
---
-
-
-- **v1.0.0:** The config `source-paths` has been deprecated in favor of `model-paths`.
-
-
diff --git a/website/docs/reference/project-configs/on-run-start-on-run-end.md b/website/docs/reference/project-configs/on-run-start-on-run-end.md
index 2c5cde4c0c2..e1a3d7b761a 100644
--- a/website/docs/reference/project-configs/on-run-start-on-run-end.md
+++ b/website/docs/reference/project-configs/on-run-start-on-run-end.md
@@ -4,6 +4,8 @@ description: "Read this guide to understand the on-run-start and on-run-end conf
datatype: sql-statement | [sql-statement]
---
+import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md';
+
```yml
@@ -15,14 +17,8 @@ on-run-end: sql-statement | [sql-statement]
## Definition
-A SQL statement (or list of SQL statements) to be run at the start, or end, of the following commands:
-- `dbt run`
-- `dbt test`
-- `dbt seed`
-- `dbt snapshot`
-- `dbt build`
-- `dbt compile`
-- `dbt docs generate`
+
+A SQL statement (or list of SQL statements) to be run at the start or end of the following commands:
`on-run-start` and `on-run-end` hooks can also call macros that return SQL statements
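
As a sketch, assuming your project defines a macro named `grant_select` that returns a grant statement, a hook could call it like this:

```yml
on-run-end:
  - "{{ grant_select(schemas) }}"
```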
@@ -33,34 +29,6 @@ A SQL statement (or list of SQL statements) to be run at the start, or end, of t
-
-
-### Grant privileges at the end of a run
-
-
-
-```yml
-on-run-end: "grant select on all tables in schema {{ target.schema }} group transformer"
-
-```
-
-
-
-### Grant multiple privileges at the end of a run
-
-
-
-```yml
-on-run-end:
- - "grant usage on schema {{ target.schema }} to group reporter"
- - "grant select on all tables in schema {{ target.schema }} group reporter"
-
-```
-
-
-
-
-
### Grant privileges on all schemas that dbt uses at the end of a run
This leverages the [schemas](/reference/dbt-jinja-functions/schemas) variable that is only available in an `on-run-end` hook.
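
For example, a sketch of looping over `schemas` to grant usage (the `reporter` group name is illustrative):

```yml
on-run-end:
  - "{% for schema in schemas %}grant usage on schema {{ schema }} to group reporter;{% endfor %}"
```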
diff --git a/website/docs/reference/project-configs/packages-install-path.md b/website/docs/reference/project-configs/packages-install-path.md
index 98142305357..157c630fd36 100644
--- a/website/docs/reference/project-configs/packages-install-path.md
+++ b/website/docs/reference/project-configs/packages-install-path.md
@@ -3,12 +3,6 @@ datatype: directorypath
default_value: dbt_packages
---
-
-
-- **v1.0.0:** The default config has changed from `modules-path` to `packages-install-path` with a new default value of `dbt_packages`.
-
-
-
```yml
diff --git a/website/docs/reference/project-configs/query-comment.md b/website/docs/reference/project-configs/query-comment.md
index 4d72bd4fcff..b1a73605e55 100644
--- a/website/docs/reference/project-configs/query-comment.md
+++ b/website/docs/reference/project-configs/query-comment.md
@@ -30,14 +30,6 @@ A string to inject as a comment in each query that dbt runs against your databas
The `query-comment` configuration can also call a macro that returns a string.
-
-
-* `v0.15.0`: The `query-comment` configuration was introduced
-* `v0.16.1`: Dictionary syntax introduced to allow comments to be appended
-* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels
-
-
-
## Default
By default, dbt will insert a comment at the top of your query containing information such as the dbt version, profile and target names, and node ids for the resources it runs. For example:
@@ -149,13 +141,6 @@ select ...
### BigQuery: include query comment items as job labels
-
-
-
-* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels
-
-
-
If `query-comment.job-label` is set to true, dbt will include the query comment items (if a dictionary) or the comment string as job labels on the query it executes. These will be included in addition to labels specified in the [BigQuery-specific config](/reference/project-configs/query-comment#bigquery-include-query-comment-items-as-job-labels).
diff --git a/website/docs/reference/project-configs/quoting.md b/website/docs/reference/project-configs/quoting.md
index 92968ace1bd..821b920188c 100644
--- a/website/docs/reference/project-configs/quoting.md
+++ b/website/docs/reference/project-configs/quoting.md
@@ -28,13 +28,6 @@ Note that for BigQuery quoting configuration, `database` and `schema` should be
:::
-
-
-* `v0.10.1`: This configuration was introduced with a default value of `true` for each adapter.
-* `v0.11.0`: The default quoting config on Snowflake changed from `true` to `false`
-
-
-
## Default
The default values vary by database.
diff --git a/website/docs/reference/project-configs/require-dbt-version.md b/website/docs/reference/project-configs/require-dbt-version.md
index 892495dde45..85a502bff60 100644
--- a/website/docs/reference/project-configs/require-dbt-version.md
+++ b/website/docs/reference/project-configs/require-dbt-version.md
@@ -19,12 +19,6 @@ When you set this configuration, dbt sends a helpful error message for any user
If this configuration is not specified, no version check will occur.
-
-
-* `v0.13.0`: This configuration was introduced
-
-
-
:::info YAML Quoting
This configuration needs to be interpolated by the YAML parser as a string. As such, you should quote the value of the configuration, taking care to avoid whitespace. For example:
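
A quoted range like the following works (the bounds shown are illustrative):

```yml
require-dbt-version: ">=1.0.0,<2.0.0"
```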
diff --git a/website/docs/reference/project-configs/seed-paths.md b/website/docs/reference/project-configs/seed-paths.md
index 92f7c5aa91f..614bda62cd2 100644
--- a/website/docs/reference/project-configs/seed-paths.md
+++ b/website/docs/reference/project-configs/seed-paths.md
@@ -3,12 +3,6 @@ datatype: [directorypath]
default_value: [data]
---
-
-
-- **v1.0.0:** The config `data-paths` has been deprecated in favor of `seed-paths`.
-
-
-
```yml
diff --git a/website/docs/reference/project-configs/snapshot-paths.md b/website/docs/reference/project-configs/snapshot-paths.md
index a623d48b20f..81b2759609d 100644
--- a/website/docs/reference/project-configs/snapshot-paths.md
+++ b/website/docs/reference/project-configs/snapshot-paths.md
@@ -14,12 +14,6 @@ snapshot-paths: [directorypath]
## Definition
Optionally specify a custom list of directories where [snapshots](/docs/build/snapshots) are located. Note that you cannot co-locate models and snapshots.
-
-
-* `v0.14.0`: Snapshots were introduced
-
-
-
## Default
By default, dbt will search for snapshots in the `snapshots` directory, i.e. `snapshot-paths: ["snapshots"]`
diff --git a/website/docs/reference/project-configs/target-path.md b/website/docs/reference/project-configs/target-path.md
index 54458efe512..fddc5a93c5e 100644
--- a/website/docs/reference/project-configs/target-path.md
+++ b/website/docs/reference/project-configs/target-path.md
@@ -48,12 +48,22 @@ The precedence order is: CLI flag > env var > `dbt_project.yml`
## Examples
-### Use a subdirectory named `compiled` for compiled files
+### Specify subdirectory using the project config file
```yml
-target-path: "compiled"
+target-path: "compiled_files"
```
+
+
+
+### Specify subdirectory from the command line
+
+```bash
+dbt run --target-path compiled_files
+```
+
+
\ No newline at end of file
diff --git a/website/docs/reference/project-configs/test-paths.md b/website/docs/reference/project-configs/test-paths.md
index e3f3cd2ccce..e3d0e0b76fa 100644
--- a/website/docs/reference/project-configs/test-paths.md
+++ b/website/docs/reference/project-configs/test-paths.md
@@ -3,12 +3,6 @@ datatype: [directorypath]
default_value: [test]
---
-
-
-* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory
-
-
-
```yml
diff --git a/website/docs/reference/project-configs/version.md b/website/docs/reference/project-configs/version.md
index 4c128727445..890ad8542a7 100644
--- a/website/docs/reference/project-configs/version.md
+++ b/website/docs/reference/project-configs/version.md
@@ -1,17 +1,24 @@
---
datatype: version
required: True
+keywords: [project version, project versioning, dbt project versioning]
---
-
+import VersionsCallout from '/snippets/_version-callout.md';
-dbt projects have two distinct types of the `version` tags. This field has a different meaning depending on its location.
+
+
+
+dbt projects have two distinct types of `version` tags. This field has a different meaning depending on its location.
## `dbt_project.yml` versions
-The version tag in a `dbt_project` file represents the version of your dbt project. Starting in version 1.5, `version` in the `dbt_project.yml` is an *optional parameter*. If specified, the version must be in a [semantic version](https://semver.org/) format, e.g. `1.0.0`. The default value if not specified is `None`.
+The version tag in a `dbt_project` file represents the version of your dbt project.
+
+Starting in dbt version 1.5, `version` in the `dbt_project.yml` is an *optional parameter*. If used, the version must be in a [semantic version](https://semver.org/) format, such as `1.0.0`. The default value is `None` if not specified. For users on dbt version 1.4 or lower, this tag is required, though it isn't currently used meaningfully by dbt.
For more on Core versions, see [About dbt Core versions](/docs/dbt-versions/core).
+
```yml
@@ -24,9 +31,9 @@ version: version
A version tag in a `.yml` property file provides the control tag, which informs how dbt processes property files.
-Starting from version 1.5, dbt will no longer require this configuration in your resource `.yml` files. If you want to know more about why this tag was previously required, you can refer to the [property file FAQs](reference/configs-and-properties#faqs).
+Starting from version 1.5, dbt will no longer require this configuration in your resource `.yml` files. If you want to know more about why this tag was previously required, you can refer to the [FAQs](#faqs). For users on dbt version 1.4 or lower, this tag is required.
-For more on property files, see their general [documentation](reference/configs-and-properties#where-can-i-define-properties) on the same page.
+For more on property files, see their general [documentation](/reference/configs-and-properties#where-can-i-define-properties) on the same page.
+## FAQs
-
-
-
-
-dbt projects have two distinct types of `version` tags. This field has a different meaning depending on its location.
-
-## `dbt_project.yml` versions
-
-The version tag in a `dbt_project` file represents the version of your dbt project and **is a required parameter**. However, it isn't currently used in a meaningful way by dbt. The version must follow a [semantic version](https://semver.org/) format, such as 1.0.0. For more information about dbt Core versions, refer to [About dbt Core versions](/docs/dbt-versions/core).
-
-
-```yml
-version: version
-```
-
-
-
-## `.yml` property file versions
-
-A version tag in a `.yml` property file provides the control tag, which informs how dbt processes property files. For more on why we require this tag, see property file [FAQs](reference/configs-and-properties#faqs).
-
-For more on property files, see their general [documentation](reference/configs-and-properties#where-can-i-define-properties) on the same page.
-
-
-
-```yml
-version: 2 # Only 2 is accepted by current and recent versions of dbt.
-
-models:
- ...
-```
-
-
-
-
+
diff --git a/website/docs/reference/references-overview.md b/website/docs/reference/references-overview.md
index 16afd01607c..91a228b6c3e 100644
--- a/website/docs/reference/references-overview.md
+++ b/website/docs/reference/references-overview.md
@@ -4,6 +4,8 @@ id: "references-overview"
sidebar_label: "About References"
description: "Connect dbt to any data platform in dbt Cloud or dbt Core, using a dedicated adapter plugin"
hide_table_of_contents: true
+pagination_next: null
+pagination_prev: null
---
The References section contains reference materials for developing with dbt, which includes dbt Cloud and dbt Core.
@@ -49,9 +51,27 @@ Learn how to add more configurations to your dbt project or adapter, use propert
icon="computer"/>
+
+
+
+
+
+
diff --git a/website/docs/reference/resource-configs/access.md b/website/docs/reference/resource-configs/access.md
new file mode 100644
index 00000000000..da50e48d2f0
--- /dev/null
+++ b/website/docs/reference/resource-configs/access.md
@@ -0,0 +1,97 @@
+---
+resource_types: [models]
+datatype: access
+---
+
+
+
+```yml
+version: 2
+
+models:
+  - name: model_name
+    access: private | protected | public
+```
+
+
+
+
+
+Access modifiers may be applied to models one-by-one in YAML properties. In v1.5 and v1.6, you are unable to configure `access` for multiple models at once. Upgrade to v1.7 for additional configuration options. A group or subfolder may contain models with varying access levels, so when you designate a model with `access: public`, make sure you intend for this behavior.
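+
+A sketch of per-model configuration in a YAML properties file (the model and group names below follow the error example later on this page):
+
+```yml
+models:
+  - name: finance_model
+    access: private
+    group: finance
+
+  - name: marketing_model
+    access: protected
+```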
+
+
+
+
+
+You can apply access modifiers in config files, including the `dbt_project.yml` file, or to models one-by-one in YAML properties. Applying access configs to a subfolder modifies the default for all models in that subfolder, so make sure you intend for this behavior. When setting individual model access, keep in mind that a group or subfolder might contain a variety of access levels, so when you designate a model with `access: public`, make sure you intend for this behavior.
+
+There are multiple approaches to configuring access:
+
+In the model's YAML properties file:
+
+```yaml
+models:
+  - name: my_public_model
+    access: public # Older method, still supported
+
+```
+Or (but not both)
+
+```yaml
+models:
+  - name: my_public_model
+    config:
+      access: public # newly supported in v1.7
+
+```
+
+For a subfolder, in `dbt_project.yml`:
+```yaml
+models:
+  my_project_name:
+    subfolder_name:
+      +group:
+      +access: private # sets default for all models in this subfolder
+```
+
+In the model's SQL file:
+
+```sql
+-- models/my_public_model.sql
+
+{{ config(access = "public") }}
+
+select ...
+```
+
+
+
+## Definition
+The access level of the model you are declaring properties for.
+
+Some models (not all) are designed to be referenced through the [ref](/reference/dbt-jinja-functions/ref) function across [groups](/docs/build/groups).
+
+| Access | Referenceable by |
+|-----------|-------------------------------|
+| private | same group |
+| protected | same project/package |
+| public | any group, package or project |
+
+If you try to reference a model outside of its supported access, you will see an error:
+
+```shell
+dbt run -s marketing_model
+...
+dbt.exceptions.DbtReferenceError: Parsing Error
+ Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model,
+ which is not allowed because the referenced node is private to the finance group.
+```
+
+## Default
+
+By default, all models are "protected." This means that other models in the same project can reference them.
+
+## Related docs
+
+* [Model Access](/docs/collaborate/govern/model-access#groups)
+* [Group configuration](/reference/resource-configs/group)
diff --git a/website/docs/reference/resource-configs/alias.md b/website/docs/reference/resource-configs/alias.md
index 40da45ebcd1..6b7588ecaf7 100644
--- a/website/docs/reference/resource-configs/alias.md
+++ b/website/docs/reference/resource-configs/alias.md
@@ -1,33 +1,50 @@
---
resource_types: [models, seeds, snapshots, tests]
-description: "Read this guide to understand the alias configuration in dbt."
+description: "Aliasing a resource lets you give it a custom name in the database instead of using the filename."
datatype: string
---
-:::caution Heads up!
-This is a work in progress document. While this configuration applies to multiple resource types, the documentation has only been written for seeds.
-:::
+
+
-## Definition
+Specify a custom alias for a model in your `dbt_project.yml` file or config block.
-Optionally specify a custom alias for a [model](/docs/build/models) or [seed](/docs/build/seeds).
+For example, if you have a model that calculates `sales_total` and want to give it a more user-friendly alias, you can alias it like this:
-When dbt creates a relation ( / ) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments`
+
-The standard behavior of dbt is:
-* If a custom alias is _not_ specified, the identifier of the relation is the resource name (i.e. the filename).
-* If a custom alias is specified, the identifier of the relation is the `{{ alias }}` value.
+```yml
+models:
+  your_project:
+    sales_total:
+      +alias: sales_dashboard
+```
+
-To learn more about changing the way that dbt generates a relation's `identifier`, read [Using Aliases](/docs/build/custom-aliases).
+This would return `analytics.finance.sales_dashboard` in the database, instead of the default `analytics.finance.sales_total`.
+
+
+
-## Usage
-### Seeds
-Configure a seed's alias in your `dbt_project.yml` file.
+Configure a seed's alias in your `dbt_project.yml` file or config block.
-The seed at `seeds/country_codes.csv` will be built as a named `country_mappings`.
+For example, if you have a seed named `product_categories` and want to alias it as `categories_data`, you would set the alias like this:
+
+
+
+```yml
+seeds:
+  your_project:
+    product_categories:
+      +alias: categories_data
+```
+
+This would return the name `analytics.finance.categories_data` in the database.
+
+In the following second example, the seed at `seeds/country_codes.csv` will be built as a table named `country_mappings`.
@@ -40,3 +57,68 @@ seeds:
```
+
+
+
+
+
+
+Configure a snapshot's alias in your `dbt_project.yml` file or config block.
+
+For example, if you have a snapshot named `your_snapshot` and want to alias it as `updated_at_id`, you would set the alias like this:
+
+
+
+```yml
+snapshots:
+  - name: your_snapshot
+    config:
+      target_database: analytics
+      target_schema: finance
+      unique_key: id
+      strategy: timestamp
+      updated_at: updated_at
+      alias: updated_at_id
+```
+
+This would return the name `analytics.finance.updated_at_id` in the database.
+
+
+
+
+
+
+Configure a test's alias in your `schema.yml` file or config block.
+
+For example, to add a unique test to the `order_id` column and give it an alias `unique_order_id_test` to identify this specific test, you would set the alias like this:
+
+
+
+```yml
+models:
+  - name: orders
+    columns:
+      - name: order_id
+        tests:
+          - unique:
+              alias: unique_order_id_test
+```
+
+When using `--store-failures`, this would return the name `analytics.finance.orders_order_id_unique_order_id_test` in the database.
+
+
+
+
+
+## Definition
+
+Optionally specify a custom alias for a [model](/docs/build/models), [test](/docs/build/tests), [snapshot](/docs/build/snapshots), or [seed](/docs/build/seeds).
+
+When dbt creates a relation (table or view) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments`
+
+The standard behavior of dbt is:
+* If a custom alias is _not_ specified, the identifier of the relation is the resource name (i.e. the filename).
+* If a custom alias is specified, the identifier of the relation is the `{{ alias }}` value.
+
+To learn more about changing the way that dbt generates a relation's `identifier`, read [Using Aliases](/docs/build/custom-aliases).
+
diff --git a/website/docs/reference/resource-configs/bigquery-configs.md b/website/docs/reference/resource-configs/bigquery-configs.md
index c425fd5b94b..ffbaa37c059 100644
--- a/website/docs/reference/resource-configs/bigquery-configs.md
+++ b/website/docs/reference/resource-configs/bigquery-configs.md
@@ -21,26 +21,6 @@ This will allow you to read and write from multiple BigQuery projects. Same for
### Partition clause
-
-
-Before dbt v0.16.0, the `partition_by` configuration was supplied as string. While
-the string specification syntax is still supported in dbt v0.16.0, it has been
-deprecated and will be removed in a future release. **Note:** partitioning configs
-using a range bucket *must* be supplied using the dictionary-style configuration as of
-dbt v0.16.0.
-
-Example usage for versions of dbt < 0.16.0:
-
-```sql
--- Partitioning by a timestamp field
-{{ config( materialized='table', partition_by="date(created_at)" ) }}
-
--- Partitioning by a date field
-{{ config( materialized='table', partition_by="created_date" ) }}
-```
-
-
-
BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a table by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#pruning_limiting_partitions) when partitions are filtered using literal values (so selecting partitions using a subquery won't improve performance).
The `partition_by` config can be supplied as a dictionary with the following format:
@@ -61,7 +41,6 @@ The `partition_by` config can be supplied as a dictionary with the following for
```
#### Partitioning by a date or timestamp
-Partitioning by hour, month or year is new in v0.19.0
When using a `datetime` or `timestamp` column to partition data, you can create partitions with a granularity of hour, day, month, or year. A `date` column supports granularity of day, month and year. Daily partitioning is the default for all column types.
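
As a sketch, a model partitioned by day on a timestamp column (the column and model names are illustrative):

```sql
{{ config(
    materialized = 'table',
    partition_by = {
      "field": "created_at",
      "data_type": "timestamp",
      "granularity": "day"
    }
) }}

select user_id, created_at from {{ ref('events') }}
```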
@@ -266,12 +245,6 @@ as (
#### Additional partition configs
-
-
- - **v0.20.0:** Introduced `require_partition_filter` and `partition_expiration_days`
-
-
-
If your model has `partition_by` configured, you may optionally specify two additional configurations:
- `require_partition_filter` (boolean): If set to `true`, anyone querying this model _must_ specify a partition filter, otherwise their query will fail. This is recommended for very large tables with obvious partitioning schemes, such as event streams grouped by day. Note that this will affect other dbt models or tests that try to select from this model, too.
@@ -367,11 +340,7 @@ dbt supports the specification of BigQuery labels for the tables and BigQuery key-value pair entries for labels larger than 63 characters are truncated.
**Configuring labels in a model file**
@@ -445,7 +414,7 @@ models:
columns:
- name: field
policy_tags:
- - 'projects//locations//taxonomies//policyTags/'
+ - 'projects//locations//taxonomies//policyTags/'
```
@@ -489,12 +458,6 @@ strategy is selected.
### The `insert_overwrite` strategy
-
-
- - **v0.16.0:** Introduced `insert_overwrite` incremental strategy
-
-
-
The `insert_overwrite` strategy generates a merge statement that replaces entire partitions
in the destination table. **Note:** this configuration requires that the model is configured
with a [Partition clause](#partition-clause). The `merge` statement that dbt generates
@@ -587,12 +550,6 @@ _today_ and _yesterday_ every day that it is run. It is the fastest and cheapest
way to incrementally update a table using dbt. If we wanted this to run more dynamically—
let’s say, always for the past 3 days—we could leverage dbt’s baked-in [datetime macros](https://github.com/dbt-labs/dbt-core/blob/dev/octavius-catto/core/dbt/include/global_project/macros/etc/datetime.sql) and write a few of our own.
-
-
- - **v0.19.0:** With the advent of truncated timestamp partitions in BigQuery, `timestamp`-type partitions are now treated as timestamps instead of dates for the purposes of filtering. Update `partitions_to_replace` accordingly.
-
-
-
Think of this as "full control" mode. You must ensure that expressions or literal values in the `partitions` config have proper quoting when templated, and that they match the `partition_by.data_type` (`timestamp`, `datetime`, `date`, or `int64`). Otherwise, the filter in the incremental `merge` statement will raise an error.
#### Dynamic partitions
@@ -685,7 +642,6 @@ from {{ ref('events') }}
## Controlling table expiration
-New in v0.18.0
By default, dbt-created tables never expire. You can configure certain model(s)
to expire after a set number of hours by setting `hours_to_expiration`.
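
For instance, a sketch of a model configured to expire 6 hours after it is created:

```sql
{{ config(
    materialized = 'table',
    hours_to_expiration = 6
) }}

select ...
```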
@@ -721,8 +677,6 @@ select ...
## Authorized Views
-New in v0.18.0
-
If the `grant_access_to` config is specified for a model materialized as a
view, dbt will grant the view model access to select from the list of datasets
provided. See [BQ docs on authorized views](https://cloud.google.com/bigquery/docs/share-access-views)
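
As a sketch, granting a view access to select from another project's dataset (the project and dataset names are illustrative):

```sql
{{ config(
    materialized = 'view',
    grant_access_to = [
      {'project': 'project_1', 'dataset': 'dataset_1'}
    ]
) }}
```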
@@ -764,48 +718,3 @@ Views with this configuration will be able to select from objects in `project_1.
The `grant_access_to` config is not thread-safe when multiple views need to be authorized for the same dataset. The initial `dbt run` operation after a new `grant_access_to` config is added should therefore be executed in a single thread. Subsequent runs using the same configuration will not attempt to re-apply existing access grants, and can make use of multiple threads.
-
-
-
-## Materialized view
-
-The BigQuery adapter supports [materialized views](https://cloud.google.com/bigquery/docs/materialized-views-intro) and refreshes them for every subsequent `dbt run` you execute. For more information, see [Refresh Materialized Views](https://cloud.google.com/bigquery/docs/materialized-views-manage#refresh) in the Google docs.
-
-Materialized views support the optional configuration `on_configuration_change` with the following values:
-- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The following changes can be applied without the need to rebuild the materialized view:
- - enable_refresh
- - refresh_interval_minutes
- - max_staleness
-- `skip` — allows runs to continue while also providing a warning that the model was skipped
-- `fail` — forces runs to fail if a change is detected in a materialized view
-
-You can create a materialized view by editing _one_ of these files:
-- the SQL file for your model
-- the `dbt_project.yml` configuration file
-
-The following examples create a materialized view:
-
-
-
-```sql
-{{
- config(
- materialized = 'materialized_view',
- on_configuration_change = 'apply',
- )
-}}
-```
-
-
-
-
-
-
-```yaml
-models:
- path:
- materialized: materialized_view
-```
-
-
-
diff --git a/website/docs/reference/resource-configs/contract.md b/website/docs/reference/resource-configs/contract.md
index 66072fc8b89..ccc10099a12 100644
--- a/website/docs/reference/resource-configs/contract.md
+++ b/website/docs/reference/resource-configs/contract.md
@@ -23,11 +23,34 @@ When the `contract` configuration is enforced, dbt will ensure that your model's
This is to ensure that the people querying your model downstream—both inside and outside dbt—have a predictable and consistent set of columns to use in their analyses. Even a subtle change in data type, such as from `boolean` (`true`/`false`) to `integer` (`0`/`1`), could cause queries to fail in surprising ways.
+
+
The `data_type` defined in your YAML file must match a data type your data platform recognizes. dbt does not do any type aliasing itself. If your data platform recognizes both `int` and `integer` as corresponding to the same type, then they will return a match.
-When dbt is comparing data types, it will not compare granular details such as size, precision, or scale. We don't think you should sweat the difference between `varchar(256)` and `varchar(257)`, because it doesn't really affect the experience of downstream queriers. If you need a more-precise assertion, it's always possible to accomplish by [writing or using a custom test](/guides/best-practices/writing-custom-generic-tests).
+
+
+
+
+dbt uses built-in type aliasing for the `data_type` defined in your YAML. For example, you can specify `string` in your contract, and on Postgres/Redshift, dbt will convert it to `text`. If dbt doesn't recognize the `data_type` name among its known aliases, it will pass it through as-is. This is enabled by default, but you can opt out by setting `alias_types` to `false`.
+
+Example for disabling:
+
+```yml
+
+models:
+  - name: my_model
+    config:
+      contract:
+        enforced: true
+        alias_types: false # true by default
+
+```
+
+
+
+When dbt compares data types, it will not compare granular details such as size, precision, or scale. We don't think you should sweat the difference between `varchar(256)` and `varchar(257)`, because it doesn't really affect the experience of downstream queriers. You can accomplish a more-precise assertion by [writing or using a custom test](/best-practices/writing-custom-generic-tests).
-That said, on certain data platforms, you will need to specify a varchar size or numeric scale if you do not want it to revert to the default. This is most relevant for the `numeric` type on Snowflake, which defaults to a precision of 38 and a scale of 0 (zero digits after the decimal, such as rounded to an integer). To avoid this implicit coercion, specify your `data_type` with a nonzero scale, like `numeric(38, 6)`.
+Note that you need to specify a varchar size or numeric scale, otherwise dbt relies on default values. For example, if a `numeric` type defaults to a precision of 38 and a scale of 0, then the numeric column stores 0 digits to the right of the decimal (it only stores whole numbers), which might cause it to fail contract enforcement. To avoid this implicit coercion, specify your `data_type` with a nonzero scale, like `numeric(38, 6)`. dbt Core 1.7 and higher provides a warning if you don't specify precision and scale when providing a numeric data type.
## Example
@@ -47,6 +70,8 @@ models:
- type: not_null
- name: customer_name
data_type: string
+ - name: non_integer
+ data_type: numeric(38,3)
```
@@ -95,32 +120,3 @@ Imagine:
- The result is a delta between the yaml-defined contract, and the actual table in the database - which means the contract is now incorrect!
Why `append_new_columns`, rather than `sync_all_columns`? Because removing existing columns is a breaking change for contracted models!
-
-### Detecting breaking changes
-
-When you use the `state:modified` selection method in Slim CI, dbt will detect changes to model contracts, and raise an error if any of those changes could be breaking for downstream consumers.
-
-Breaking changes include:
-- Removing an existing column
-- Changing the `data_type` of an existing column
-- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher)
-
-```
-Breaking Change to Contract Error in model sometable (models/sometable.sql)
- While comparing to previous project state, dbt detected a breaking change to an enforced contract.
-
- The contract's enforcement has been disabled.
-
- Columns were removed:
- - order_name
-
- Columns with data_type changes:
- - order_id (number -> int)
-
- Consider making an additive (non-breaking) change instead, if possible.
- Otherwise, create a new model version: https://docs.getdbt.com/docs/collaborate/govern/model-versions
-```
-
-Additive changes are **not** considered breaking:
-- Adding a new column to a contracted model
-- Adding new `constraints` to an existing column in a contracted model
diff --git a/website/docs/reference/resource-configs/database.md b/website/docs/reference/resource-configs/database.md
index 9b7cd1b39d3..7d91358ff01 100644
--- a/website/docs/reference/resource-configs/database.md
+++ b/website/docs/reference/resource-configs/database.md
@@ -2,45 +2,86 @@
sidebar_label: "database"
resource_types: [models, seeds, tests]
datatype: string
-description: "Read this guide to understand the database configuration in dbt."
+description: "Override the default database when dbt creates resources in your data platform."
---
-:::caution Heads up!
-This is a work in progress document. While this configuration applies to multiple resource types, the documentation has only been written for seeds.
+
+
-:::
+Specify a custom database for a model in your `dbt_project.yml` file.
-## Definition
+For example, if you have a model that you want to load into a database other than the target database, you can configure it like this:
-Optionally specify a custom database for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)).
+
-When dbt creates a relation ( / ) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments`
+```yml
+models:
+  your_project:
+    sales_metrics:
+      +database: reporting
+```
+
-The standard behavior of dbt is:
-* If a custom database is _not_ specified, the database of the relation is the target database (`{{ target.database }}`).
-* If a custom database is specified, the database of the relation is the `{{ database }}` value.
+This would result in the generated relation being located in the `reporting` database, so the full relation name would be `reporting.finance.sales_metrics` instead of the default target database.
+
-To learn more about changing the way that dbt generates a relation's `database`, read [Using Custom Databases](/docs/build/custom-databases)
+
-
+Configure a database in your `dbt_project.yml` file.
-* `v0.13.0`: Support for the `database` config is added
-* `v0.16.0`: The `generate_database_name` macro was added to control how the `database` config is used by dbt
+For example, to load a seed into a database called `staging` instead of the target database, you can configure it like this:
-
-
-## Usage
-### Load seeds into the RAW database
```yml
seeds:
- +database: RAW
+  your_project:
+    product_categories:
+      +database: staging
+```
+
+This would result in the generated relation being located in the `staging` database, so the full relation name would be `staging.finance.product_categories`.
+
+
+
+
+
+Configure a database in your `dbt_project.yml` file.
+
+For example, to have the tests in your project use a database called `reporting` instead of the target database, you can configure them like this:
+
+
+
+```yml
+tests:
+  your_project:
+    +database: reporting
```
+This would result in the generated relation being located in the `reporting` database, so the full relation name would be `reporting.finance.my_not_null_test`.
+
+
+
+
+
+## Definition
+
+Optionally specify a custom database for a [model](/docs/build/sql-models), [seed](/docs/build/seeds), or [tests](/docs/build/tests). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)).
+
+When dbt creates a relation (table or view) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments`
+
+The standard behavior of dbt is:
+* If a custom database is _not_ specified, the database of the relation is the target database (`{{ target.database }}`).
+* If a custom database is specified, the database of the relation is the `{{ database }}` value.
+
+To learn more about changing the way that dbt generates a relation's `database`, read [Using Custom Databases](/docs/build/custom-databases)
+
+
## Warehouse specific information
* BigQuery: `project` and `database` are interchangeable
-* Redshift: Cross-database queries are not possible in Redshift. As such, dbt will return a Database Error if you use 'Database A' for a seed file and try to `{{ ref() }}` that seed file (or its database object) in a model using 'Database B'. This error would only be found at runtime.
+
diff --git a/website/docs/reference/resource-configs/databricks-configs.md b/website/docs/reference/resource-configs/databricks-configs.md
index 41b0bfcc5ea..65c6607cdcd 100644
--- a/website/docs/reference/resource-configs/databricks-configs.md
+++ b/website/docs/reference/resource-configs/databricks-configs.md
@@ -7,20 +7,41 @@ id: "databricks-configs"
When materializing a model as `table`, you may include several optional configs that are specific to the dbt-databricks plugin, in addition to the standard [model configs](/reference/model-configs).
-| Option | Description | Required? | Example |
-|---------|------------------------------------------------------------------------------------------------------------------------------------|-------------------------|--------------------------|
-| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta`|
-| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` |
-| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` |
-| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` |
-| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` |
+
+
+| Option | Description | Required? | Example |
+|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|----------------|
+| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta` |
+| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` |
+| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` |
+| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | `date_day` |
+| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` |
+| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` |
+
+
+
+
+
+
+| Option | Description | Required? | Model Support | Example |
+|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|---------------|----------------|
+| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` |
+| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` |
+| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | SQL, Python | `date_day` |
+| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL | `date_day` |
+| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` |
+| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` |
+
+
+
## Incremental models
-dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values:
+The dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of four values:
- **`append`** (default): Insert new records without updating or overwriting any existing data.
- **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data.
- - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`; update old records, insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.)
+ - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`, updating old records, and inserting new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.)
+ - **`replace_where`** (Delta file format only): Match records based on `incremental_predicates`, replacing all records that match the predicates from the existing table with records matching the predicates from the new data. (If no `incremental_predicates` are specified, all new data is inserted, similar to `append`.)
Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block.
@@ -120,7 +141,7 @@ select
date_day,
count(*) as users
-from events
+from new_events
group by 1
```
@@ -247,6 +268,96 @@ merge into analytics.merge_incremental as DBT_INTERNAL_DEST
+### The `replace_where` strategy
+
+The `replace_where` incremental strategy requires:
+- `file_format: delta`
+- Databricks Runtime 12.0 and above
+
+dbt will run an [atomic `replace where` statement](https://docs.databricks.com/en/delta/selective-overwrite.html#arbitrary-selective-overwrite-with-replacewhere) which selectively overwrites data matching one or more `incremental_predicates` specified as a string or array. Only rows matching the predicates will be inserted. If no `incremental_predicates` are specified, dbt will perform an atomic insert, as with `append`.
+
+:::caution
+
+`replace_where` inserts data into columns in the order provided, rather than by column name. If you reorder columns and the data is compatible with the existing schema, you may silently insert values into an unexpected column. If the incoming data is incompatible with the existing schema, you will instead receive an error.
+
+:::
+
+
+
+
+
+
+```sql
+-- Never replace users with ids < 10000
+{{ config(
+    materialized='incremental',
+    file_format='delta',
+    incremental_strategy = 'replace_where',
+    incremental_predicates = 'user_id >= 10000'
+) }}
+
+with new_events as (
+
+ select * from {{ ref('events') }}
+
+ {% if is_incremental() %}
+ where date_day >= date_add(current_date, -1)
+ {% endif %}
+
+)
+
+select
+ user_id,
+ max(date_day) as last_seen
+
+from new_events
+group by 1
+```
+
+
+
+
+
+
+
+```sql
+create temporary view replace_where__dbt_tmp as
+
+ with new_events as (
+
+ select * from analytics.events
+
+
+ where date_day >= date_add(current_date, -1)
+
+
+ )
+
+ select
+ user_id,
+ max(date_day) as last_seen
+
+ from new_events
+ group by 1
+
+;
+
+insert into analytics.replace_where_incremental
+ replace where user_id >= 10000
+ table `replace_where__dbt_tmp`
+```
+
+
+
+
+
+
+
## Persisting model descriptions
Relation-level docs persistence is supported in dbt v0.17.0. For more
@@ -280,3 +391,73 @@ snapshots:
```
+
+
+
+## Materialized views and streaming tables
+
+Starting with version 1.6.0, the dbt-databricks adapter supports [materialized views](https://docs.databricks.com/en/sql/user/materialized-views.html) and [streaming tables](https://docs.databricks.com/en/sql/load-data-streaming-table.html), as alternatives to incremental tables that are powered by [Delta Live Tables](https://docs.databricks.com/en/delta-live-tables/index.html).
+See [What are Delta Live Tables?](https://docs.databricks.com/en/delta-live-tables/index.html#what-are-delta-live-tables-datasets) for more information and use cases.
+These features are still in preview, and the support in the dbt-databricks adapter should, for now, be considered _experimental_.
+In order to adopt these materialization strategies, you will need a workspace that is enabled for Unity Catalog and serverless SQL Warehouses.
+
+
+
+```sql
+{{ config(
+ materialized = 'materialized_view'
+ ) }}
+```
+
+
+
+or
+
+
+
+```sql
+{{ config(
+ materialized = 'streaming_table'
+ ) }}
+```
+
+
+
+When dbt detects a pre-existing relation of one of these types, it issues a `REFRESH` [command](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-refresh-full.html).
+
+### Limitations
+
+As mentioned above, support for these materializations in the Databricks adapter is still limited.
+At this time the following configuration options are not available:
+
+* Specifying a refresh schedule for these materializations
+* Specifying `on_configuration_change` settings.
+
+Additionally, if you change the model definition of your materialized view or streaming table, you will need to drop the materialization in your warehouse directly before running dbt again; otherwise, you will get a refresh error.
+
+We plan to address these limitations during the 1.7.x timeframe.
+
+
+## Setting table properties
+[Table properties](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-tblproperties.html) can be set with your configuration for tables or views using `tblproperties`:
+
+
+
+```sql
+{{ config(
+ tblproperties={
+ 'delta.autoOptimize.optimizeWrite' : 'true',
+ 'delta.autoOptimize.autoCompact' : 'true'
+ }
+ ) }}
+```
+
+
+
+:::caution
+
+These properties are sent directly to Databricks without validation in dbt, so be thoughtful with how you use this feature. You will need to do a full refresh of incremental materializations if you change their `tblproperties`.
+
+:::
+
+One application of this feature is making `delta` tables compatible with `iceberg` readers using the [Universal Format](https://docs.databricks.com/en/delta/uniform.html).
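+
+For example, a sketch of the table property involved (per the Databricks UniForm docs; additional properties such as column mapping may also be required):
+
+```sql
+{{ config(
+    tblproperties={
+      'delta.universalFormat.enabledFormats': 'iceberg'
+    }
+  ) }}
+```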
diff --git a/website/docs/reference/resource-configs/delimiter.md b/website/docs/reference/resource-configs/delimiter.md
new file mode 100644
index 00000000000..58d6ba8344a
--- /dev/null
+++ b/website/docs/reference/resource-configs/delimiter.md
@@ -0,0 +1,126 @@
+---
+resource_types: [seeds]
+datatype:
+default_value: ","
+---
+
+## Definition
+
+You can use this optional seed configuration to customize how you separate values in a [seed](/docs/build/seeds) by providing a one-character string.
+
+* The delimiter defaults to a comma when not specified.
+* Explicitly set the `delimiter` configuration value if you want seed files to use a different delimiter, such as "|" or ";".
+
+:::info New in 1.7!
+
+Delimiter is new functionality available beginning with dbt Core v1.7.
+
+:::
+
+
+## Usage
+
+Specify a delimiter in your `dbt_project.yml` file to customize the global separator for all seed values:
+
+
+
+```yml
+seeds:
+  <project_name>:
+    +delimiter: "|" # default project delimiter for seeds will be "|"
+    <seed_subdirectory>:
+      +delimiter: "," # delimiter for seeds in seed_subdirectory will be ","
+```
+
+
+
+
+Or use a custom delimiter to override the values for a specific seed:
+
+
+
+```yml
+version: 2
+
+seeds:
+  - name: <seed_name>
+    config:
+      delimiter: "|"
+```
+
+
+
+## Examples
+For a project with:
+
+* `name: jaffle_shop` in the `dbt_project.yml` file
+* `seed-paths: ["seeds"]` in the `dbt_project.yml` file
+
+### Use a custom delimiter to override global values
+
+You can set a default behavior for all seeds with an exception for one seed, `seed_a`, which uses a comma:
+
+
+
+```yml
+seeds:
+  jaffle_shop:
+    +delimiter: "|" # default delimiter for seeds in jaffle_shop project will be "|"
+    seed_a:
+      +delimiter: "," # delimiter for seed_a will be ","
+```
+
+
+
+Your corresponding seed files would be formatted like this:
+
+
+
+```text
+col_a|col_b|col_c
+1|2|3
+4|5|6
+...
+```
+
+
+
+
+
+```text
+name,id
+luna,1
+doug,2
+...
+```
+
+
+
+Or you can configure custom behavior for one seed. The `country_codes` seed uses the ";" delimiter:
+
+
+
+```yml
+version: 2
+
+seeds:
+  - name: country_codes
+    config:
+      delimiter: ";"
+```
+
+
+
+The `country_codes` seed file would be formatted like this:
+
+
+
+```text
+country_code;country_name
+US;United States
+CA;Canada
+GB;United Kingdom
+...
+```
+
+
diff --git a/website/docs/reference/resource-configs/docs.md b/website/docs/reference/resource-configs/docs.md
index d94b975683d..c5e35dd64f4 100644
--- a/website/docs/reference/resource-configs/docs.md
+++ b/website/docs/reference/resource-configs/docs.md
@@ -17,10 +17,12 @@ default_value: {show: true}
{ label: 'Macros', value: 'macros', },
]
}>
+
+
```yml
version: 2
@@ -28,7 +30,7 @@ models:
- name: model_name
docs:
show: true | false
-
+ node_color: "black"
```
@@ -52,9 +54,7 @@ seeds:
- name: seed_name
docs:
show: true | false
-
```
-
@@ -70,9 +70,7 @@ snapshots:
- name: snapshot_name
docs:
show: true | false
-
```
-
@@ -89,7 +87,6 @@ analyses:
docs:
show: true | false
```
-
@@ -109,26 +106,20 @@ macros:
- name: macro_name
docs:
show: true | false
-
```
-
+Also refer to [macro properties](/reference/macro-properties).
+
## Definition
-The docs field can be used to provide documentation-specific configuration to models. The only currently supported docs attribute is shown, which controls whether or not models are shown in the auto-generated documentation website.
+The `docs` field can be used to provide documentation-specific configuration to models. It supports the `show` attribute, which controls whether or not models are shown in the auto-generated documentation website, and the `node_color` attribute, which customizes the display color for some node types.
**Note:** hidden models will still appear in the dbt DAG visualization but will be identified as "hidden".
-
-
-* `v0.16.0`: This property was added
-
-
-
## Default
The default value for `show` is `true`.
@@ -173,7 +164,7 @@ models:
## Custom node colors
-The `docs` attribute now supports `node_color` to customize the node color in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed.
+The `docs` attribute now supports `node_color` to customize the display color of some node types in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed.
`node_color` hierarchy:
@@ -182,7 +173,7 @@ The `docs` attribute now supports `node_color` to customize the node color in th
## Examples
-Add custom node colors to models within subdirectories based on hex codes or a plain color name.
+Add a custom `node_color` to models that support it, within subdirectories, using hex codes or plain color names.
![Example](../../../../website/static/img/node_color_example.png)
diff --git a/website/docs/reference/resource-configs/enabled.md b/website/docs/reference/resource-configs/enabled.md
index 03d1598c931..552777c5c81 100644
--- a/website/docs/reference/resource-configs/enabled.md
+++ b/website/docs/reference/resource-configs/enabled.md
@@ -15,10 +15,22 @@ default_value: true
{ label: 'Sources', value: 'sources', },
{ label: 'Metrics', value: 'metrics', },
{ label: 'Exposures', value: 'exposures', },
+ { label: 'Semantic models', value: 'semantic models', },
]
}>
+
+
+```yml
+models:
+  [](/reference/resource-configs/resource-path):
+    +enabled: true | false
+
+```
+
+
+
```sql
@@ -34,10 +46,15 @@ select ...
+
+
+
+
+
```yml
-models:
+seeds:
[](/reference/resource-configs/resource-path):
+enabled: true | false
@@ -47,13 +64,12 @@ models:
-
-
+
```yml
-seeds:
+snapshots:
[](/reference/resource-configs/resource-path):
+enabled: true | false
@@ -61,10 +77,6 @@ seeds:
-
-
-
-
```sql
@@ -82,10 +94,14 @@ select ...
+
+
+
+
```yml
-snapshots:
+tests:
[](/reference/resource-configs/resource-path):
+enabled: true | false
@@ -93,10 +109,6 @@ snapshots:
-
-
-
-
```sql
@@ -124,17 +136,6 @@ select ...
-
-
-```yml
-tests:
- [](/reference/resource-configs/resource-path):
- +enabled: true | false
-
-```
-
-
-
@@ -150,7 +151,6 @@ sources:
-
@@ -170,7 +170,6 @@ sources:
-
@@ -252,10 +251,45 @@ exposures:
+
+
+
+
+Support for disabling semantic models has been added in dbt Core v1.7
+
+
+
+
+
+
+
+```yaml
+semantic-models:
+  [](/reference/resource-configs/resource-path):
+    [+](/reference/resource-configs/plus-prefix)enabled: true | false
+```
+
+
+
+
+
+```yaml
+semantic_models:
+  - name: []
+    [config](/reference/resource-properties/config):
+      enabled: true | false
+```
+
+
+
+
+
+
+
## Definition
-An optional configuration for disabling models, seeds, snapshots, and tests.
+An optional configuration for enabling or disabling a resource.
* Default: true
diff --git a/website/docs/reference/resource-configs/grants.md b/website/docs/reference/resource-configs/grants.md
index 8ef726788dc..3a65672fa5e 100644
--- a/website/docs/reference/resource-configs/grants.md
+++ b/website/docs/reference/resource-configs/grants.md
@@ -121,7 +121,7 @@ For example:
```yml
models:
- +grants:
+ +grants: # In this case, the + is not optional; you must include it for your project to parse.
select: ['user_a', 'user_b']
```
@@ -243,6 +243,7 @@ models:
- Databricks automatically enables `grants` on SQL endpoints. For interactive clusters, admins should enable grant functionality using these two setup steps in the Databricks documentation:
- [Enable table access control for your workspace](https://docs.databricks.com/administration-guide/access-control/table-acl.html)
- [Enable table access control for a cluster](https://docs.databricks.com/security/access-control/table-acls/table-acl.html)
+- To grant `READ_METADATA` or `USAGE`, use [post-hooks](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook), as sketched below.
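+
+  For example, a minimal sketch of granting `USAGE` with a post-hook (the group name `analysts` is hypothetical, and the exact `GRANT` syntax depends on your Databricks setup):
+
+  ```sql
+  {{ config(
+      post_hook=["GRANT USAGE ON SCHEMA {{ this.schema }} TO analysts"]
+  ) }}
+
+  select ...
+  ```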
diff --git a/website/docs/reference/resource-configs/group.md b/website/docs/reference/resource-configs/group.md
index dd73d99edff..a71935013c4 100644
--- a/website/docs/reference/resource-configs/group.md
+++ b/website/docs/reference/resource-configs/group.md
@@ -16,6 +16,7 @@ This functionality is new in v1.5.
{ label: 'Tests', value: 'tests', },
{ label: 'Analyses', value: 'analyses', },
{ label: 'Metrics', value: 'metrics', },
+ { label: 'Semantic models', value: 'semantic models', },
]
}>
@@ -28,28 +29,29 @@ Support for grouping models was added in dbt Core v1.5
-
-
-
+
```yml
-version: 2
-
models:
- - name: model_name
- group: finance
+
+  [](resource-path):
+    +group: GROUP_NAME
+
```
+
-
+
```yml
+version: 2
+
models:
- [](resource-path):
- +group: finance
-```
+  - name: MODEL_NAME
+    group: GROUP_NAME
+```
@@ -58,7 +60,7 @@ models:
```sql
{{ config(
- group='finance'
+ group='GROUP_NAME'
) }}
select ...
@@ -67,6 +69,8 @@ select ...
+
+
@@ -79,14 +83,12 @@ Support for grouping seeds was added in dbt Core v1.5
-
-
```yml
models:
[](resource-path):
- +group: finance
+ +group: GROUP_NAME
```
@@ -95,12 +97,14 @@ models:
```yml
seeds:
- - name: []
- group: finance
+  - name: [SEED_NAME]
+    group: GROUP_NAME
```
+
+
@@ -114,14 +118,12 @@ Support for grouping snapshots was added in dbt Core v1.5
-
-
```yml
snapshots:
[](resource-path):
- +group: finance
+ +group: GROUP_NAME
```
@@ -132,7 +134,7 @@ snapshots:
{% snapshot [snapshot_name](snapshot_name) %}
{{ config(
- group='finance'
+ group='GROUP_NAME'
) }}
select ...
@@ -142,6 +144,8 @@ select ...
+
+
@@ -155,14 +159,12 @@ Support for grouping tests was added in dbt Core v1.5
-
-
```yml
tests:
[](resource-path):
- +group: finance
+ +group: GROUP_NAME
```
@@ -177,7 +179,7 @@ version: 2
tests:
- :
config:
- group: finance
+ group: GROUP_NAME
```
@@ -188,7 +190,7 @@ version: 2
{% test () %}
{{ config(
- group='finance'
+ group='GROUP_NAME'
) }}
select ...
@@ -203,12 +205,14 @@ select ...
```sql
{{ config(
- group='finance'
+ group='GROUP_NAME'
) }}
```
+
+
@@ -219,8 +223,8 @@ select ...
version: 2
analyses:
- - name:
- group: finance
+  - name: ANALYSIS_NAME
+    group: GROUP_NAME
```
@@ -238,14 +242,12 @@ Support for grouping metrics was added in dbt Core v1.5
-
-
```yaml
metrics:
[](resource-path):
- [+](plus-prefix)group: finance
+ [+](plus-prefix)group: GROUP_NAME
```
@@ -256,19 +258,61 @@ metrics:
version: 2
metrics:
- - name: []
- group: finance
+  - name: [METRIC_NAME]
+    group: GROUP_NAME
+
+```
+
+
+
+
+
+
+
+
+
+
+
+
+Support for grouping semantic models has been added in dbt Core v1.7.
+
+
+
+
+
+
+
+```yaml
+
+semantic-models:
+  [](resource-path):
+    [+](plus-prefix)group: GROUP_NAME
```
+
+
+```yaml
+
+semantic_models:
+  - name: SEMANTIC_MODEL_NAME
+    group: GROUP_NAME
+
+
+```
+
+
+
+
+
## Definition
-An optional configuration for grouping models, analysis, snapshots, tests, and metrics. When a resource is grouped, dbt will allow it to reference private models within the same group.
+An optional configuration for assigning a group to a resource. When a resource is grouped, dbt will allow it to reference private models within the same group.
For more details on reference access between resources in groups, check out [model access](/docs/collaborate/govern/model-access#groups).
diff --git a/website/docs/reference/resource-configs/invalidate_hard_deletes.md b/website/docs/reference/resource-configs/invalidate_hard_deletes.md
index 3e9f13b738d..ba5b37c5d71 100644
--- a/website/docs/reference/resource-configs/invalidate_hard_deletes.md
+++ b/website/docs/reference/resource-configs/invalidate_hard_deletes.md
@@ -4,7 +4,6 @@ description: "Invalidate_hard_deletes - Read this in-depth guide to learn about
datatype: column_name
---
-New in v0.19.0
```jinja2
diff --git a/website/docs/reference/resource-configs/materialize-configs.md b/website/docs/reference/resource-configs/materialize-configs.md
index 1338647a2a6..6976aa84061 100644
--- a/website/docs/reference/resource-configs/materialize-configs.md
+++ b/website/docs/reference/resource-configs/materialize-configs.md
@@ -8,11 +8,9 @@ id: "materialize-configs"
### Clusters
-
-- **v1.2.0:** Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31).
+Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31).
-
The default [cluster](https://materialize.com/docs/overview/key-concepts/#clusters) that is used to maintain materialized views or indexes can be configured in your [profile](/docs/core/connect-data-platform/profiles.yml) using the `cluster` connection parameter. To override the cluster that is used for specific models (or groups of models), use the `cluster` configuration parameter.
@@ -45,11 +43,7 @@ Materialize, at its core, is a real-time database that delivers incremental view
### Indexes
-
-
-- **v1.2.0:** Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31).
-
-
+Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31).
Like in any standard relational database, you can use [indexes](https://materialize.com/docs/overview/key-concepts/#indexes) to optimize query performance in Materialize. Improvements can be significant, reducing response times down to single-digit milliseconds.
@@ -85,12 +79,6 @@ select ...
### Tests
-
-
-- **v1.1.1:** Provide support for storing the results of a test query in a materialized view using the `store_failures` config.
-
-
-
If you set the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures), dbt will create a materialized view for each configured test that can keep track of failures over time. By default, test views are created in a schema suffixed with `dbt_test__audit`. To specify a custom suffix, use the `schema` config.
diff --git a/website/docs/reference/resource-configs/meta.md b/website/docs/reference/resource-configs/meta.md
index 18cc13ae969..bc0c0c7c041 100644
--- a/website/docs/reference/resource-configs/meta.md
+++ b/website/docs/reference/resource-configs/meta.md
@@ -4,12 +4,6 @@ datatype: "{}"
default_value: {}
---
-
-
-* `v0.21.0`: `meta` is now a config that can be set in `dbt_project.yml` and as a `config` YAML property for some resource types. It is applied hierarchically and merges on a per-key basis.
-
-
-
@@ -59,11 +55,13 @@ version: 2
sources:
- name: model_name
- meta: {}
+ config:
+ meta: {}
tables:
- name: table_name
- meta: {}
+ config:
+ meta: {}
columns:
- name: column_name
@@ -152,7 +150,6 @@ macros:
arguments:
- name: argument_name
- meta: {}
```
@@ -177,12 +174,40 @@ exposures:
+
+
+
+
+Support for the `meta` config on semantic models was added in dbt Core v1.7
+
+
+
+
+
+
+
+```yml
+semantic_models:
+  - name: semantic_model_name
+    config:
+      meta: {}
+
+```
+
+
+
+The `meta` config can also be defined under the `semantic-models` config block in `dbt_project.yml`. See [configs and properties](/reference/configs-and-properties) for details.
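+
+For instance, a minimal sketch in `dbt_project.yml` (the project name `my_project` and the `owner` value are illustrative; the `+` prefix follows the project-file convention):
+
+```yml
+semantic-models:
+  my_project:
+    +meta:
+      owner: "@alice"
+```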
+
+
+
+
+
## Definition
The `meta` field can be used to set metadata for a resource. This metadata is compiled into the `manifest.json` file generated by dbt, and is viewable in the auto-generated documentation.
-Depending on the resource you're configuring, `meta` may be available within the `config` property, or as a top-level key. (For backwards compatibility, `meta` is always supported as a top-level key, though without the capabilities of config inheritance.)
+Depending on the resource you're configuring, `meta` may be available within the `config` property, and/or as a top-level key. (For backwards compatibility, `meta` is often, but not always, supported as a top-level key, though without the capabilities of config inheritance.)
## Examples
@@ -252,4 +277,20 @@ seeds:
select 1 as id
```
+
+
+### Assign owner in the dbt_project.yml as a config property
+
+
+
+```yml
+models:
+  jaffle_shop:
+    materialized: table
+    config:
+      meta:
+        owner: "@alice"
+```
+
+
diff --git a/website/docs/reference/resource-configs/no-configs.md b/website/docs/reference/resource-configs/no-configs.md
index 5a4ba4eaaa2..5eec26917c8 100644
--- a/website/docs/reference/resource-configs/no-configs.md
+++ b/website/docs/reference/resource-configs/no-configs.md
@@ -8,4 +8,4 @@ If you were guided to this page from a data platform setup article, it most like
- Setting up the profile is the only action the end-user needs to take on the data platform, or
- The subsequent actions the end-user needs to take are not currently documented
-If you'd like to contribute to data platform-specifc configuration information, refer to [Documenting a new adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter)
\ No newline at end of file
+If you'd like to contribute to data platform-specific configuration information, refer to [Documenting a new adapter](/guides/adapter-creation)
diff --git a/website/docs/reference/resource-configs/persist_docs.md b/website/docs/reference/resource-configs/persist_docs.md
index 6facf3945cb..15b1e0bdb40 100644
--- a/website/docs/reference/resource-configs/persist_docs.md
+++ b/website/docs/reference/resource-configs/persist_docs.md
@@ -112,13 +112,6 @@ column and relation comments in the database. By default, documentation
persistence is disabled, but it can be enabled for specific resources or groups of
resources as needed.
-
-
- - Support for this config on Redshift, Postgres, and Snowflake is new in 0.17.0
- - Support for column-level docs persistence is new for all databases in 0.17.0
-
-
-
## Support
The `persist_docs` config is supported on the most widely used dbt adapters:
@@ -151,12 +144,6 @@ Some known issues and limitations:
-
-
-- Column names that must be quoted, such as column names containing special characters, will cause runtime errors if column-level `persist_docs` is enabled. This is fixed in v1.2.
-
-
-
diff --git a/website/docs/reference/resource-configs/plus-prefix.md b/website/docs/reference/resource-configs/plus-prefix.md
index d8c54aa8e70..c1adbc0286a 100644
--- a/website/docs/reference/resource-configs/plus-prefix.md
+++ b/website/docs/reference/resource-configs/plus-prefix.md
@@ -5,7 +5,7 @@ title: Using the + prefix
The `+` prefix is a dbt syntax feature, introduced in dbt v0.17.0, which helps disambiguate between [resource paths](/reference/resource-configs/resource-path) and configs in `dbt_project.yml` files.
-It is only compatible with `dbt_project.yml` files that use [`config-version](/reference/project-configs/config-version): 2`
+It is not compatible with `dbt_project.yml` files that use [`config-version`](/reference/project-configs/config-version) 1.
For example:
diff --git a/website/docs/reference/resource-configs/postgres-configs.md b/website/docs/reference/resource-configs/postgres-configs.md
index eb9108ad431..97a695ee12e 100644
--- a/website/docs/reference/resource-configs/postgres-configs.md
+++ b/website/docs/reference/resource-configs/postgres-configs.md
@@ -8,20 +8,25 @@ id: "postgres-configs"
In dbt-postgres, the following incremental materialization strategies are supported:
+
+
- `append` (default)
-- `merge`
- `delete+insert`
+
-## Performance Optimizations
+
-### Unlogged
+- `append` (default)
+- `merge`
+- `delete+insert`
-
+
- - **v0.14.1:** Introduced native support for `unlogged` config
-
+## Performance optimizations
+
+### Unlogged
"Unlogged" tables can be considerably faster than ordinary tables, as they are not written to the write-ahead log nor replicated to read replicas. They are also considerably less safe than ordinary tables. See [Postgres docs](https://www.postgresql.org/docs/current/sql-createtable.html#SQL-CREATETABLE-UNLOGGED) for details.
@@ -48,13 +53,7 @@ models:
While Postgres works reasonably well for datasets smaller than about 10m rows, database tuning is sometimes required. It's important to create indexes for columns that are commonly used in joins or where clauses.
-
-
- - **v0.20.0:** Introduced native support for `indexes` config
-
-
-
-Table models, incremental models, seeds, and snapshots may have a list of `indexes` defined. Each Postgres index can have three components:
+Table models, incremental models, seeds, snapshots, and materialized views may have a list of `indexes` defined. Each Postgres index can have three components:
- `columns` (list, required): one or more columns on which the index is defined
- `unique` (boolean, optional): whether the index should be [declared unique](https://www.postgresql.org/docs/9.4/indexes-unique.html)
- `type` (string, optional): a supported [index type](https://www.postgresql.org/docs/current/indexes-types.html) (B-tree, Hash, GIN, etc)
@@ -107,45 +106,35 @@ models:
-## Materialized view
+## Materialized views
-The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html) and refreshes them for every subsequent `dbt run` you execute. For more information, see [Refresh Materialized Views](https://www.postgresql.org/docs/15/sql-refreshmaterializedview.html) in the Postgres docs.
+The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html).
+Indexes are the only configuration that is specific to `dbt-postgres`.
+The remaining configuration follows the general [materialized view](/docs/build/materializations#materialized-view) configuration.
+There are also some limitations that we hope to address in the next version.
-Materialized views support the optional configuration `on_configuration_change` with the following values:
-- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The following index action can be applied without the need to rebuild the materialized view:
- - Added
- - Dropped
- - Updated
-- `skip` — allows runs to continue while also providing a warning that the model was skipped
-- `fail` — forces runs to fail if a change is detected in a materialized view
+### Monitored configuration changes
-You can create a materialized view by editing _one_ of these files:
-- the SQL file for your model
-- the `dbt_project.yml` configuration file
+The settings below are monitored for changes applicable to `on_configuration_change`.
-The following examples create a materialized view:
+#### Indexes
-
+Index changes (`CREATE`, `DROP`) can be applied without the need to rebuild the materialized view.
+This differs from a table model, where the table needs to be dropped and re-created to update the indexes.
+If the `indexes` portion of the `config` block is updated, the changes will be detected and applied
+directly to the materialized view in place.
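+
+For instance, a hedged sketch of a materialized view model with an index defined in its config block (the column name `customer_id` is illustrative):
+
+```sql
+{{ config(
+    materialized='materialized_view',
+    on_configuration_change='apply',
+    indexes=[{'columns': ['customer_id'], 'unique': True}]
+) }}
+
+select ...
+```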
-```sql
-{{
- config(
- materialized = 'materialized_view',
- on_configuration_change = 'apply',
- )
-}}
-```
+### Limitations
-
+#### Changing materialization to and from "materialized_view"
+Swapping an already materialized model to a materialized view, and vice versa, is not supported.
+The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`.
+Running with the `--full-refresh` flag will not drop the existing table or view and create the materialized view (or vice versa).
+This would only need to be done once as the existing object would then be a materialized view.
-
-
-```yaml
-models:
- path:
- materialized: materialized_view
-```
-
+For example, suppose `my_model` has already been materialized as a table in the underlying data platform via `dbt run`.
+If the user changes the model's config to `materialized="materialized_view"`, they will get an error.
+The solution is to execute `DROP TABLE my_model` on the data warehouse before trying the model again.
diff --git a/website/docs/reference/resource-configs/pre-hook-post-hook.md b/website/docs/reference/resource-configs/pre-hook-post-hook.md
index 1660c50049b..de652bff088 100644
--- a/website/docs/reference/resource-configs/pre-hook-post-hook.md
+++ b/website/docs/reference/resource-configs/pre-hook-post-hook.md
@@ -115,13 +115,6 @@ Pre- and post-hooks can also call macros that return SQL statements. If your mac
dbt aims to provide all the boilerplate SQL you need (DDL, DML, and DCL) via out-of-the-box functionality, which you can configure quickly and concisely. In some cases, there may be SQL that you want or need to run, specific to functionality in your data platform, which dbt does not (yet) offer as a built-in feature. In those cases, you can write the exact SQL you need, using dbt's compilation context, and pass it into a `pre-` or `post-` hook to run before or after your model, seed, or snapshot.
-
-
-* `v0.12.2`: The `post_hook` alias for config blocks was introduced. Prior to this, users needed to use the alternative config syntax to apply pre- and post-hooks.
-
-
-
-
## Examples
@@ -167,69 +160,6 @@ See: [Apache Spark docs on `ANALYZE TABLE`](https://spark.apache.org/docs/latest
-
-
-### Grant privileges on a model
-
-
-
-```yml
-
-models:
- +post-hook: "grant select on {{ this }} to group reporter"
-
-```
-
-
-
-### Grant multiple privileges on a model
-
-
-
-```yml
-
-models:
- +post-hook:
- - "grant select on {{ this }} to group reporter"
- - "grant select on {{ this }} to group transformer"
-
-```
-
-
-
-### Call a macro to grant privileges on a model
-
-
-
-```yml
-
-models:
- +post-hook: "{{ grant_select(this) }}"
-
-```
-
-
-
-
-### Grant privileges on a directory of models
-
-
-
-```yml
-
-models:
- jaffle_shop: # this is the project name
- marts:
- marketing:
- # this will be applied to all models in marts/marketing/
- +post-hook: "{{ grant_select(this) }}"
-
-```
-
-
-
-
-
### Additional examples
We've compiled some more in-depth examples [here](/docs/build/hooks-operations#additional-examples).
@@ -245,13 +175,17 @@ If multiple instances of any hooks are defined, dbt will run each hook using the
### Transaction behavior
-If you're using an adapter that makes use of transactions (namely Postgres or Redshift), it's worth noting that by default hooks are executed inside of the same transaction as your model being created.
+If you're using an adapter that uses transactions (namely Postgres or Redshift), it's worth noting that by default hooks are executed inside of the same transaction as your model being created.
There may be occasions where you need to run these hooks _outside_ of a transaction, for example:
-* You want to run a `VACUUM` in a `post-hook`, however this cannot be executed within a transaction ([Redshift docs](https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html#r_VACUUM_usage_notes))
-* You want to insert a record into an audit at the start of a run, and do not want that statement rolled back if the model creation fails.
+* You want to run a `VACUUM` in a `post-hook`, however, this cannot be executed within a transaction ([Redshift docs](https://docs.aws.amazon.com/redshift/latest/dg/r_VACUUM_command.html#r_VACUUM_usage_notes))
+* You want to insert a record into an audit at the start of a run and do not want that statement rolled back if the model creation fails.
+
+To achieve this behavior, you can use one of the following syntaxes:
+ - Important note: Do not use this syntax if you are using a database where dbt does not support transactions. This includes databases like Snowflake, BigQuery, and Spark or Databricks.
-To achieve this, you can use one of the following syntaxes. (Note: You should NOT use this syntax if using a database where dbt does not use transactions by default, including Snowflake, BigQuery, and Spark/Databricks.)
+
+
#### Config block: use the `before_begin` and `after_commit` helper macros
@@ -270,6 +204,9 @@ select ...
```
+
+
+
#### Config block: use a dictionary
@@ -294,6 +231,10 @@ select ...
+
+
+
+
#### `dbt_project.yml`: Use a dictionary
@@ -312,3 +253,5 @@ models:
```
+
+
diff --git a/website/docs/reference/resource-configs/redshift-configs.md b/website/docs/reference/resource-configs/redshift-configs.md
index a0ebf7e88df..9bd127a1e1a 100644
--- a/website/docs/reference/resource-configs/redshift-configs.md
+++ b/website/docs/reference/resource-configs/redshift-configs.md
@@ -14,17 +14,28 @@ To-do:
In dbt-redshift, the following incremental materialization strategies are supported:
+
+
+- `append` (default)
+- `delete+insert`
+
+
+
+
+
- `append` (default)
- `merge`
- `delete+insert`
-All of these strategies are inheirited via from dbt-postgres.
+
+
+All of these strategies are inherited from dbt-postgres.
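+
+For example, a minimal sketch of selecting one of these strategies in a model config (the `unique_key` value `id` is illustrative):
+
+```sql
+{{ config(
+    materialized='incremental',
+    incremental_strategy='delete+insert',
+    unique_key='id'
+) }}
+
+select ...
+```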
## Performance optimizations
### Using sortkey and distkey
-Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . Note that these settings will have no effect for models set to `view` or `ephemeral` models.
+Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations applies the corresponding settings in the generated `CREATE TABLE` DDL. Note that these settings will have no effect on models set to `view` or `ephemeral` materializations.
- `dist` can have a setting of `all`, `even`, `auto`, or the name of a key.
- `sort` accepts a list of sort keys, for example: `['timestamp', 'userid']`. dbt will build the sort key in the same order the fields are supplied.
@@ -64,7 +75,7 @@ For more information on distkeys and sortkeys, view Amazon's docs:
- [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing a Data Distribution Style](https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html)
- [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing Sort Keys](https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html)
-## Late Binding Views
+## Late binding views
Redshift supports views unbound from their dependencies, or [late binding views](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_VIEW.html#late-binding-views). This DDL option "unbinds" a view from the data it selects from. In practice, this means that if upstream views or tables are dropped with a cascade qualifier, the late-binding view does not get dropped as well.
@@ -98,42 +109,51 @@ models:
-## Materialized view
+## Materialized views
-The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html) and refreshes them for every subsequent `dbt run` that you execute. For more information, see [Refresh Materialized Views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) in the Redshift docs.
+The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html).
+Redshift-specific configuration includes the typical `dist`, `sort_type`, `sort`, and `backup`.
+For materialized views, there is also the `auto_refresh` setting, which allows Redshift to [automatically refresh](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) the materialized view for you.
+The remaining configuration follows the general [materialized view](/docs/build/materializations#Materialized-View) configuration.
+There are also some limitations that we hope to address in the next version.
-Materialized views support the optional configuration `on_configuration_change` with the following values:
-- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The `auto_refresh` action can applied without the need to rebuild the materialized view.
-- `skip` — allows runs to continue while also providing a warning that the model was skipped
-- `fail` — forces runs to fail if a change is detected in a materialized view
+### Monitored configuration changes
-You can create a materialized view by editing _one_ of these files:
-- the SQL file for your model
-- the `dbt_project.yml` configuration file
+The settings below are monitored for changes applicable to `on_configuration_change`.
-The following examples create a materialized view:
+#### Dist
-
+Changes to `dist` will result in a full refresh of the existing materialized view (applied at the time of the next `dbt run` of the model). Redshift requires a materialized view to be
+dropped and recreated to apply a change to the `distkey` or `diststyle`.
-```sql
-{{
- config(
- materialized = 'materialized_view',
- on_configuration_change = 'apply',
- )
-}}
-```
+#### Sort type, sort
-
+Changes to `sort_type` or `sort` will result in a full refresh. Redshift requires a materialized
+view to be dropped and recreated to apply a change to the `sortkey` or `sortstyle`.
+#### Backup
-
+Changes to `backup` will result in a full refresh. Redshift requires a materialized
+view to be dropped and recreated to apply a change to the `backup` setting.
-```yaml
-models:
- path:
- materialized: materialized_view
-```
-
+#### Auto refresh
+
+The `auto_refresh` setting can be updated via an `ALTER` statement. This setting effectively toggles
+automatic refreshes on or off. The default setting for this config is off (`False`). If this
+is the only configuration change for the materialized view, dbt will choose to apply
+an `ALTER` statement instead of issuing a full refresh.
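+
+For instance, a minimal sketch of enabling it on a model (assuming the boolean `auto_refresh` config described above):
+
+```sql
+{{ config(
+    materialized='materialized_view',
+    auto_refresh=True
+) }}
+
+select ...
+```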
+
+### Limitations
+
+#### Changing materialization from "materialized_view" to "table" or "view"
+
+Swapping a materialized view to a table or view is not supported.
+You must manually drop the existing materialized view in the data warehouse prior to calling `dbt run`.
+Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case.
+This would only need to be done once as the existing object would then be a materialized view.
+
+For example, assume that a materialized view, `my_mv.sql`, has already been materialized to the underlying data platform via `dbt run`.
+If the user changes the model's config to `materialized="table"`, they will get an error.
+The workaround is to execute `DROP MATERIALIZED VIEW my_mv CASCADE` on the data warehouse before trying the model again.
diff --git a/website/docs/reference/resource-configs/resource-path.md b/website/docs/reference/resource-configs/resource-path.md
index 258b83dcd57..20406f26f2a 100644
--- a/website/docs/reference/resource-configs/resource-path.md
+++ b/website/docs/reference/resource-configs/resource-path.md
@@ -1,11 +1,28 @@
-The `` nomenclature is used in this documentation when documenting how to configure a model, seed, or snapshot, from your `dbt_project.yml` file. It represents the nested dictionary keys that provide the path to either a directory of models, or a single model.
+---
+title: Resource path
+description: "Learn how to use resource paths to configure resource types in dbt."
+id: resource-path
+sidebar_label: "About resource paths"
+---
+
+The `<resource-path>` nomenclature is used in this documentation when documenting how to configure resource types like models, seeds, snapshots, tests, sources, and others, from your `dbt_project.yml` file.
+
+It represents the nested dictionary keys that provide the path to a directory of that resource type, or a single instance of that resource type by name.
+
+```yml
+resource_type:
+  project_name:
+    directory_name:
+      subdirectory_name:
+        instance_of_resource_type (by name):
+          ...
+```
## Example
-:::info
-This example is for models, but the same concepts apply for seeds and snapshots.
+The following examples are mostly for models and sources, but the same concepts apply to seeds, snapshots, tests, and other resource types.
-:::
+### Apply config to all models
+To apply a configuration to all models, do not use a `<resource-path>`:
@@ -18,6 +35,8 @@ models:
+### Apply config to all models in your project
+
To apply a configuration to all models in _your_ project only, use your [project name](/reference/project-configs/name) as the `<resource-path>`:
@@ -32,6 +51,8 @@ models:
+### Apply config to all models in a subdirectory
+
To apply a configuration to all models in a subdirectory of your project, e.g. `staging`, nest the directory under the project name:
@@ -57,6 +78,8 @@ In the following project, this would apply to models in the `staging/` directory
```
+### Apply config to one model
+
To apply a configuration to one model, nest the full path under the project name. For a model at `/staging/stripe/payments.sql`, this would look like:
@@ -92,3 +115,19 @@ In the following project, this would only apply to the `payments` model:
└── payments.sql
```
+### Apply config to a source nested in a subfolder
+
+To disable a source table nested in a YAML file in a subfolder, you will need to supply the subfolder(s) within the path to that YAML file, as well as the source name and the table name, in the `dbt_project.yml` file.
+The following example disables a source table nested in a YAML file in a subfolder:
+
+
+
+ ```yaml
+ sources:
+   your_project_name:
+     subdirectory_name:
+       source_name:
+         source_table_name:
+           +enabled: false
+ ```
+
diff --git a/website/docs/reference/resource-configs/schema.md b/website/docs/reference/resource-configs/schema.md
index c976bf6502a..3852ee4e639 100644
--- a/website/docs/reference/resource-configs/schema.md
+++ b/website/docs/reference/resource-configs/schema.md
@@ -1,14 +1,70 @@
---
sidebar_label: "schema"
resource_types: [models, seeds, tests]
-description: "Schema - Read this in-depth guide to learn about configurations in dbt."
+description: "Override the default schema when dbt creates resources in your data platform."
datatype: string
---
-:::caution Heads up!
-This is a work in progress document. While this configuration applies to multiple resource types, the documentation has only been written for seeds.
+
+
-:::
+Specify a custom schema for a group of models in your `dbt_project.yml` file or a [config block](/reference/resource-configs/schema#models).
+
+For example, if you have a group of marketing-related models and you want to place them in a separate schema called `marketing`, you can configure it like this:
+
+
+
+```yml
+models:
+  your_project:
+    marketing: # Grouping or folder for set of models
+      +schema: marketing
+```
+
+
+This would result in the generated relations for these models being located in the `marketing` schema, so the full relation names would be `analytics.marketing.model_name`.
+
+
+
+
+Configure a custom schema in your `dbt_project.yml` file.
+
+For example, if you have a seed that should be placed in a separate schema called `mappings`, you can configure it like this:
+
+
+
+```yml
+seeds:
+  your_project:
+    product_mappings:
+      +schema: mappings
+```
+
+This would result in the generated relation being located in the `mappings` schema, so the full relation name would be `analytics.mappings.product_mappings`.
+
+
+
+
+
+Customize the schema for storing test results in your `dbt_project.yml` file.
+
+For example, to save test results in a specific schema, you can configure it like this:
+
+
+
+
+```yml
+tests:
+ +store_failures: true
+ +schema: test_results
+```
+
+This would result in the test results being stored in the `test_results` schema.
+
+
+
+
+Refer to [Usage](#usage) for more examples.
## Definition
Optionally specify a custom schema for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a schema for a [snapshot](/docs/build/snapshots), use the [`target_schema` config](/reference/resource-configs/target_schema)).
diff --git a/website/docs/reference/resource-configs/severity.md b/website/docs/reference/resource-configs/severity.md
index c89c6db0716..25bab9647d6 100644
--- a/website/docs/reference/resource-configs/severity.md
+++ b/website/docs/reference/resource-configs/severity.md
@@ -6,14 +6,6 @@ resource_types: [tests]
datatype: string
---
-
-
-* `v0.14.0`: Introduced `severity` config
-* `v0.20.0`: Introduced `error_if` + `warn_if` configs. Enabled configuration of tests from `dbt_project.yml`
-* `v0.21.0`: Introduced `config` property for tests
-
-
-
Tests return a number of failures—most often, this is the count of rows returned by the test query, but it could be a [custom calculation](/reference/resource-configs/fail_calc). Generally, if the number of failures is nonzero, the test returns an error. This makes sense, as test queries are designed to return all the rows you _don't_ want: duplicate records, null values, etc.
It's possible to configure tests to return warnings instead of errors, or to make the test status conditional on the number of failures returned. Maybe 1 duplicate record can count as a warning, but 10 duplicate records should count as an error.
diff --git a/website/docs/reference/resource-configs/singlestore-configs.md b/website/docs/reference/resource-configs/singlestore-configs.md
index f503779f0fc..0c93d557a8b 100644
--- a/website/docs/reference/resource-configs/singlestore-configs.md
+++ b/website/docs/reference/resource-configs/singlestore-configs.md
@@ -3,13 +3,6 @@ title: "SingleStore configurations"
id: "singlestore-configs"
---
-
-
-
- - **v1.1.2:** Added support for for `storage_type`, `indexes`, `primary_key`, `sort_key`, `shard_key`, `unique_table_key`, `charset`, `collation` options for creating SingleStore tables.
-
-
-
## Performance Optimizations
[SingleStore Physical Database Schema Design documentation](https://docs.singlestore.com/managed-service/en/create-a-database/physical-database-schema-design/concepts-of-physical-database-schema-design.html) is helpful if you want to use specific options (that are described below) in your dbt project.
diff --git a/website/docs/reference/resource-configs/snowflake-configs.md b/website/docs/reference/resource-configs/snowflake-configs.md
index 42ee3635089..30c7966ec68 100644
--- a/website/docs/reference/resource-configs/snowflake-configs.md
+++ b/website/docs/reference/resource-configs/snowflake-configs.md
@@ -77,7 +77,7 @@ select ...
```
-In this example, you can set up a query tag to be applied to every query with the model's name.
+In this example, you can set up a query tag to be applied to every query with the model's name.
```sql
@@ -301,7 +301,7 @@ models:
-## Temporary Tables
+## Temporary tables
Beginning in dbt version 1.3, incremental table merges for Snowflake prefer to utilize a `view` rather than a `temporary table`. The reasoning was to avoid the database write step that a temporary table would initiate and save compile time.
@@ -341,3 +341,99 @@ In the configuration format for the model SQL file:
+
+
+
+## Dynamic tables
+
+The Snowflake adapter supports [dynamic tables](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table).
+This materialization is specific to Snowflake, which means that any model configuration that
+would normally come along for the ride from `dbt-core` (e.g. as with a `view`) may not be available
+for dynamic tables. This gap will decrease in future patches and versions.
+While this materialization is specific to Snowflake, it very much follows the implementation
+of [materialized views](/docs/build/materializations#Materialized-View).
+In particular, dynamic tables have access to the `on_configuration_change` setting.
+There are also some limitations that we hope to address in the next version.
+
+### Parameters
+
+Dynamic tables in `dbt-snowflake` require the following parameters:
+- `target_lag`
+- `snowflake_warehouse`
+- `on_configuration_change`
+
+To learn more about each parameter and what values it can take, see
+the Snowflake docs page: [`CREATE DYNAMIC TABLE: Parameters`](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table).
+
+### Usage
+
+You can create a dynamic table by editing _one_ of these files:
+
+- the SQL file for your model
+- the `dbt_project.yml` configuration file
+
+The following examples create a dynamic table:
+
+
+
+```sql
+{{ config(
+ materialized = 'dynamic_table',
+ snowflake_warehouse = 'snowflake_warehouse',
+ target_lag = '10 minutes',
+) }}
+```
+
+
+
+
+
+```yaml
+models:
+  path:
+    materialized: dynamic_table
+    snowflake_warehouse: snowflake_warehouse
+    target_lag: '10 minutes'
+```
+
+
+
+### Monitored configuration changes
+
+The settings below are monitored for changes applicable to `on_configuration_change`.
+
+#### Target lag
+
+Changes to `target_lag` can be applied by running an `ALTER` statement. Refreshing is essentially
+always on for dynamic tables; this setting changes how frequently the dynamic table is updated.
+
+#### Warehouse
+
+Changes to `snowflake_warehouse` can be applied via an `ALTER` statement.
+
+### Limitations
+
+#### Changing materialization to and from "dynamic_table"
+
+Swapping an already materialized model to be a dynamic table, and vice versa, is not supported.
+The workaround is manually dropping the existing materialization in the data warehouse prior to calling `dbt run`.
+Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case.
+This would only need to be done once as the existing object would then be a dynamic table.
+
+For example, assume the model below, `my_model`, has already been materialized to the underlying data platform via `dbt run`.
+If the user changes the model's config to `materialized="dynamic_table"`, they will get an error.
+The workaround is to execute `DROP TABLE my_model` on the data warehouse before trying the model again.
+
+
+
+```sql
+
+{{ config(
+ materialized="table" # or any model type eg view, incremental
+) }}
+
+```
+
+
+
+
diff --git a/website/docs/reference/resource-configs/spark-configs.md b/website/docs/reference/resource-configs/spark-configs.md
index 95a853107f6..ce3b317f0f1 100644
--- a/website/docs/reference/resource-configs/spark-configs.md
+++ b/website/docs/reference/resource-configs/spark-configs.md
@@ -29,12 +29,6 @@ When materializing a model as `table`, you may include several optional configs
## Incremental models
-
-
- - `dbt-spark==0.19.0`: Added the `append` strategy as default for all platforms, file types, and connection methods.
-
-
-
dbt seeks to offer useful, intuitive modeling abstractions by means of its built-in configurations and materializations . Because there is so much variance between Apache Spark clusters out in the world—not to mention the powerful features offered to Databricks users by the Delta file format and custom runtime—making sense of all the available options is an undertaking in its own right.
Alternatively, you can use Apache Iceberg or Apache Hudi file format with Apache Spark runtime for building incremental models.
@@ -192,13 +186,6 @@ insert overwrite table analytics.spark_incremental
### The `merge` strategy
-
-
- - `dbt-spark==0.15.3`: Introduced `merge` incremental strategy
-
-
-
-
**Usage notes:** The `merge` incremental strategy requires:
- `file_format: delta, iceberg or hudi`
- Databricks Runtime 5.1 and above for delta file format
@@ -294,12 +281,6 @@ or `show table extended in [database] like '*'`.
## Always `schema`, never `database`
-
-
- - `dbt-spark==0.17.0` ended use of `database` in all cases.
-
-
-
Apache Spark uses the terms "schema" and "database" interchangeably. dbt understands
`database` to exist at a higher level than `schema`. As such, you should _never_
use or set `database` as a node config or in the target profile when running dbt-spark.
diff --git a/website/docs/reference/resource-configs/starrocks-configs.md b/website/docs/reference/resource-configs/starrocks-configs.md
new file mode 100644
index 00000000000..093534515c6
--- /dev/null
+++ b/website/docs/reference/resource-configs/starrocks-configs.md
@@ -0,0 +1,116 @@
+---
+title: "Starrocks configurations"
+id: "starrocks-configs"
+description: "Starrocks Configurations - Read this in-depth guide to learn about configurations in dbt."
+---
+
+## Model Configuration
+
+A dbt model can be configured using the following syntax:
+
+
+
+
+
+
+
+```yaml
+models:
+  <resource-path>:
+    materialized: table  # table or view or materialized_view
+    keys: ['id', 'name', 'some_date']
+    table_type: 'PRIMARY'  # PRIMARY or DUPLICATE or UNIQUE
+    distributed_by: ['id']
+    buckets: 3  # default 10
+    partition_by: ['some_date']
+    partition_by_init: ["PARTITION p1 VALUES [('1971-01-01 00:00:00'), ('1991-01-01 00:00:00')),PARTITION p1972 VALUES [('1991-01-01 00:00:00'), ('1999-01-01 00:00:00'))"]
+    properties: [{"replication_num":"1", "in_memory": "true"}]
+    refresh_method: 'async'  # only for materialized views; default is manual
+```
+
+
+
+
+
+
+
+```yaml
+models:
+  - name: <model_name>
+    config:
+      materialized: table  # table or view or materialized_view
+      keys: ['id', 'name', 'some_date']
+      table_type: 'PRIMARY'  # PRIMARY or DUPLICATE or UNIQUE
+      distributed_by: ['id']
+      buckets: 3  # default 10
+      partition_by: ['some_date']
+      partition_by_init: ["PARTITION p1 VALUES [('1971-01-01 00:00:00'), ('1991-01-01 00:00:00')),PARTITION p1972 VALUES [('1991-01-01 00:00:00'), ('1999-01-01 00:00:00'))"]
+      properties: [{"replication_num":"1", "in_memory": "true"}]
+      refresh_method: 'async'  # only for materialized views; default is manual
+```
+
+
+
+
+
+
+
+```jinja
+{{ config(
+ materialized = 'table',
+ keys=['id', 'name', 'some_date'],
+ table_type='PRIMARY',
+ distributed_by=['id'],
+ buckets=3,
+ partition_by=['some_date'],
+ ....
+) }}
+```
+
+
+
+
+### Configuration Description
+
+| Option | Description |
+|---------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `materialized` | How the model will be materialized into Starrocks. Supports view, table, incremental, ephemeral, and materialized_view. |
+| `keys` | Which columns serve as keys. |
+| `table_type` | Table type, supported are PRIMARY or DUPLICATE or UNIQUE. |
+| `distributed_by` | Specifies the column of data distribution. If not specified, it defaults to random. |
+| `buckets` | The bucket number in one partition. If not specified, it will be automatically inferred. |
+| `partition_by` | The partition column list. |
+| `partition_by_init` | The partition rule or some real partitions item. |
+| `properties` | The table properties configuration of Starrocks. ([Starrocks table properties](https://docs.starrocks.io/en-us/latest/sql-reference/sql-statements/data-definition/CREATE_TABLE#properties)) |
+| `refresh_method` | How to refresh materialized views. |
+
+## Read From Catalog
+First, you need to add the catalog to StarRocks. The following is an example for Hive.
+
+```sql
+CREATE EXTERNAL CATALOG `hive_catalog`
+PROPERTIES (
+ "hive.metastore.uris" = "thrift://127.0.0.1:8087",
+ "type"="hive"
+);
+```
+Instructions for adding other types of catalogs can be found in the [Catalog Overview](https://docs.starrocks.io/en-us/latest/data_source/catalog/catalog_overview) documentation. Then write the sources.yaml file:
+```yaml
+sources:
+  - name: external_example
+    schema: hive_catalog.hive_db
+    tables:
+      - name: hive_table_name
+```
+Finally, you can reference the source with the `source` macro:
+```jinja
+{{ source('external_example', 'hive_table_name') }}
+```
\ No newline at end of file
diff --git a/website/docs/reference/resource-configs/store_failures.md b/website/docs/reference/resource-configs/store_failures.md
index 62ae33ba713..2c596d1cf3e 100644
--- a/website/docs/reference/resource-configs/store_failures.md
+++ b/website/docs/reference/resource-configs/store_failures.md
@@ -3,21 +3,14 @@ resource_types: [tests]
datatype: boolean
---
-
-
-* `v0.20.0`: Introduced `store_failures` config and functionality
-* `v0.21.0`: Introduced `config` property for tests
-
-
-
-The configured test(s) will store their failures when `dbt test --store-failures` is invoked.
+The configured test(s) will store their failures when `dbt test --store-failures` is invoked. If you set this configuration as `false` but [`store_failures_as`](/reference/resource-configs/store_failures_as) is configured, it will be overridden.
## Description
Optionally set a test to always or never store its failures in the database.
- If specified as `true` or `false`, the
`store_failures` config will take precedence over the presence or absence of the `--store-failures` flag.
- If the `store_failures` config is `none` or omitted, the resource will use the value of the `--store-failures` flag.
-- When true, `store_failures` save all the record(s) that failed the test only if [limit](/reference/resource-configs/limit) is not set or if there are fewer records than the limit. `store_failures` are saved in a new table with the name of the test. By default, `store_failures` use a schema named `dbt_test__audit`, but, you can configure the schema to a different value.
+- When true, `store_failures` saves all the records that failed the test, but only if [limit](/reference/resource-configs/limit) is not set or if there are fewer records than the limit. Failures are saved in a new table with the name of the test. By default, `store_failures` uses a schema named `dbt_test__audit`, but you can [configure](/reference/resource-configs/schema#tests) the schema to a different value.
This logic is encoded in the [`should_store_failures()`](https://github.com/dbt-labs/dbt-core/blob/98c015b7754779793e44e056905614296c6e4527/core/dbt/include/global_project/macros/materializations/helpers.sql#L77) macro.
diff --git a/website/docs/reference/resource-configs/store_failures_as.md b/website/docs/reference/resource-configs/store_failures_as.md
new file mode 100644
index 00000000000..a9149360089
--- /dev/null
+++ b/website/docs/reference/resource-configs/store_failures_as.md
@@ -0,0 +1,76 @@
+---
+resource_types: [tests]
+id: "store_failures_as"
+---
+
+For the `test` resource type, `store_failures_as` is an optional config that specifies how test failures should be stored in the database. If [`store_failures`](/reference/resource-configs/store_failures) is also configured, `store_failures_as` takes precedence.
+
+The three supported values are:
+
+- `ephemeral` — nothing stored in the database (default)
+- `table` — test failures stored as a database table
+- `view` — test failures stored as a database view
+
+You can configure it in all the same places as `store_failures`, including singular tests (.sql files), generic tests (.yml files), and dbt_project.yml.
+
+### Examples
+
+#### Singular test
+
+[Singular test](https://docs.getdbt.com/docs/build/tests#singular-tests) in `tests/singular/check_something.sql` file
+
+```sql
+{{ config(store_failures_as="table") }}
+
+-- custom singular test
+select 1 as id
+where 1=0
+```
+
+#### Generic test
+
+[Generic tests](https://docs.getdbt.com/docs/build/tests#generic-tests) in `models/_models.yml` file
+
+```yaml
+models:
+ - name: my_model
+ columns:
+ - name: id
+ tests:
+ - not_null:
+ config:
+ store_failures_as: view
+ - unique:
+ config:
+ store_failures_as: ephemeral
+```
+
+#### Project level
+
+Config in `dbt_project.yml`
+
+```yaml
+name: "my_project"
+version: "1.0.0"
+config-version: 2
+profile: "sandcastle"
+
+tests:
+ my_project:
+ +store_failures_as: table
+ my_subfolder_1:
+ +store_failures_as: view
+ my_subfolder_2:
+ +store_failures_as: ephemeral
+```
+
+### "Clobbering" configs
+
+As with most other configurations, `store_failures_as` is "clobbered" when applied hierarchically. Whenever a more specific value is available, it will completely replace the less specific value.
+
+Additional resources:
+
+- [Test configurations](/reference/test-configs#related-documentation)
+- [Test-specific configurations](/reference/test-configs#test-specific-configurations)
+- [Configuring directories of models in dbt_project.yml](/reference/model-configs#configuring-directories-of-models-in-dbt_projectyml)
+- [Config inheritance](/reference/configs-and-properties#config-inheritance)
\ No newline at end of file
diff --git a/website/docs/reference/resource-configs/teradata-configs.md b/website/docs/reference/resource-configs/teradata-configs.md
index f0f4f1a6f3e..12a8929429d 100644
--- a/website/docs/reference/resource-configs/teradata-configs.md
+++ b/website/docs/reference/resource-configs/teradata-configs.md
@@ -35,14 +35,21 @@ id: "teradata-configs"
###
* `table_kind` - define the table kind. Legal values are `MULTISET` (default for ANSI transaction mode required by `dbt-teradata`) and `SET`, e.g.:
- ```yaml
- {{
- config(
- materialized="table",
- table_kind="SET"
- )
- }}
- ```
+ * in sql materialization definition file:
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ table_kind="SET"
+ )
+ }}
+ ```
+ * in seed configuration:
+ ```yaml
+ seeds:
+ :
+ table_kind: "SET"
+ ```
For details, see [CREATE TABLE documentation](https://docs.teradata.com/r/76g1CuvvQlYBjb2WPIuk3g/B6Js16DRQVwPDjgJ8rz7hg).
* `table_option` - defines table options. The config supports multiple statements. The definition below uses the Teradata syntax definition to explain what statements are allowed. Square brackets `[]` denote optional parameters. The pipe symbol `|` separates statements. Use commas to combine multiple statements as shown in the examples below:
```
@@ -87,37 +94,51 @@ id: "teradata-configs"
```
Examples:
-
- :::info Separators between statements
- Note the commas that separate statements in `table_option` config.
- :::
-
- ```yaml
- {{
- config(
- materialized="table",
- table_option="NO FALLBACK"
- )
- }}
- ```
- ```yaml
- {{
- config(
- materialized="table",
- table_option="NO FALLBACK, NO JOURNAL"
- )
- }}
- ```
- ```yaml
- {{
- config(
- materialized="table",
- table_option="NO FALLBACK, NO JOURNAL, CHECKSUM = ON,
- NO MERGEBLOCKRATIO,
- WITH CONCURRENT ISOLATED LOADING FOR ALL"
- )
- }}
- ```
+ * in sql materialization definition file:
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ table_option="NO FALLBACK"
+ )
+ }}
+ ```
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ table_option="NO FALLBACK, NO JOURNAL"
+ )
+ }}
+ ```
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ table_option="NO FALLBACK, NO JOURNAL, CHECKSUM = ON,
+ NO MERGEBLOCKRATIO,
+ WITH CONCURRENT ISOLATED LOADING FOR ALL"
+ )
+ }}
+ ```
+ * in seed configuration:
+ ```yaml
+ seeds:
+ :
+ table_option:"NO FALLBACK"
+ ```
+ ```yaml
+ seeds:
+ :
+ table_option:"NO FALLBACK, NO JOURNAL"
+ ```
+ ```yaml
+ seeds:
+ :
+ table_option: "NO FALLBACK, NO JOURNAL, CHECKSUM = ON,
+ NO MERGEBLOCKRATIO,
+ WITH CONCURRENT ISOLATED LOADING FOR ALL"
+ ```
For details, see [CREATE TABLE documentation](https://docs.teradata.com/r/76g1CuvvQlYBjb2WPIuk3g/B6Js16DRQVwPDjgJ8rz7hg).
@@ -160,46 +181,67 @@ id: "teradata-configs"
```
Examples:
-
- :::info Separators between statements
- Note, unlike with `table_option` statements, there are no commas between statements in `index` config.
- :::
-
- ```yaml
- {{
- config(
- materialized="table",
- index="UNIQUE PRIMARY INDEX ( GlobalID )"
- )
- }}
- ```
-
- ```yaml
- {{
- config(
- materialized="table",
- index="PRIMARY INDEX(id)
- PARTITION BY RANGE_N(create_date
- BETWEEN DATE '2020-01-01'
- AND DATE '2021-01-01'
- EACH INTERVAL '1' MONTH)"
- )
- }}
- ```
-
- ```yaml
- {{
- config(
- materialized="table",
- index="PRIMARY INDEX(id)
- PARTITION BY RANGE_N(create_date
- BETWEEN DATE '2020-01-01'
- AND DATE '2021-01-01'
- EACH INTERVAL '1' MONTH)
- INDEX index_attrA (attrA) WITH LOAD IDENTITY"
- )
- }}
- ```
+ * in sql materialization definition file:
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ index="UNIQUE PRIMARY INDEX ( GlobalID )"
+ )
+ }}
+ ```
+ > :information_source: Note, unlike in `table_option`, there are no commas between index statements!
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ index="PRIMARY INDEX(id)
+ PARTITION BY RANGE_N(create_date
+ BETWEEN DATE '2020-01-01'
+ AND DATE '2021-01-01'
+ EACH INTERVAL '1' MONTH)"
+ )
+ }}
+ ```
+ ```yaml
+ {{
+ config(
+ materialized="table",
+ index="PRIMARY INDEX(id)
+ PARTITION BY RANGE_N(create_date
+ BETWEEN DATE '2020-01-01'
+ AND DATE '2021-01-01'
+ EACH INTERVAL '1' MONTH)
+ INDEX index_attrA (attrA) WITH LOAD IDENTITY"
+ )
+ }}
+ ```
+ * in seed configuration:
+ ```yaml
+  seeds:
+    <project-name>:
+      index: "UNIQUE PRIMARY INDEX ( GlobalID )"
+ ```
+  > :information_source: Note that, unlike in `table_option`, there are no commas between index statements.
+ ```yaml
+  seeds:
+    <project-name>:
+      index: "PRIMARY INDEX(id)
+        PARTITION BY RANGE_N(create_date
+          BETWEEN DATE '2020-01-01'
+          AND DATE '2021-01-01'
+          EACH INTERVAL '1' MONTH)"
+ ```
+ ```yaml
+  seeds:
+    <project-name>:
+      index: "PRIMARY INDEX(id)
+        PARTITION BY RANGE_N(create_date
+          BETWEEN DATE '2020-01-01'
+          AND DATE '2021-01-01'
+          EACH INTERVAL '1' MONTH)
+        INDEX index_attrA (attrA) WITH LOAD IDENTITY"
+ ```
## Seeds
:::info Using seeds to load raw data
@@ -220,6 +262,35 @@ Loading CSVs using dbt's seed functionality is not performant for large files. C
+use_fastload: true
```
+#### Grants
+
+Grants are supported in the dbt-teradata adapter from release version 1.2.0 onward. You can use grants to manage access to the datasets you're producing with dbt. To implement these permissions, define grants as resource configs on each model, seed, or snapshot. Define the default grants that apply to the entire project in your `dbt_project.yml` (see the project-level sketch further down), and define model-specific grants within each model's SQL or YAML file.
+
+For example, in `models/schema.yml`:
+ ```yaml
+ models:
+   - name: model_name
+     config:
+       grants:
+         select: ['user_a', 'user_b']
+ ```
+
+Another example, granting multiple privileges:
+
+ ```yaml
+ models:
+   - name: model_name
+     config:
+       materialized: table
+       grants:
+         select: ["user_b"]
+         insert: ["user_c"]
+ ```
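+
+The same `grants` config can also be set project-wide in `dbt_project.yml`. A minimal sketch, reusing `user_a` from the example above (replace it with your own grantee):
+
+ ```yaml
+ models:
+   +grants:
+     select: ['user_a']
+ ```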
+> :information_source: `copy_grants` is not supported in Teradata.
+
+For more information on grants, refer to [grants](https://docs.getdbt.com/reference/resource-configs/grants).
+
## Common Teradata-specific tasks
* *collect statistics* - when a table is created or modified significantly, there might be a need to tell Teradata to collect statistics for the optimizer. It can be done using `COLLECT STATISTICS` command. You can perform this step using dbt's `post-hooks`, e.g.:
diff --git a/website/docs/reference/resource-configs/upsolver-configs.md b/website/docs/reference/resource-configs/upsolver-configs.md
new file mode 100644
index 00000000000..b917ee2cc58
--- /dev/null
+++ b/website/docs/reference/resource-configs/upsolver-configs.md
@@ -0,0 +1,464 @@
+---
+title: "Upsolver configurations"
+id: "upsolver-configs"
+description: "Upsolver Configurations - Read this in-depth guide to learn about configurations in dbt."
+---
+
+## Supported Upsolver SQLake functionality
+
+| COMMAND | STATE | MATERIALIZED |
+| ------ | ------ | ------ |
+| SQL compute cluster | not supported | - |
+| SQL connections| supported | connection |
+| SQL copy job | supported | incremental |
+| SQL merge job | supported | incremental |
+| SQL insert job | supported | incremental |
+| SQL materialized views | supported | materializedview |
+| Expectations | supported | incremental |
+
+## Configs materialization
+
+| Config | Required | Materialization | Description | Example |
+| ------ | --------- | --------------- | ---------- | ------- |
+| connection_type | Yes | connection | Connection identifier: S3/GLUE_CATALOG/KINESIS | connection_type='S3' |
+| connection_options | Yes | connection | Dictionary of options supported by selected connection | connection_options={ 'aws_role': 'aws_role', 'external_id': 'SAMPLES', 'read_only': True } |
+| incremental_strategy | No | incremental | Define one of the incremental strategies: merge/copy/insert. Default: copy | incremental_strategy='merge' |
+| source | No | incremental | Define source to copy from: S3/KAFKA/KINESIS | source = 'S3' |
+| target_type | No | incremental | Define the target type: REDSHIFT/ELASTICSEARCH/S3/SNOWFLAKE/POSTGRES. Default: None (data lake) | target_type='Snowflake' |
+| target_prefix | No | incremental | Define the PREFIX for the ELASTICSEARCH target type | target_prefix = 'orders' |
+| target_location | No | incremental | Define the LOCATION for the S3 target type | target_location = 's3://your-bucket-name/path/to/folder/' |
+| schema | Yes/No | incremental | Define the target schema. Required if `target_type` is set (no table is created in a metastore connection in that case) | schema = 'target_schema' |
+| database | Yes/No | incremental | Define the target connection. Required if `target_type` is set (no table is created in a metastore connection in that case) | database = 'target_connection' |
+| alias | Yes/No | incremental | Define the target table. Required if `target_type` is set (no table is created in a metastore connection in that case) | alias = 'target_table' |
+| delete_condition | No | incremental | Records that match the ON condition and a delete condition can be deleted | delete_condition='nettotal > 1000' |
+| partition_by | No | incremental | List of dictionaries to define partition_by for target metastore table | partition_by=[{'field':'$field_name'}] |
+| primary_key | No | incremental | List of dictionaries to define the primary key for the target metastore table | primary_key=[{'field':'customer_email', 'type':'string'}] |
+| map_columns_by_name | No | incremental | Maps columns from the SELECT statement to the table. Boolean. Default: False | map_columns_by_name=True |
+| sync | No | incremental/materializedview | Boolean option to define whether the job is synchronized or non-synchronized. Default: False | sync=True |
+| options | No | incremental/materializedview | Dictionary of job options | options={ 'START_FROM': 'BEGINNING', 'ADD_MISSING_COLUMNS': True } |
+
+## SQL connection
+
+Connections provide Upsolver with the credentials to bring your data into SQLake and to write your transformed data out to various services. For more details, see ["Upsolver SQL connections"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-connections).
+As a dbt model, a connection is a model with `materialized='connection'`:
+
+```sql
+{{ config(
+ materialized='connection',
+ connection_type={ 'S3' | 'GLUE_CATALOG' | 'KINESIS' | 'KAFKA'| 'SNOWFLAKE' },
+ connection_options={}
+ )
+}}
+```
+
+Running this model compiles CREATE CONNECTION (or ALTER CONNECTION, if the connection already exists) SQL and sends it to the Upsolver engine. The connection is named after the model.
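+
+For example, an S3 connection model might look like the following sketch. The role ARN and external ID values are illustrative placeholders only, mirroring the `connection_options` example in the configs table above:
+
+```sql
+{{ config(
+    materialized='connection',
+    connection_type='S3',
+    connection_options={
+      'aws_role': 'arn:aws:iam::001234567890:role/upsolver-role',
+      'external_id': 'SAMPLES',
+      'read_only': True
+    }
+  )
+}}
+```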
+
+## SQL copy job
+
+A COPY FROM job allows you to copy your data from a given source into a table created in a metastore connection. This table then serves as your staging table and can be used with SQLake transformation jobs to write to various target locations. For more details, see ["Upsolver SQL copy-from"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/copy-from).
+
+As a dbt model, a copy job is a model with `materialized='incremental'`:
+
+```sql
+{{ config( materialized='incremental',
+ sync=True|False,
+ source = 'S3'| 'KAFKA' | ... ,
+ options={
+ 'option_name': 'option_value'
+ },
+ partition_by=[{}]
+ )
+}}
+SELECT * FROM {{ ref() }}
+```
+
+Running this model compiles CREATE TABLE SQL for a data lake target (or ALTER TABLE, if the table already exists) and CREATE COPY JOB (or ALTER COPY JOB, if the job already exists) SQL, and sends them to the Upsolver engine. The table is named after the model; the job is named after the model with the suffix '_job'.
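+
+As a sketch only (the referenced connection model name, the partition field, and the option values are illustrative assumptions, not requirements of the adapter):
+
+```sql
+{{ config(
+    materialized='incremental',
+    sync=True,
+    source='S3',
+    options={
+      'START_FROM': 'BEGINNING',
+      'ADD_MISSING_COLUMNS': True
+    },
+    partition_by=[{'field': '$event_date'}]
+  )
+}}
+SELECT * FROM {{ ref('my_s3_connection') }}
+```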
+
+## SQL insert job
+
+An INSERT job defines a query that pulls in a set of data based on the given SELECT statement and inserts it into the designated target. This query is then run periodically based on the RUN_INTERVAL defined within the job. For more details, see ["Upsolver SQL insert"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/insert).
+
+As a dbt model, an insert job is a model with `materialized='incremental'` and `incremental_strategy='insert'`:
+
+```sql
+{{ config( materialized='incremental',
+ sync=True|False,
+ map_columns_by_name=True|False,
+ incremental_strategy='insert',
+ options={
+ 'option_name': 'option_value'
+ },
+ primary_key=[{}]
+ )
+}}
+SELECT ...
+FROM {{ ref() }}
+WHERE ...
+GROUP BY ...
+HAVING COUNT(DISTINCT orderid::string) ...
+```
+
+Running this model compiles CREATE TABLE SQL for a data lake target (or ALTER TABLE, if the table already exists) and CREATE INSERT JOB (or ALTER INSERT JOB, if the job already exists) SQL, and sends them to the Upsolver engine. The table is named after the model; the job is named after the model with the suffix '_job'.
+
+## SQL merge job
+
+A MERGE job defines a query that pulls in a set of data based on the given SELECT statement and inserts into, replaces, or deletes the data from the designated target based on the job definition. This query is then run periodically based on the RUN_INTERVAL defined within the job. For more details, see ["Upsolver SQL merge"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/merge).
+
+As a dbt model, a merge job is a model with `materialized='incremental'` and `incremental_strategy='merge'`:
+
+```sql
+{{ config( materialized='incremental',
+ sync=True|False,
+ map_columns_by_name=True|False,
+ incremental_strategy='merge',
+ options={
+ 'option_name': 'option_value'
+ },
+ primary_key=[{}]
+ )
+}}
+SELECT ...
+FROM {{ ref() }}
+WHERE ...
+GROUP BY ...
+HAVING COUNT ...
+```
+
+Running this model compiles CREATE TABLE SQL for a data lake target (or ALTER TABLE, if the table already exists) and CREATE MERGE JOB (or ALTER MERGE JOB, if the job already exists) SQL, and sends them to the Upsolver engine. The table is named after the model; the job is named after the model with the suffix '_job'.
+
+## SQL materialized views
+
+When transforming your data, you may find that you need data from multiple source tables in order to achieve your desired result.
+In such a case, you can create a materialized view from one SQLake table in order to join it with your other table (which in this case is considered the main table). For more details, see ["Upsolver SQL materialized views"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/sql-materialized-views).
+
+As a dbt model, a materialized view is a model with `materialized='materializedview'`:
+
+```sql
+{{ config( materialized='materializedview',
+ sync=True|False,
+ options={'option_name': 'option_value'}
+ )
+}}
+SELECT ...
+FROM {{ ref() }}
+WHERE ...
+GROUP BY ...
+```
+
+Running this model compiles CREATE MATERIALIZED VIEW SQL (or ALTER MATERIALIZED VIEW, if the view already exists) and sends it to the Upsolver engine. The materialized view is named after the model.
+
+## Expectations/constraints
+
+Data quality conditions can be added to your job to drop a row or trigger a warning when a column violates a predefined condition.
+
+```sql
+WITH EXPECTATION <expectation_name> EXPECT <sql_predicate>
+ON VIOLATION WARN
+```
+
+Expectations can be implemented with dbt constraints. The supported constraint types are `check` and `not_null`:
+
+```yaml
+models:
+  - name: <model_name>
+    # required
+    config:
+      contract:
+        enforced: true
+    # model-level constraints
+    constraints:
+      - type: check
+        columns: ['<column1>', '<column2>']
+        expression: "column1 <= column2"
+        name: <constraint_name>
+      - type: not_null
+        columns: ['column1', 'column2']
+        name: <constraint_name>
+
+    columns:
+      - name: <column_name>
+        data_type: string
+
+        # column-level constraints
+        constraints:
+          - type: not_null
+          - type: check
+            expression: "REGEXP_LIKE(<column_name>, '^[0-9]{4}[a-z]{5}$')"
+            name: <constraint_name>
+```
+
+## Project examples
+
+> Example projects: [Upsolver/dbt-upsolver/examples](https://github.com/Upsolver/dbt-upsolver/tree/main/examples)
+
+## Connection options
+
+| Option | Storage | Editable | Optional | Config Syntax |
+| -------| --------- | -------- | -------- | ------------- |
+| aws_role | s3 | True | True | 'aws_role': `''` |
+| external_id | s3 | True | True | 'external_id': `''` |
+| aws_access_key_id | s3 | True | True | 'aws_access_key_id': `''` |
+| aws_secret_access_key | s3 | True | True | 'aws_secret_access_key_id': `''` |
+| path_display_filter | s3 | True | True | 'path_display_filter': `''` |
+| path_display_filters | s3 | True | True | 'path_display_filters': (`''`, ...) |
+| read_only | s3 | True | True | 'read_only': True/False |
+| encryption_kms_key | s3 | True | True | 'encryption_kms_key': `''` |
+| encryption_customer_managed_key | s3 | True | True | 'encryption_customer_kms_key': `''` |
+| comment | s3 | True | True | 'comment': `''` |
+| host | kafka | False | False | 'host': `''` |
+| hosts | kafka | False | False | 'hosts': (`''`, ...) |
+| consumer_properties | kafka | True | True | 'consumer_properties': `''` |
+| version | kafka | False | True | 'version': `''` |
+| require_static_ip | kafka | True | True | 'require_static_ip': True/False |
+| ssl | kafka | True | True | 'ssl': True/False |
+| topic_display_filter | kafka | True | True | 'topic_display_filter': `''` |
+| topic_display_filters | kafka | True | True | 'topic_display_filters': (`''`, ...) |
+| comment | kafka | True | True | 'comment': `''` |
+| aws_role | glue_catalog | True | True | 'aws_role': `''` |
+| external_id | glue_catalog | True | True | 'external_id': `''` |
+| aws_access_key_id | glue_catalog | True | True | 'aws_access_key_id': `''` |
+| aws_secret_access_key | glue_catalog | True | True | 'aws_secret_access_key': `''` |
+| default_storage_connection | glue_catalog | False | False | 'default_storage_connection': `''` |
+| default_storage_location | glue_catalog | False | False | 'default_storage_location': `''` |
+| region | glue_catalog | False | True | 'region': `''` |
+| database_display_filter | glue_catalog | True | True | 'database_display_filter': `''` |
+| database_display_filters | glue_catalog | True | True | 'database_display_filters': (`''`, ...) |
+| comment | glue_catalog | True | True | 'comment': `''` |
+| aws_role | kinesis | True | True | 'aws_role': `''` |
+| external_id | kinesis | True | True | 'external_id': `''` |
+| aws_access_key_id | kinesis | True | True | 'aws_access_key_id': `''` |
+| aws_secret_access_key | kinesis | True | True | 'aws_secret_access_key': `''` |
+| region | kinesis | False | False | 'region': `''` |
+| read_only | kinesis | False | True | 'read_only': True/False |
+| max_writers | kinesis | True | True | 'max_writers': `` |
+| stream_display_filter | kinesis | True | True | 'stream_display_filter': `''` |
+| stream_display_filters | kinesis | True | True | 'stream_display_filters': (`''`, ...) |
+| comment | kinesis | True | True | 'comment': `''` |
+| connection_string | snowflake | True | False | 'connection_string': `''` |
+| user_name | snowflake | True | False | 'user_name': `''` |
+| password | snowflake | True | False | 'password': `''` |
+| max_concurrent_connections | snowflake | True | True | 'max_concurrent_connections': `` |
+| comment | snowflake | True | True | 'comment': `''` |
+| connection_string | redshift | True | False | 'connection_string': `''` |
+| user_name | redshift | True | False | 'user_name': `''` |
+| password | redshift | True | False | 'password': `''` |
+| max_concurrent_connections | redshift | True | True | 'max_concurrent_connections': `` |
+| comment | redshift | True | True | 'comment': `''` |
+| connection_string | mysql | True | False | 'connection_string': `''` |
+| user_name | mysql | True | False | 'user_name': `''` |
+| password | mysql | True | False | 'password': `''` |
+| comment | mysql | True | True | 'comment': `''` |
+| connection_string | postgres | True | False | 'connection_string': `''` |
+| user_name | postgres | True | False | 'user_name': `''` |
+| password | postgres | True | False | 'password': `''` |
+| comment | postgres | True | True | 'comment': `''` |
+| connection_string | elasticsearch | True | False | 'connection_string': `''` |
+| user_name | elasticsearch | True | False | 'user_name': `''` |
+| password | elasticsearch | True | False | 'password': `''` |
+| comment | elasticsearch | True | True | 'comment': `''` |
+| connection_string | mongodb | True | False | 'connection_string': `''` |
+| user_name | mongodb | True | False | 'user_name': `''` |
+| password | mongodb | True | False | 'password': `''` |
+| timeout | mongodb | True | True | 'timeout': "INTERVAL 'N' SECONDS" |
+| comment | mongodb | True | True | 'comment': `''` |
+| connection_string | mssql | True | False | 'connection_string': `''` |
+| user_name | mssql | True | False | 'user_name': `''` |
+| password | mssql | True | False | 'password': `''` |
+| comment | mssql | True | True | 'comment': `''` |
+
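+To see how these options map onto a model config, here is a hedged sketch of a Kafka connection; the host names, topic filter, and comment values are placeholders:
+
+```sql
+{{ config(
+    materialized='connection',
+    connection_type='KAFKA',
+    connection_options={
+      'hosts': ('kafka-broker-1:9092', 'kafka-broker-2:9092'),
+      'ssl': True,
+      'topic_display_filter': 'prod-',
+      'comment': 'Kafka connection managed by dbt'
+    }
+  )
+}}
+```
+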
+## Target options
+
+| Option | Storage | Editable | Optional | Config Syntax |
+| -------| --------- | -------- | -------- | ------------- |
+| globally_unique_keys | datalake | False | True | 'globally_unique_keys': True/False |
+| storage_connection | datalake | False | True | 'storage_connection': `''` |
+| storage_location | datalake | False | True | 'storage_location': `''` |
+| compute_cluster | datalake | True | True | 'compute_cluster': `''` |
+| compression | datalake | True | True | 'compression': 'SNAPPY/GZIP' |
+| compaction_processes | datalake | True | True | 'compaction_processes': `` |
+| disable_compaction | datalake | True | True | 'disable_compaction': True/False |
+| retention_date_partition | datalake | False | True | 'retention_date_partition': `''` |
+| table_data_retention | datalake | True | True | 'table_data_retention': `''` |
+| column_data_retention | datalake | True | True | 'column_data_retention': ({'COLUMN' : `''`,'DURATION': `''`}) |
+| comment | datalake | True | True | 'comment': `''` |
+| storage_connection | materialized_view | False | True | 'storage_connection': `''` |
+| storage_location | materialized_view | False | True | 'storage_location': `''` |
+| max_time_travel_duration | materialized_view | True | True | 'max_time_travel_duration': `''` |
+| compute_cluster | materialized_view | True | True | 'compute_cluster': `''` |
+| column_transformations | snowflake | False | True | 'column_transformations': {`''` : `''` , ...} |
+| deduplicate_with | snowflake | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} |
+| exclude_columns | snowflake | False | True | 'exclude_columns': (`''`, ...) |
+| create_table_if_missing | snowflake | False | True | 'create_table_if_missing': True/False |
+| run_interval | snowflake | False | True | 'run_interval': `''` |
+
+## Transformation options
+
+| Option | Storage | Editable | Optional | Config Syntax |
+| -------| --------- | -------- | -------- | ------------- |
+| run_interval | s3 | False | True | 'run_interval': `''` |
+| start_from | s3 | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| end_at | s3 | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | s3 | True | True | 'compute_cluster': `''` |
+| comment | s3 | True | True | 'comment': `''` |
+| skip_validations | s3 | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) |
+| skip_all_validations | s3 | False | True | 'skip_all_validations': True/False |
+| aggregation_parallelism | s3 | True | True | 'aggregation_parallelism': `` |
+| run_parallelism | s3 | True | True | 'run_parallelism': `` |
+| file_format | s3 | False | False | 'file_format': '(type = ``)' |
+| compression | s3 | False | True | 'compression': 'SNAPPY/GZIP ...' |
+| date_pattern | s3 | False | True | 'date_pattern': `''` |
+| output_offset | s3 | False | True | 'output_offset': `''` |
+| run_interval | elasticsearch | False | True | 'run_interval': `''` |
+| routing_field_name | elasticsearch | True | True | 'routing_field_name': `''` |
+| start_from | elasticsearch | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| end_at | elasticsearch | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | elasticsearch | True | True | 'compute_cluster': `''` |
+| skip_validations | elasticsearch | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) |
+| skip_all_validations | elasticsearch | False | True | 'skip_all_validations': True/False |
+| aggregation_parallelism | elasticsearch | True | True | 'aggregation_parallelism': `` |
+| run_parallelism | elasticsearch | True | True | 'run_parallelism': `` |
+| bulk_max_size_bytes | elasticsearch | True | True | 'bulk_max_size_bytes': `` |
+| index_partition_size | elasticsearch | True | True | 'index_partition_size': 'HOURLY/DAILY ...' |
+| comment | elasticsearch | True | True | 'comment': `''` |
+| custom_insert_expressions | snowflake | True | True | 'custom_insert_expressions': {'INSERT_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} |
+| custom_update_expressions | snowflake | True | True | 'custom_update_expressions': {'UPDATE_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} |
+| keep_existing_values_when_null | snowflake | True | True | 'keep_existing_values_when_null': True/False |
+| add_missing_columns | snowflake | False | True | 'add_missing_columns': True/False |
+| run_interval | snowflake | False | True | 'run_interval': `''` |
+| commit_interval | snowflake | True | True | 'commit_interval': `''` |
+| start_from | snowflake | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| end_at | snowflake | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | snowflake | True | True | 'compute_cluster': `''` |
+| skip_validations | snowflake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) |
+| skip_all_validations | snowflake | False | True | 'skip_all_validations': True/False |
+| aggregation_parallelism | snowflake | True | True | 'aggregation_parallelism': `` |
+| run_parallelism | snowflake | True | True | 'run_parallelism': `` |
+| comment | snowflake | True | True | 'comment': `''` |
+| add_missing_columns | datalake | False | True | 'add_missing_columns': True/False |
+| run_interval | datalake | False | True | 'run_interval': `''` |
+| start_from | datalake | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| end_at | datalake | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | datalake | True | True | 'compute_cluster': `''` |
+| skip_validations | datalake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) |
+| skip_all_validations | datalake | False | True | 'skip_all_validations': True/False |
+| aggregation_parallelism | datalake | True | True | 'aggregation_parallelism': `` |
+| run_parallelism | datalake | True | True | 'run_parallelism': `` |
+| comment | datalake | True | True | 'comment': `''` |
+| run_interval | redshift | False | True | 'run_interval': `''` |
+| start_from | redshift | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| end_at | redshift | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | redshift | True | True | 'compute_cluster': `''` |
+| skip_validations | redshift | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) |
+| skip_all_validations | redshift | False | True | 'skip_all_validations': True/False |
+| aggregation_parallelism | redshift | True | True | 'aggregation_parallelism': `` |
+| run_parallelism | redshift | True | True | 'run_parallelism': `` |
+| skip_failed_files | redshift | False | True | 'skip_failed_files': True/False |
+| fail_on_write_error | redshift | False | True | 'fail_on_write_error': True/False |
+| comment | redshift | True | True | 'comment': `''` |
+| run_interval | postgres | False | True | 'run_interval': `''` |
+| start_from | postgres | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| end_at | postgres | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | postgres | True | True | 'compute_cluster': `''` |
+| skip_validations | postgres | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) |
+| skip_all_validations | postgres | False | True | 'skip_all_validations': True/False |
+| aggregation_parallelism | postgres | True | True | 'aggregation_parallelism': `` |
+| run_parallelism | postgres | True | True | 'run_parallelism': `` |
+| comment | postgres | True | True | 'comment': `''` |
+
+## Copy options
+
+| Option | Storage | Category | Editable | Optional | Config Syntax |
+| -------| ---------- | -------- | -------- | -------- | ------------- |
+| topic | kafka | source_options | False | False | 'topic': `''` |
+| exclude_columns | kafka | job_options | False | True | 'exclude_columns': (`''`, ...) |
+| deduplicate_with | kafka | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} |
+| consumer_properties | kafka | job_options | True | True | 'consumer_properties': `''` |
+| reader_shards | kafka | job_options | True | True | 'reader_shards': `` |
+| store_raw_data | kafka | job_options | False | True | 'store_raw_data': True/False |
+| start_from | kafka | job_options | False | True | 'start_from': 'BEGINNING/NOW' |
+| end_at | kafka | job_options | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | kafka | job_options | True | True | 'compute_cluster': `''` |
+| run_parallelism | kafka | job_options | True | True | 'run_parallelism': `` |
+| content_type | kafka | job_options | True | True | 'content_type': 'AUTO/CSV/...' |
+| compression | kafka | job_options | False | True | 'compression': 'AUTO/GZIP/...' |
+| column_transformations | kafka | job_options | False | True | 'column_transformations': {`''` : `''` , ...} |
+| commit_interval | kafka | job_options | True | True | 'commit_interval': `''` |
+| skip_validations | kafka | job_options | False | True | 'skip_validations': ('MISSING_TOPIC') |
+| skip_all_validations | kafka | job_options | False | True | 'skip_all_validations': True/False |
+| comment | kafka | job_options | True | True | 'comment': `''` |
+| table_include_list | mysql | source_options | True | True | 'table_include_list': (`''`, ...) |
+| column_exclude_list | mysql | source_options | True | True | 'column_exclude_list': (`''`, ...) |
+| exclude_columns | mysql | job_options | False | True | 'exclude_columns': (`''`, ...) |
+| column_transformations | mysql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} |
+| skip_snapshots | mysql | job_options | True | True | 'skip_snapshots': True/False |
+| end_at | mysql | job_options | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | mysql | job_options | True | True | 'compute_cluster': `''` |
+| snapshot_parallelism | mysql | job_options | True | True | 'snapshot_parallelism': `` |
+| ddl_filters | mysql | job_options | False | True | 'ddl_filters': (`''`, ...) |
+| comment | mysql | job_options | True | True | 'comment': `''` |
+| table_include_list | postgres | source_options | False | False | 'table_include_list': (`''`, ...) |
+| column_exclude_list | postgres | source_options | False | True | 'column_exclude_list': (`''`, ...) |
+| heartbeat_table | postgres | job_options | False | True | 'heartbeat_table': `''` |
+| skip_snapshots | postgres | job_options | False | True | 'skip_snapshots': True/False |
+| publication_name | postgres | job_options | False | False | 'publication_name': `''` |
+| end_at | postgres | job_options | True | True | 'end_at': `'/NOW'` |
+| compute_cluster | postgres | job_options | True | True | 'compute_cluster': `''` |
+| comment | postgres | job_options | True | True | 'comment': `''` |
+| parse_json_columns | postgres | job_options | False | False | 'parse_json_columns': True/False |
+| column_transformations | postgres | job_options | False | True | 'column_transformations': {`''` : `''` , ...} |
+| snapshot_parallelism | postgres | job_options | True | True | 'snapshot_parallelism': `` |
+| exclude_columns | postgres | job_options | False | True | 'exclude_columns': (`''`, ...) |
+| location | s3 | source_options | False | False | 'location': `''` |
+| date_pattern | s3 | job_options | False | True | 'date_pattern': `''` |
+| file_pattern | s3 | job_options | False | True | 'file_pattern': `''` |
+| initial_load_pattern | s3 | job_options | False | True | 'initial_load_pattern': `''` |
+| initial_load_prefix | s3 | job_options | False | True | 'initial_load_prefix': `''` |
+| delete_files_after_load | s3 | job_options | False | True | 'delete_files_after_load': True/False |
+| deduplicate_with | s3 | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} |
+| end_at | s3 | job_options | True | True | 'end_at': `'/NOW'` |
+| start_from | s3 | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` |
+| compute_cluster | s3 | job_options | True | True | 'compute_cluster': `''` |
+| run_parallelism | s3 | job_options | True | True | 'run_parallelism': `` |
+| content_type | s3 | job_options | True | True | 'content_type': 'AUTO/CSV...' |
+| compression | s3 | job_options | False | True | 'compression': 'AUTO/GZIP...' |
+| comment | s3 | job_options | True | True | 'comment': `''` |
+| column_transformations | s3 | job_options | False | True | 'column_transformations': {`'