diff --git a/.github/ISSUE_TEMPLATE/improve-docs.yml b/.github/ISSUE_TEMPLATE/a-improve-docs.yml similarity index 98% rename from .github/ISSUE_TEMPLATE/improve-docs.yml rename to .github/ISSUE_TEMPLATE/a-improve-docs.yml index 57dc64cc312..70b173e49a4 100644 --- a/.github/ISSUE_TEMPLATE/improve-docs.yml +++ b/.github/ISSUE_TEMPLATE/a-improve-docs.yml @@ -39,4 +39,4 @@ body: label: Additional information description: Add any other context or screenshots about the feature request here. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9349000f66b..f3a3521bdec 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: true contact_links: - - name: Want to see new content? Open a discussion! - url: https://github.com/dbt-labs/docs.getdbt.com/discussions/new - about: You can open a discussion to propose new content for the dbt product documentation. - name: Have questions about dbt? Join the Community! url: https://www.getdbt.com/community/join-the-community about: You can join the dbt Labs Community to ask and answer questions. diff --git a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml index f138b9e4e06..037da98dc6f 100644 --- a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml +++ b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml @@ -1,4 +1,4 @@ -name: Contribute to the dbt Developer Blog +name: Propose a dbt Developer Blog idea description: > For proposing a new post on the dbt Developer Blog. labels: ["content","developer blog"] diff --git a/.github/ISSUE_TEMPLATE/improve-the-site.yml b/.github/ISSUE_TEMPLATE/improve-the-site.yml index e0556d7374f..dd585324f89 100644 --- a/.github/ISSUE_TEMPLATE/improve-the-site.yml +++ b/.github/ISSUE_TEMPLATE/improve-the-site.yml @@ -1,6 +1,6 @@ -name: Improve the docs.getdbt.com site -description: Make a suggestion or report a problem about the technical implementation of docs.getdbt.com. -labels: ["engineering"] +name: Report a docs.getdbt.com site issue +description: Report a problem about the technical implementation of docs.getdbt.com. +labels: ["engineering","bug"] body: - type: markdown attributes: @@ -39,4 +39,4 @@ body: label: Additional information description: Any additional information, configuration, or data that might be necessary to reproduce the issue. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml b/.github/ISSUE_TEMPLATE/new-dbt-feature.yml deleted file mode 100644 index fa46a189fc4..00000000000 --- a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Start docs project for a new feature -description: For dbt PMs to add docs for their new or updated dbt product features. -labels: ["content","upcoming release"] -body: - - type: markdown - attributes: - value: | - * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). - * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). - - - type: checkboxes - id: contributions - attributes: - label: Contributions - description: This applies to new, unreleased content. - options: - - label: I am a PM or subject matter expert at dbt who is responsible for this feature. 
-
-  - type: textarea
-    attributes:
-      label: Where does this content belong?
-      description: |
-
-        Give as much detail as you can to help us understand where you expect the content to live.
-    validations:
-      required: true
-
-  - type: textarea
-    attributes:
-      label: Link to source material
-      description: |
-        Use the [source material template](https://docs.google.com/document/d/1lLWGMXJFjkY4p7r8ZKhBX73dOLmIjgXZBYq39LqmAJs/edit) to provide source material for this feature.
-    validations:
-      required: true
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml
new file mode 100644
index 00000000000..e19accf6ebb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml
@@ -0,0 +1,62 @@
+name: Add adapter to Trusted list
+description: For adapter maintainers who wish to have their adapter added to the list of Trusted adapters.
+title: "Trust dbt-myadapter"
+labels: ["adapter maintainers"]
+assignees:
+  - dataders
+body:
+  - type: markdown
+    attributes:
+      value: |
+        We're excited that you'd like to support your adapter formally as "Trusted"! This template ensures that you are aware of the process and guidelines, and that you can vouch that your adapter currently meets the standards of a Trusted adapter. For more information, see [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters).
+
+  - type: input
+    id: adapter-repo
+    attributes:
+      label: Link to adapter repo
+      description: Please link to the GitHub repo
+    validations:
+      required: true
+
+  - type: input
+    id: contact
+    attributes:
+      label: Contact Details
+      description: How can we get in touch with you?
+      placeholder: your preferred email and/or dbt Slack handle
+    validations:
+      required: true
+
+  - type: dropdown
+    id: author_type
+    attributes:
+      label: Which of these best describes you?
+      options:
+        - I am a dbt Community member
+        - I work for the vendor on top of which the dbt adapter functions
+    validations:
+      required: true
+
+  - type: checkboxes
+    id: read-program-guide
+    attributes:
+      label: Please agree to each of the following
+      options:
+        - label: I am a maintainer of the adapter being submitted for Trusted status
+          required: true
+        - label: I have read both the [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters) and [Building a Trusted Adapter](https://docs.getdbt.com/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter) pages.
+          required: true
+        - label: I believe that the adapter currently meets the expectations given above
+          required: true
+        - label: I will ensure this adapter stays in compliance with the guidelines
+          required: true
+        - label: I understand that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met
+          required: true
+
+  - type: textarea
+    id: icon
+    attributes:
+      label: What icon should be used?
+      description: |
+        Please share an SVG image that you'd like to be displayed for your adapter. Normally, this is the logo for the data platform on top of which your adapter works. If there's a dark mode version, please also share that.
+ Pasting the image from your clipboard will upload the file to GitHub and create markdown formatting for it to be rendered inline diff --git a/.github/workflows/autogenerated_labeler.yml b/.github/workflows/autogenerated_labeler.yml new file mode 100644 index 00000000000..e6aab0492b8 --- /dev/null +++ b/.github/workflows/autogenerated_labeler.yml @@ -0,0 +1,40 @@ +# **what?** +# Labels issues autogenerated in dbt-core + +# **why?** +# To organize autogenerated issues from dbt-core to make it easier to find and track them. + +# **when?** +# When an issue is opened by the FishtownBuildBot + +name: Add Labels to Autogenerated Issues + +on: + issues: + types: [opened] + +jobs: + add_customized_labels: + if: github.event.issue.user.login == 'FishtownBuildBot' + permissions: + issues: write + + runs-on: ubuntu-latest + steps: + - name: "Determine appropriate labels by repo in title" + id: repo + env: + ISSUE_TITLE: ${{ github.event.issue.title }} + run: | + if [[ "$ISSUE_TITLE" == *"dbt-core"* ]]; then + echo "labels='content,improvement,dbt Core'" >> $GITHUB_OUTPUT + else + echo "labels='content,improvement,adapters'" >> $GITHUB_OUTPUT + fi + + - name: "Add Labels to autogenerated Issues" + id: add-labels + run: | + gh issue edit ${{ github.event.issue.number }} --repo ${{ github.repository }} --add-label ${{ steps.repo.outputs.labels }} + env: + GH_TOKEN: ${{ secrets.DOCS_SECRET }} diff --git a/.github/workflows/label.yml b/.github/workflows/label.yml index 5ebef4f88ca..4de2203647f 100644 --- a/.github/workflows/label.yml +++ b/.github/workflows/label.yml @@ -2,7 +2,7 @@ name: Add/Remove Labels on: pull_request_target: - types: [ opened, closed ] + types: [ opened ] jobs: add_new_contributor_label: @@ -15,24 +15,32 @@ jobs: - uses: actions/github-script@v6 with: script: | - const creator = context.payload.sender.login + const creator = context.payload.sender.login; const opts = github.rest.issues.listForRepo.endpoint.merge({ ...context.issue, creator, - state: 'all' - }) - const issues = await github.paginate(opts) + state: 'all', + }); + + const issues = await github.paginate(opts); + + let isAlreadyContributor = false; + for (const issue of issues) { if (issue.number === context.issue.number) { - continue + continue; } - if (issue.pull_request) { - return // creator is already a contributor + if (issue.pull_request && issue.user.login === creator) { + isAlreadyContributor = true; + break; } } - await github.rest.issues.addLabels({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - labels: ['new contributor'] - }) + + if (!isAlreadyContributor) { + await github.rest.issues.addLabels({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + labels: ['new contributor'], + }); + } diff --git a/.gitignore b/.gitignore index b2746893814..74d338484aa 100755 --- a/.gitignore +++ b/.gitignore @@ -11,10 +11,14 @@ website/yarn.lock website/node_modules website/i18n/* -# Local vs code +# IDE configs .vscode +.idea + # Local Netlify folder .netlify -.vscode .eslintcache + +# Local Vercel folder +.vercel diff --git a/README.md b/README.md index 4dfd8a8be9e..da82ab45fd6 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ You can add code snippets and other content in a tabbed view. To learn more abou # Running the Docs site locally -You can click a link available in a netlify bot PR comment to see and review your changes rendered on a staging server. 
You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/): +You can click a link available in a Vercel bot PR comment to see and review your changes rendered on a staging server. You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/): ## Prerequisites diff --git a/contributing/single-sourcing-content.md b/contributing/single-sourcing-content.md index ca27372e5bc..7c345a6631a 100644 --- a/contributing/single-sourcing-content.md +++ b/contributing/single-sourcing-content.md @@ -15,9 +15,9 @@ Versions are managed in the `versions` array located in the `website/dbt-version ### Adding a new version -To add a new version to the site, a new object must be added to the `versions` array in the same format as existing versions. This object holds two properties: **version** and **EOLDate (See End of Life Dates below)**. +To add a new version to the site, a new object must be added to the `versions` array in the same format as existing versions. This object holds two properties: **version** and **EOLDate (See End of Life Dates below)**. -Example Version: +Example Version: ```jsx exports.versions = [ @@ -36,7 +36,7 @@ The **EOLDate** property determines when a version is no longer supported. A ver When a documentation page is viewed, the **EOLDate** property for the active version is compared to today’s date. If the current version has reached or is nearing the end of support, a banner will show atop the page, notifying the visitor of the end-of-life status. -Two different versions of the banner will show depending on the end-of-life date: +Two different versions of the banner will show depending on the end-of-life date: - When the version is within 3 months of the **EOLDate.** - When the version has passed the **EOLDate.** @@ -76,7 +76,7 @@ exports.versionedPages = [ ## Versioning blocks of content -The **VersionBlock** component provides the ability to version a specific piece of content on a docs page. +The **VersionBlock** component provides the ability to version a specific piece of content on a docs page. This component can be added directly to a markdown file in a similar way as other components (FAQ, File, Lightbox). @@ -90,7 +90,7 @@ This component can be added directly to a markdown file in a similar way as othe Both properties can be used together to set a range where the content should show. In the example below, this content will only show if the selected version is between **0.21** and **1.0**: ```markdown - + Versioned content here @@ -99,7 +99,7 @@ Both properties can be used together to set a range where the content should sho ### Example for versioning entire pages -On the [Docs Defer page](https://docs.getdbt.com/reference/node-selection/defer), tabs are used to show different versions of a piece of code. **v0.21.0 and later** shows `--select`, while **v-.20.x and earlier** changes this to `--models`. +On the [Docs Defer page](https://docs.getdbt.com/reference/node-selection/defer), tabs are used to show different versions of a piece of code. **v0.21.0 and later** shows `--select`, while **v-.20.x and earlier** changes this to `--models`. 
![oldway](https://user-images.githubusercontent.com/3880403/163254165-dea23266-2eea-4e65-b3f0-c7b6d3e51fc3.png) @@ -149,7 +149,7 @@ Using a global variable requires two steps: exports.dbtVariables = { dbtCore: { name: "dbt Core" - } + } } ``` @@ -198,13 +198,13 @@ In the above example, the **dbtCloud** property has a default name of “dbt Clo ### Global variables example -The global `` component can be used inline, for example: +The global `` component can be used inline, for example: ```markdown This piece of markdown content explains why is awesome. ``` -However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. +However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. To use the component at the beginning of a sentence, add a non-breaking space character before the component: @@ -231,7 +231,7 @@ A partial file allows you to reuse content throughout the docs. Here are the ste 2. Go back to the docs file that will pull content from the partial file. 3. Add the following import file: `import ComponentName from '/snippets/_this-is-your-partial-file-name.md';` * You must always add an import file in that format. Note you can name `ComponentName` (a partial component) can be whatever makes sense for your purpose. - * `.md` needs to be added to the end of the filename. + * `.md` needs to be added to the end of the filename. 4. To use the partial component, go to the next line and add ``. This fetches the reusable content in the partial file * Note `anyname` can be whatever makes sense for your purpose. @@ -258,15 +258,15 @@ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam fermentum portti ```markdown Docs content here. -`import SetUpPages from '/snippets/_partial-name.md';` - - +import SetUpPages from '/snippets/_partial-name.md'; + + Docs content here. ``` - `import SetUpPages from '/snippets/_partial-name.md';` — A partial file that will be imported by other files -- `` — A component that imports content from the partial file. You can also use it to pass in data into the partial using props (See 'How to use props to pass different content on multiple pages?' below). +- `` — A component that imports content from the partial file. You can also use it to pass in data into the partial using props (See 'How to use props to pass different content on multiple pages?' below). 4. This will then render the content of the docs in the partial file. @@ -276,32 +276,32 @@ Docs content here.
How to use props to pass different content on multiple pages?
- + You can add props on the component only if you want to pass in data from the component into the partial file. This is useful for using the same partial component on multiple docs pages and displaying different values for each. For example, if we wanted to use a partial on multiple pages and pass in a different 'feature' for each docs page, you can write it as: -``` +```markdown import SetUpPages from '/snippets/_available-enterprise-only.md'; - -` + + ``` - + Then in the `/snippets/_available-enterprise-only.md file`, you can display that feature prop with: - + >This feature: `{props.feature}` other content etc... This will then translate to: - + >This feature: A really cool feature other content etc... In this example, the component ` ### Snippets -The Snippet component allows for content to be reusable throughout the Docs. This is very similar to the existing FAQ component. Using partial files, which is a built-in Docusaurus feature, is recommended over snippets. +The Snippet component allows for content to be reusable throughout the Docs. This is very similar to the existing FAQ component. Using partial files, which is a built-in Docusaurus feature, is recommended over snippets. Creating and using a snippet requires two steps: diff --git a/netlify.toml b/netlify.toml deleted file mode 100644 index 6ab92757410..00000000000 --- a/netlify.toml +++ /dev/null @@ -1,2 +0,0 @@ -[build] - functions = "functions" diff --git a/website/.gitignore b/website/.gitignore index ee62cc96f39..9d56e23a488 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -26,4 +26,7 @@ yarn-error.log* # feeds /static/feeds/atom.xml /static/feeds/rss.json -/static/feeds/rss.xml \ No newline at end of file +/static/feeds/rss.xml + +# Local Vercel folder +.vercel diff --git a/website/api/get-discourse-comments.js b/website/api/get-discourse-comments.js new file mode 100644 index 00000000000..5ac59cfe5f2 --- /dev/null +++ b/website/api/get-discourse-comments.js @@ -0,0 +1,169 @@ +const axios = require('axios') +require("dotenv").config(); + +const { DISCOURSE_DEVBLOG_API_KEY , DISCOURSE_USER_SYSTEM } = process.env +const DEVBLOG_PROD_URL = 'https://docs.getdbt.com/blog/' +const DEV_ENV = 'dev-' +const PREVIEW_ENV = 'deploy-preview-' + +// Set API endpoint and headers +let discourse_endpoint = `https://discourse.getdbt.com` +let headers = { + 'Accept': 'application/json', + 'Api-Key': DISCOURSE_DEVBLOG_API_KEY, + 'Api-Username': DISCOURSE_USER_SYSTEM, +} + +async function getDiscourseComments(request, response) { + let topicId, comments, DISCOURSE_TOPIC_ID; + + const blogUrl = await getBlogUrl(request) + + if (blogUrl === DEVBLOG_PROD_URL) { + DISCOURSE_TOPIC_ID = 21 + } else { + DISCOURSE_TOPIC_ID = 2 + } + + try { + const env = + blogUrl === DEVBLOG_PROD_URL + ? "" + : blogUrl.includes("localhost") + ? DEV_ENV + : PREVIEW_ENV; + const postTitle = `${env}${request.query.title}`; + const postSlug = request.query.slug; + const cleanSlug = cleanUrl(request.query.slug); + const externalId = truncateString(`${env}${cleanSlug}`); + + console.table({ + blogUrl, + postTitle, + postSlug, + cleanSlug, + externalId, + }); + + + if (!postSlug) throw new Error("Unable to query Discourse API. 
Error reading slug.");
+
+    topicId = await searchDiscourseExternalId(externalId);
+
+    // First check if the dev blog post exists in Discourse
+    // Get the comments if it does
+    if (typeof topicId === "number") {
+      comments = await getDiscourseTopicbyID(topicId);
+    } else {
+      // If the dev blog post does not exist in Discourse
+      // Create a new topic and get the comments
+      topicId = await createDiscourseTopic(postTitle, externalId, cleanSlug, blogUrl, DISCOURSE_TOPIC_ID);
+      if (typeof topicId === "number") {
+        comments = await getDiscourseTopicbyID(topicId);
+        comments.shift();
+        comments = { topicId, comments };
+
+        return await response.status(200).json(comments);
+      } else {
+        console.log("Unable to create Discourse topic. TopicID is not a number.");
+        return await response.status(500).json({ error: "Unable to create Discourse topic. TopicID is not a number." });
+      }
+    }
+
+    comments.shift();
+    comments = { topicId, comments };
+
+    return await response.status(200).json(comments);
+  } catch (err) {
+    console.log("err on getDiscourseComments", err);
+    return await response.status(500).json({ error: "Unable to get topics from Discourse." });
+  }
+}
+
+async function createDiscourseTopic(title, externalId, slug, blogUrl, DISCOURSE_TOPIC_ID) {
+  console.log(`Creating a new topic in Discourse - ${title}`)
+  try {
+    const response = await axios.post(`${discourse_endpoint}/posts`, {
+      title: title,
+      raw: `This is a companion discussion topic for the original entry at ${blogUrl}${slug}`,
+      category: DISCOURSE_TOPIC_ID,
+      embed_url: `${blogUrl}${slug}`,
+      external_id: externalId,
+      tags: ['devblog'],
+      visible: false
+    }, { headers })
+
+    let topicId = await response.data.topic_id
+
+    console.log('Topic successfully created with topic_id', topicId)
+
+    return topicId
+
+  } catch(err) {
+    console.log('err on createDiscourseTopic', err)
+    return err
+  }
+}
+
+async function getDiscourseTopicbyID(topicId) {
+  console.log(`Topic found, setting topic id - ${topicId}`)
+  try {
+    let response = await axios.get(`${discourse_endpoint}/t/${topicId}.json`, { headers })
+    let { data } = await response
+    let post_stream = data.post_stream
+    let post_count = data.posts_count
+
+    // If there is more than one comment, make the topic visible in Discourse
+    if (post_count > 1 && data.visible === false) {
+      console.log(`Topic has more than one comment. Changing visibility to visible.`)
+      await axios.put(`${discourse_endpoint}/t/${topicId}`, {
+        visible: true
+      }, { headers })
+    }
+
+    // Filter only 'regular' posts in Discourse. (e.g.
not moderator actions, small_actions, whispers) + post_stream.posts = post_stream.posts.filter(post => post.post_type === 1) + + return post_stream.posts + } catch(err) { + console.log('err on getDiscourseTopicbyID', err) + return err + } +} + +async function searchDiscourseExternalId(externalId) { + console.log(`Searching for external_id in Discourse - ${externalId}`); + try { + const data = await axios.get(`${discourse_endpoint}/t/external_id/${externalId}.json`, { headers }); + return data.data.id; + } catch (err) { + if (err.response.status === 404) { + console.log("No topics found in Discourse."); + return null; + } + console.log("Unable to search Discourse for external_id.", err); + return err; + } +} + + +// Truncate external_id to 50 characters per Discourse API requirements +function truncateString(str) { + if (str.length <= 50) { + return str + } + return str.slice(0, 50) +} + +// Remove query params and hash from URL to prevent duplicate topics +function cleanUrl(url) { + return url.split("?")[0].split("#")[0]; +} + +// Create a function to get the host name from the request and add /blog/ to the end +async function getBlogUrl(req) { + const host = req.headers.host + return `https://${host}/blog/` +} + +module.exports = getDiscourseComments; diff --git a/website/api/get-discourse-topics.js b/website/api/get-discourse-topics.js new file mode 100644 index 00000000000..90d6e5af80e --- /dev/null +++ b/website/api/get-discourse-topics.js @@ -0,0 +1,136 @@ +const axios = require('axios') + +async function getDiscourseTopics(request, response) { + const { DISCOURSE_API_KEY , DISCOURSE_USER } = process.env + + const body = request.body + + try { + // Set API endpoint and headers + let discourse_endpoint = `https://discourse.getdbt.com` + let headers = { + 'Accept': 'application/json', + 'Api-Key': DISCOURSE_API_KEY, + 'Api-Username': DISCOURSE_USER, + } + + const query = buildQueryString(body) + if(!query) throw new Error('Unable to build query string.') + + // Get topics from Discourse + let { data: { posts, topics } } = await axios.get(`${discourse_endpoint}/search?q=${query}`, { headers }) + + // Return empty array if no topics found for search query + // 200 status is used to prevent triggering Datadog alerts + if(!topics || topics?.length <= 0) { + // Log message with encoded query and end function + console.log('Unable to get results from api request.') + console.log(`Search query: ${query}`) + return await response.status(200).json([]) + } + + // Set author and like_count for topics if not querying by specific term + let allTopics = topics + if(!body?.term) { + allTopics = topics.reduce((topicsArr, topic) => { + // Get first post in topic + const firstTopicPost = posts?.find(post => + post?.post_number === 1 && + post?.topic_id === topic?.id + ) + // If post found + // Get username + if(firstTopicPost?.username) { + topic.author = firstTopicPost.username + } + // Get like count + if(firstTopicPost?.like_count) { + topic.like_count = firstTopicPost.like_count + } + + if(firstTopicPost?.blurb) { + topic.blurb = firstTopicPost.blurb + } + + // Push updated topic to array + topicsArr.push(topic) + + return topicsArr + }, []) + } + + // Return topics + //return await returnResponse(200, allTopics) + return await response.status(200).json(allTopics) + } catch(err) { + // Log and return the error + console.log('err', err) + return await response.status(500).json({ error: 'Unable to get topics from Discourse.'}) + } +} + +function buildQueryString(body) { + if(!body) return null + + // 
start with empty query string + let query = '' + + // check param and apply to query if set + for (const [key, value] of Object.entries(body)) { + // validate categories + // if valid, add to query string + if(validateItem({ key, value })) { + if(key === 'category') { + query += `#${value} ` + } else if(key === 'inString') { + query += `in:${value}` + } else if(key === 'status' && Array.isArray(value)) { + value?.map(item => { + query += `${key}:${item} ` + }) + } else { + query += `${key}:${value} ` + } + } + } + + if(query) { + const encodedQuery = encodeURIComponent(query) + return encodedQuery + } +} + +function validateItem({ key, value }) { + // predefined Discourse values + // https://docs.discourse.org/#tag/Search/operation/search + const inStringValues = ['title', 'first', 'pinned', 'wiki'] + const orderValues = ['latest', 'likes', 'views', 'latest_topic'] + const statusValues = ['open', 'closed', 'public', 'archived', 'noreplies', 'single_user', 'solved', 'unsolved'] + + // validate keys + if(key === 'inString') { + return inStringValues.includes(value) + ? true + : false + } else if(key === 'order') { + return orderValues.includes(value) + ? true + : false + } else if(key === 'status') { + if(Array.isArray(value)) { + let isValid = true + value?.map(item => { + if(!statusValues.includes(item)) isValid = false + }) + return isValid + } else { + return statusValues.includes(value) + ? true + : false + } + } else { + return true + } +} + +module.exports = getDiscourseTopics diff --git a/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md b/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md index 944d6fdd3f9..cdfd4da5f5d 100644 --- a/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md +++ b/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md @@ -13,6 +13,13 @@ date: 2020-07-01 is_featured: false --- +:::caution More up-to-date information available + +Since this blog post was first published, many data platforms have added support for [materialized views](/blog/announcing-materialized-views), which are a superior way to achieve the goals outlined here. We recommend them over the below approach. + + +::: + Before I dive into how to create this, I have to say this. **You probably don’t need this**. I, along with my other Fishtown colleagues, have spent countless hours working with clients that ask for near-real-time streaming data. However, when we start digging into the project, it is often realized that the use case is not there. There are a variety of reasons why near real-time streaming is not a good fit. Two key ones are: 1. The source data isn’t updating frequently enough. diff --git a/website/blog/2021-09-15-september-21-product-email.md b/website/blog/2021-09-15-september-21-product-email.md index c18f59a9be5..a3c9993befa 100644 --- a/website/blog/2021-09-15-september-21-product-email.md +++ b/website/blog/2021-09-15-september-21-product-email.md @@ -4,7 +4,6 @@ description: "dbt v1.0 is coming up! 
Don't forget to update your projects to the slug: dbt-product-update-2021-september authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-09-15 diff --git a/website/blog/2021-10-15-october-21-product-update-email.md b/website/blog/2021-10-15-october-21-product-update-email.md index 9e58514c50e..c235e43bf43 100644 --- a/website/blog/2021-10-15-october-21-product-update-email.md +++ b/website/blog/2021-10-15-october-21-product-update-email.md @@ -4,7 +4,6 @@ description: "Stay up-to-date with the latest features in dbt. Read about our Oc slug: dbt-product-update-2021-october authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-10-15 diff --git a/website/blog/2021-11-15-november-21-product-email.md b/website/blog/2021-11-15-november-21-product-email.md index d38685aad53..dd5d2b63956 100644 --- a/website/blog/2021-11-15-november-21-product-email.md +++ b/website/blog/2021-11-15-november-21-product-email.md @@ -4,7 +4,6 @@ description: "Stay up-to-date with the latest features in dbt. Read about our No slug: dbt-product-update-2021-november authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-11-15 diff --git a/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md b/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md deleted file mode 100644 index 3615a6204d6..00000000000 --- a/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md +++ /dev/null @@ -1,173 +0,0 @@ ---- -title: "Understanding the components of the dbt Semantic Layer" -description: "Heard about dbt Metrics or the dbt Semantic Layer and curious to give them a try? Callum McCann digs into what they are, walks through an example, and discusses how they all fit together!" -slug: understanding-the-components-of-the-dbt-semantic-layer - -authors: [callum_mccann] - -tags: [dbt product updates] -hide_table_of_contents: false - -date: 2022-07-27 -is_featured: true ---- - -# Getting started with the dbt Semantic Layer - -> TLDR: The Semantic Layer is made up of a combination of open-source and SaaS offerings and is going to change how your team defines and consumes metrics. - -At last year's Coalesce, Drew showed us the future[^1] - a vision of what metrics in dbt could look like. Since then, we've been getting the infrastructure in place to make that vision a reality. We wanted to share with you where we are today and how it fits into the broader picture of [where we're going](https://www.getdbt.com/blog/dbt-semantic-layer). - -To those who haven't followed this saga with the intensity of [someone watching their investments on the crypto market](https://mobile.twitter.com/scannergr1/status/1536198701215109122/photo/1), we're rolling out this new resource to help you better understand the dbt Semantic Layer and provide clarification on the following things: - -1. What is the dbt Semantic Layer? -2. How do I use it? -3. What is publicly available now? -4. What is still in development? - -With that, lets get into it! - - - -> Some of you might have been around when this was initially being referred to as the Metrics Layer. As we evaluated the long term plans for what this part of dbt was going to become, we realized that naming it the Semantic Layer better reflected its capabilities and where we plan on taking it. - -## What is the dbt Semantic Layer? 
- -The dbt Semantic Layer is a new part of dbt to help improve precision and consistency while expanding flexibility and capability in the modern data stack. Our maestro of metrics, Drew Banin, [released a blog post detailing the vision of where we're going here](https://www.getdbt.com/blog/dbt-semantic-layer). The first use case that we are addressing is one that most practicioners **and** stakeholders are familiar with - metrics. We'll walk through what this looks like in practice later on in this post. - -Under the hood, the dbt Semantic layer is collection of several components - some of these are part of dbt Core, some part of dbt Cloud, and some are net new functionality. They all [combine together like Voltron](https://www.youtube.com/watch?v=5rPSLQxMT8w) to create a single experience through which business users can query data in the context of the metric that is most familiar to them. And the best part is that they can do it in systems they are already comfortable using. - -***What will this look like for my data consumers and business stakeholders?*** - -Ultimately, this looks like people being able to interact with trusted datasets in the tools that they are comfortable with (and eventually new tools designed specifically around metrics). - -An example that we’ve found helpful is [ARR](https://www.zuora.com/billing-topics/annual-recurring-revenue/#:~:text=Annual%20Recurring%20Revenue%2C%20or%20ARR,for%20a%20single%20calendar%20year). A business-critical metric to SaaS companies, ARR can be a tricky calculation to keep consistent across all of the tools used in the business. With the dbt Semantic Layer, this definition would live in dbt and the logic to create the dataset for that metric would be consistent across all different consuming experiences. Best of all, definition changes would get reflected in downstream tools, so you no longer need to manually search and update every downstream dependency. Callum of 3 years ago is jumping with joy. - -***That’s good and all, but what does this look like for practitioners to use?*** - -The dbt Semantic layer is comprised of the following components[^2]: - -**Available Today** - -- **[`metric` node in dbt Core :](/docs/build/metrics)** Similar to `models` or `sources` , this is a specific node type in dbt Core. It is the definition of a time-series aggregation over a table that supports zero or more dimensions. The resulting node is stored in the `manifest.json` just like `models` and referenced in the DAG. -- **[`dbt_metrics` package:](https://github.com/dbt-labs/dbt_metrics)** this package provides macros that combine the version-controlled metric definition and query-time parameters (like dimensions, a time grain, and secondary calculations) to generate a SQL query which calculates the metric value. -- **[dbt Cloud Metadata API](https://docs.getdbt.com/docs/dbt-cloud-apis/metadata-api):** a GraphQL API which supports arbitrary queries over the metadata produced by dbt Cloud jobs. Contains metadata related to the accuracy, recency, configuration, and structure of the views and tables in the warehouse, as well as much more. - -**New** - -- **dbt Server:** this component wraps dbt Core in a persistent server that is responsible for handling RESTful API requests for dbt operations. It’s a thin interface that is primarily responsible for performance and reliability in production environments. 
-- **dbt Cloud proxy server:** this component enables dbt Cloud to dynamically rewrite requests to a data warehouse and compile dbt-SQL into raw SQL that the database understands. It then returns the dataset produced by the raw SQL to the platform that sent it. - -![Untitled](/img/blog/2022-07-27-getting-started-with-the-dbt-semantic-layer/semantic-layer-description.png) - -### Understanding how and when to use metrics? - -> Use of metrics and the metrics package is recommended for experienced dbt users and early adopters who want to explore this functionality. - -Let's walk through an example of how you can use the components above to get started today using our old friend - [the Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics). We'll take a look at how you can start defining and testing metrics today as well as how you'll interact with them once the new components are released. - -**When to use Metrics** - -The first question you need to ask is, *Should we be using metrics?* - -It is our belief that metrics are not a one-size fits all solution. They are designed for core business metrics where consistency and precision are of key importance, not for exploratory use cases or ad hoc analysis. Our shorthand way of determining whether the metric should be defined in dbt has been - *is this something our teams need to report on?* - -So, let’s say the CFO of our Jaffle comes to us on a Monday morning and commands the data team to overhaul how we're reporting on Revenue. Our Regional Manager Jim and Sales Director Pam[^3] have been giving him different reports! Right now its a mess of tools and inconsistencies - Jim’s numbers are defined in Tableau and say one thing, Pam’s within Hex and say another! The CFO is frustrated with it and wants a cohesive experience across the company where everyone has the same numbers for revenue. It passes the report test, it’s an important business metric; away we go! - -**Defining the Metric with Metric Node** - -In this example, we’ll say that both Jim and Pam are pulling from a table created by dbt called `orders`. It currently contains fields for `amount` and all different methods of payment_amounts, such as credit cards or gift cards. Jim has been calculating revenue by summing up the `credit_card_amount` and `gift_card_amount` fields, as he forgot to update his definition when the business added coupons and bank transfers payments. Meanwhile, Pam is correctly summing the `amount` field but hasn’t accounted for return orders that shouldn’t be counted! - -The first step is creating a unified definition for what revenue is. In order to do this, we will create the following yml definition within our dbt repo: - -```yaml -version: 2 - -metrics: - - name: revenue - label: Revenue - model: ref('orders') - description: "The total revenue of our jaffle business" - - type: sum - sql: amount - - timestamp: order_date - time_grains: [day, week, month, year] - - dimensions: - - customer_status - - has_coupon_payment - - has_bank_transfer_payment - - has_credit_card_payment - - has_gift_card_payment - - filters: - - field: status - operator: '=' - value: "'completed'" -``` - -This metric has now been defined in the dbt metadata and can be seen in the DAG! - -![Untitled](/img/blog/2022-07-27-getting-started-with-the-dbt-semantic-layer/metrics-dag.png) - -**Running The Metric Package To calculate the metric** - -In order to ensure that both Jim and Pam are retrieving the same numbers for their metric, we’ll need them to both run a metrics `calculate` query. 
In this example, we’re not interested in the specific payment types and only want to see revenue broken up by `week` and `customer_status`. - -```sql -select * -from {{ metrics.calculate( - metric('revenue'), - grain='week', - dimensions=['customer_status'] -) }} -``` -This would return a dataset that looks like this: - -| date_week | customer_status | revenue | -| --- | --- | --- | -| 2018-01-01 | Churn Risk | 43 | -| 2018-01-01 | Churned | 0 | -| 2018-01-01 | Healthy | 26 | -| 2018-01-08 | Churn Risk | 27 | - -Jim and Pam would then be able to reference the `revenue` column within the newly created dataset and never have to worry about the calculation of revenue ever again[^4]! The world is perfect and [balance has been restored.](https://www.youtube.com/watch?v=d1EnW4kn1kg) - -**In the near future with dbt Server** - -When dbt Server releases later this year, the flow of how metrics are consumed will change significantly. Your organization will no longer need to materialize each metric within a model in order to take advantage of the metric definition. Instead, you’ll be able to directly query dbt Server with the metric code provided and have the correct dataset returned to your BI tool of choice. - -Additionally, integration partners will have built out experiences around Metrics using the Metadata API to create unique and creative ways for consumers to obtain metric data while abstracting away complexity. For example, a box that allows the user to select from a list of metrics, time grains, dimensions, and secondary calculation and then have the correct information returned to them regardless of the selection! - -### So what is publicly available now? - -Right now, the two main open-source components that are publicly available are the [`metric` node](/docs/build/metrics) within dbt Core and the `dbt_metrics` package. Combined, these two can operate an introductory semantic layer experience by allowing analytics engineers to define metrics and then query that metric via the metrics package. - -These two components are a static experience that have to be defined in the dbt project (as the selected dimensions are defined at model creation) but are useful for those who want to ensure that metrics remain consistent across every BI tool. If you identify with any of the following conditions, you could be a good fit for implementing this as it exists today: - -- You want to prepare your organization for the full Semantic Layer launch. -- Your organization has at least a few key metrics -- Your organization uses 1 or more BI tools -- Your organization occasionally has issues around different metric calculations -- Your organization wants a centralized location for all metrics so everyone in the business knows where to look - -All of these are great reasons to begin exploring implementing metrics in your dbt project! If you’re curious about what an implementation of this might look like, we recommend referencing the [jaffle_shop_metrics](https://github.com/dbt-labs/jaffle_shop_metrics) repo! - -### What is still in development? - -Both the dbt Cloud proxy server and dbt Server are currently in development, with a scheduled release of later this year. If you’re curious about testing them once they are released, we recommend keeping an eye on our product announcements and then reaching out once they become publicly available! - -### What if I have questions? 
- -If you have any questions about those components, or metrics in general, please feel free to post in the #dbt-metrics-and-server channel on dbt Slack! I hang around there and am always willing to chat metrics! - -### Footnotes -[^1]: That future may not have mentioned robots but I'm holding out for [Jetson's style morning machine](https://www.youtube.com/watch?v=-0S3Jf-NxdI) to help me get ready in the morning. - -[^2]: We’re specifically calling out the licensing because there is a lot of confusion in the community around what is open-source and what isn’t. This is only becoming trickier with the introduction of the BSL licensing, which ensures users can run their own server but it cannot be sold as a cloud service. For more information on why these licensing types were picked, we recommend [Tristan’s blog around licensing dbt.](https://www.getdbt.com/blog/licensing-dbt/). The big takeaway around licensing is that you can still run components of the dbt Semantic Layer even if you aren’t a dbt Cloud customer! - -[^3]: Full transparency, I've never seen the Office. The awkward humor makes me so uncomfortable that I have to turn off the TV. Apologies if the titles of the characters are incorrect. - -[^4]: Psych! They’re definitely interested in the calculation of ARR. In fact, they don’t really trust the numbers **unless** they understand how it’s calculated. This is where they could use the Metadata API in order to query all the information about the metric, such as definition, run-time, acceptable dimensions, etc. Right now Jim and Pam would need to query the API directly but in the future we expect there to be a number of different ways to obtain this information, ranging from [direct integration with the BI tool](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration) all the way to having that information materialized in a dbt information schema! *For current tabular alternatives, there are some interesting macros in the newly released [dbt-project-evaluator package](https://github.com/dbt-labs/dbt-project-evaluator). Take a look there if you’re curious about materializing your metric information!* \ No newline at end of file diff --git a/website/blog/2022-08-31-august-product-update.md b/website/blog/2022-08-31-august-product-update.md index 143d46a37d3..cb4077f3a06 100644 --- a/website/blog/2022-08-31-august-product-update.md +++ b/website/blog/2022-08-31-august-product-update.md @@ -4,7 +4,6 @@ description: "Coalesce is less than 2 months away!" slug: dbt-product-update-2022-august authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2022-08-31 diff --git a/website/blog/2022-10-12-how-to-design-and-structure-metrics.md b/website/blog/2022-10-12-how-to-design-and-structure-metrics.md deleted file mode 100644 index 4f738543dff..00000000000 --- a/website/blog/2022-10-12-how-to-design-and-structure-metrics.md +++ /dev/null @@ -1,394 +0,0 @@ ---- -title: "How to design and structure dbt metrics: Recommendations for getting started" -description: "The introduction of the dbt Semantic Layer expands what users can do with dbt but introduces a familiar questions around where logic should live. Read along as the dbt Labs team talks about best practices through the lens of two different examples!" 
-slug: how-to-design-and-structure-metrics - -authors: [callum_mccann] - -tags: [dbt product updates] -hide_table_of_contents: false - -date: 2022-10-12 -is_featured: true ---- - ---- - -**IMPORTANT:** This document serves as the temporary location for information on how to design and structure your metrics. It is our intention to take this content and turn it into a Guide, like [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), but we feel that codifying information in a Guide first requires that metrics be rigorously tested by the community so that best practices can arise. This document contains our early attempts to create best practices. In other words, read these as suggestions for a new paradigm and share in the community where they do (or don’t) match your experiences! You can find more information on where to do this at the end. - ---- - -## The power of a semantic layer on top of a mature data modeling framework - -As a longtime [dbt Community](https://www.getdbt.com/community/join-the-community/) member, I knew I had to get involved when I first saw the dbt Semantic Layer in the now infamous [`dbt should know about metrics` Github Issue](https://github.com/dbt-labs/dbt-core/issues/4071). It gave me a vision of a world where metrics and business logic were unified across an entire organization; a world where the data team was no longer bound to a single consuming experience and could enable their stakeholders in dozens of different ways. To me, it felt like the opportunity to contribute to the next step of what dbt could become. - -In past roles, I’ve been referred to as the `dbt zealot` and I’ll gladly own that title! It’s not a surprise - dbt was built to serve data practitioners expand the power of our work with software engineering principles. It gave us flexibility and power to serve our organizations. But I always wondered if there were more folks who could directly benefit from interacting with dbt. - -The Semantic Layer expands the reach of dbt **by coupling dbt’s mature data modeling framework with semantic definitions.** The result is a first of its kind data experience that serves both the data practitioners writing your analytics code and stakeholders who depend on it. Metrics are the first step towards this vision, allowing users to version control and centrally define their key business metrics in a single repo while also serving them to the entire business. - -However, this is still a relatively new part of the dbt toolbox and you probably have a lot of questions on how exactly you can do that. This blog contains our early best practice recommendations for metrics in two key areas: -- **Design**: What logic goes into metrics and how to use calculations, filters, dimensions, etc. -- **Structure**: Where these metrics will live in your dbt project and how to compose the files that contain your metrics - -We developed these recommendations by combining the overall philosophy of dbt, with our hands-on learning gathered during the beta period and internal testing. - - - -**Pre-reading:** We recommend reading through the [metrics documentation](/docs/build/metrics), which contains a table of all the required/optional properties. - -### When to put business logic in the semantic layer vs the modeling layer - -Our instinct when designing metrics might be to encode as much information as possible into the semantic layer. 
An example of this is case statements - the analytics engineer’s gut instinct might be to mimic tools of the past and provide complicated case statements for the metric `expression` property to try and capture the nuance of how it should be calculated. - -But remember - you always have the option of performing this logic _in the modeling layer_. This is the key difference between dbt and other semantic layer offerings - by sitting the semantic layer atop a mature transformation layer, you always have the option to configure and optimize your logic within your models and then _define semantic components with intentionality_. - -Getting the balance just right is a learning experience and developing community best practices and standards will take time, which is why it’s important for us to think from first principles. What should really be our goal when determining whether logic lives in a model or a metrics? - -To explore this question and begin to develop an intuition, we’ll walk through two examples of handling this divide. - -## Basic example: Revenue - -### Designing your metrics - -In this example, we’ll cover the basics of defining a metric and a fairly straightforward example of where users can draw the line between the semantic layer and the modeling layer. You should finish this section with a better understanding of dbt metrics and its relationship to the modeling layer. - -In the past, the `marts` tables were often your end stage layer before data was consumed in another tool or system. Now, the mart is the springboard for the creation of our metric. So we'll begin by looking our end-state `marts` model called `order_events` that looks something like the below table, but on the order of millions of rows instead of five. Our finance team uses the below model to better understand revenue but inconsistencies in how it's reported have led to requests that the data team centralize the definition in the dbt repo. - -| event_date | order_id | order_country | order_status | customer_id | customer_status | amount | -| --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | United States | completed | 19 | Healthy | 10 | -| 2022-10-01 | 2 | France | completed | 36 | Churn Risk | 15 | -| 2022-10-02 | 2 | France | returned | 36 | Churned | 15 | -| 2022-10-02 | 3 | Turkey | completed | 20 | Healthy | 80 | -| 2022-10-03 | 4 | Korea | completed | 14 | Churn Risk | 24 | - -### Logic in the modeling layer vs the semantic layer - -We know from our upstream dbt models that the `amount` field represents the revenue from from each order. The inconsistent reporting, however, has arisen because the correct definition of revenue only refers to orders that are completed, not returned. Some teams aren’t familiar with this additional filter and it has led to company wide misreporting. - -The solution is to use the flexibility of the dbt modeling layer to add a boolean field called `is_active_row` that shows whether or not the row in question is the most recent version. With this, we can understand and filter out duplicate rows that may be connected to the same order. - -Once we have this field, we reach a diverging path: - -- If we are not interested in seeing the history of `order_events` , we can add a `where` clause **to the model itself**. This would ensure there is only one row per order. 
-- If we **are** interested in seeing the history of `order_events` , we can add a `filter` to the metric definition to ensure that these duplicate order rows don’t cause us to misreport revenue - -Both of these paths ensure that only the correct orders are included in the metric calculation but one does it at the modeling layer and the other the semantic layer. There is no **best** path here - it depends on your organization's reporting and analytics needs. For this example, we’ll say that our business isn’t interested in understanding orders that have gone from completed to returned and so we’ll use option one moving forward. Now lets define the metric: - -```yaml -version: 2 -metrics: - - name: total_revenue - label: The revenue of our business - model: ref('order_events') - description: "The revenue for our business, as defined by Jerry in Finance" - - calculation_method: sum - expression: amount - - timestamp: event_date - time_grains: [day, week, month, all_time] - - dimensions: - - customer_status - - order_country - - ## We don't need this section because we chose option 1 - ## filters: - ## - field: order_status - ## operator: '=' - ## value: 'completed -``` - -Each of the properties of the above definition are defined [in the metrics documentation](/docs/build/metrics), but let’s dig into the two that might require some additional explanation. The two in question are `expression` and `dimensions`. - -In plain english, the `expression` property is the sql column (or expression) that we are applying the calculation method on. In our example above, this simplifies to `sum(amount)`. However, this doesn’t **need** to be a field in the model. It could also be a sql expression like `case when condition = true then field else 0 end` . - -And then there’s `dimensions`. - -### Choosing which dimensions to use with your metric - -The `dimensions` attribute is a bit more nuanced than the others because it involves curating the ways through which a user can interact with the metric. To that end … - -❗ **We recommend curating dimensions, not including all columns within the model. Most models contain dimensions that aren’t relevant for end-user analysis.** - -What do we mean? Well, there is a lot of nuance in what constitutes a useful or less useful dimension that is dependent on the shape of the underlying data and the ways with which the metric will be consumed. Continuing with our revenue use case, here are some examples: - -- **Useful Dimensions:** - - `customer_status`: This field is helpful to end users because it allows them to break down the revenue generated by each customer status grouping. Members of the retention team might be interested in understanding the long-term trends of revenue from the Churn Risk group so that they can better understand the impact that their retention initiatives campaigns have had. - - `order_country`: This field is useful because it allows members of the finance team to break down the accepted revenue from each country of origin so that they can better understand which countries are experiencing the highest growth. -- **Less Useful Dimensions:** - - `order_status` : Given that order_status is a part of the metric definition, it doesn’t make sense to include in the acceptable dimensions list because the value returned would always be `completed`. - - `order_id`: Each order id corresponds to a single order and a single point in time. 
Grouping the metric of revenue by order_id would just return the base grain of the table and the same value as the amount field - not useful from a metric perspective! -- **Nuanced Dimensions:** - - `customer_id`: This is an interesting field because it can be both good and bad depending on the context in which it is used and the underlying data. In our example use case, this dimension wouldn’t be that useful - it would contain too many unique values and tracking the individual revenue impact by a single customer doesn’t make sense on a retail scale. - - In a SaaS business though, it might make more sense - especially with usage based pricing. The Customer Success team might be interested in tracking the revenue of certain customers and ensuring that they remain consistent. - -To quote Cameron Afzal, Product Manager of the dbt Semantic Layer: - -> Thoughtful curation of dimensions is essential for three main reasons: -- **Relevance:** Analysts must include the dimensions most relevant to answering the question. -- **Trust**: Curating high-quality dimensions with little to no known errors helps ensure trust in analysis results and the decisions that follow. -- **Efficiency**: Curation provides a faster path to high-quality analysis results. -> - -To put it another way, **metrics are most useful when every dimension provided can help provide answers to the business.** - -## Advanced example: NPS - -### Designing a complex metric - -Now let’s look at a more complex example of a metric - one that is built from components that could theoretically themselves be metrics. The metric in question is Net Promoter score, which is used by the dbt Labs internal analytics team to understand the experience that users are having on dbt Cloud. - -For those of you who are unfamiliar with the industry metric of Net Promoter Score, here is a [great article from the folks over at Delighted on how it is calculated.](https://delighted.com/net-promoter-score) The short version of it is `the percentage of promoters - the percentage of detractors`. - ---- - -Here at dbt Labs we provide users with short surveys where they can provide feedback (as well as in a few other locations). The data is collected from those surveys is used to calculate our NPS Score, which helps us understand user sentiment over time. - -Given that these surveys come from a few different sources, there is a large amount of upstream modeling performed in order to unify them in a single model, but the end result is something that looks like the table below: - -| feedback_date | unique_id | feedback_source | user_type | account_plan | score | nps_category | -| --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | nps_tool_1 | developer | team | 5 | detractor | -| 2022-10-01 | 2 | nps_tool_2 | read_only | developer | 8 | promoter | -| 2022-10-02 | 3 | nps_tool_1 | developer | enterprise | 10 | promoter | -| 2022-10-02 | 4 | nps_tool_1 | developer | developer | 7 | passive | -| 2022-10-02 | 5 | nps_tool_2 | developer | team | 9 | promoter | -| 2022-10-03 | 6 | nps_tool_1 | developer | enterprise | 7 | passive | - -The dbt Internal Analytics team ([long may they reign](https://www.linkedin.com/feed/update/urn:li:activity:6962884130569080833/)) took this data and decided to build the NPS Score metric into our repo so that it could be surfaced to stakeholders in multiple tools. 
This process is where we began to form our opinions on what should live in the modeling layer vs semantic layer - but these are sure to progress as we add in more and more real world use cases. - -### Option 1: Putting everything in the semantic layer - -If we wanted to store all the logic inside metric definitions, we could use the following code in the Semantic Layer section to create 6 different metrics that result in the NPS Score metric. This would allow end users to retrieve the NPS Score they are interested in a version-controlled, standard way across any of their BI tools of choice. Additionally, it allows users to individually slice/dice any of the component metrics by themselves. - -```yaml -metrics: - - name: total_respondents - label: Total of NPS Respondents - model: ref('customer_nps') - description: 'The count of users responding to NPS surveys in dbt Cloud.' - calculation_method: count - expression: unique_id - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - - - name: total_promoter_respondents - ......... ##same as total_respondents - filters: - - field: nps_category - operator: '=' - value: "'promoter'" - - - name: total_detractor_respondents - ......... ##same as total_respondents - filters: - - field: nps_category - operator: '=' - value: "'detractor'" - - - name: promoters_pct - label: Percent Promoters (Cloud) - description: 'The percent of dbt Cloud users in the promoters segment.' - calculation_method: expression - expression: "{{metric('total_promoter_respondents')}} / {{metric('total_respondents')}}" - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - - - name: detractor_pct - ... ##same as promoters_pct - expression: "{{metric('total_detractor_respondents')}} / {{metric('total_respondents')}}" - - - name: nps_score - label: Net Promoter Score - description: 'The NPS (-1 to 1) of all dbt Cloud users.' - calculation_method: expression - expression: "{{metric('promoters_pct')}} - {{metric('detractors_pct')}}" - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - -``` - -### Option 2: Keeping logic in the modeling layer - -But what if we didn’t want to encode all that information in the metric definitions? If we didn’t need the ability to dig into the component metrics and only wanted to look at the final score? In that case, we could encode most of the logic into the model itself and define the metric on top of that! - -Thinking through this, we know that our NPS Score is a series of ratios dependent on conditions of which category people fall into with the end result being a number between 100 to -100. That number is usually then *displayed* in a percentage format but it is *calculated* as a number. - -So in order to reduce the complexity of metric code, we can add a new field into the model that assigns an `nps_value` to each survey received. The logic for this field would assign a value of 100, 0, or -100 depending on the survey’s `nps_category`. 
Example code below: - -```sql -case - when nps_category = 'detractor' then -100 - when nps_category = 'promoter' then 100 - else 0 -end as nps_value -``` - -The end result of adding this code to the model would look something like this: - -| feedback_date | unique_id | feedback_source | user_type | account_plan | score | nps_category | nps_value | -| --- | --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | nps_tool_1 | developer | team | 5 | detractor | -100 | -| 2022-10-01 | 2 | nps_tool_2 | read_only | developer | 9 | promoter | 100 | -| 2022-10-02 | 3 | nps_tool_1 | developer | enterprise | 10 | promoter | 100 | -| 2022-10-02 | 4 | nps_tool_1 | developer | developer | 7 | passive | 0 | -| 2022-10-02 | 5 | nps_tool_2 | developer | team | 9 | promoter | 100 | -| 2022-10-03 | 6 | nps_tool_1 | developer | enterprise | 7 | passive | 0 | - -Now that each survey has an associated `nps_value` we can forgo the ratio calculations used in the Metric Logic section and create our NPS Score metric as a single average metric. - -```yaml -metrics: - - name: nps_score - label: NPS Score - model: ref('customer_nps') - calculation_method: average - expression: nps_value - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type -``` - -
- Why does this work? - -This is a slightly different way of calculating NPS from the usually provided formula but it ends up with the same result. Here is why: - -- `promoter_pct` was defined as `total_promoter_respondents` / `total_respondents` - - In our example dataset, this nets out to 3 / 6 = 50%. - - If we instead assign a value of 100 and take the average, it becomes 300 / 6 = 50. -- `detractor_pct` was defined as `total_detractor_respondents` / `total_respondents` - - In our example dataset, this nets out to 1 / 6 = 16.67%. - - If we instead assign a value of 100 and take the average, it becomes -100 / 6 = -16.67. -- Therefore, our `nps_score` follows suit: - - In our example dataset, 50% - 16.67% = 33.33% - - If we instead assign a value of 100 and take the average, it becomes 200 / 6 = 33.33 - -The underlying principle of why this works is based on the fact that averages divide the sum of the values in the set by their number. In more dbt friendly terms, what it really means is that average is creating the following equation: `sum(value)/count(*)`. In the first example implementation, we were doing roughly the same thing with multiple metric definitions - the only difference was our numerator was a count that assigned each row a value of 1. So if we duplicate that logic and give each row a value of 1 then we can create far fewer metrics. - -But that only gets us to the `promoter_pct` and `detractor_pct` metrics. In order to combine these both into a single metric definition, we needed to change the value that we assign. Given that the total range of values that the metric could output is -100 (all detractors) to 100 (all promoters) we can assign each of those categories that peak value, along with 0 for passives. This means that when the numerator is aggregated, it nets out promoters against detractors just like the documented equation does `promoter score - detractor score` . - -
- -**Is this what I should do?** - -[It depends!](https://twitter.com/SBinLondon/status/1413113782214266885) There will be times when it might be better to have logic stored in the modeling layer and there will be times when it might be better to have logic stored in the semantic layer. Our shorthand is to only include logic in the semantic layer if it is needed by our stakeholders - if they don't need to analyze the components, we keep them in the modeling layer. In the end, the needs of your business stakeholders should drive your decision on where to keep this logic. - -## How to structure your metrics - -Now that we’ve designed our metrics, let's move on to structuring them within our project. We'll examine the different ways to organize metrics and take a look at the pros and cons of several strategies. - -### Folder structure - -If you follow [dbt’s best practices for structuring your project](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), you will have a folder structure that looks similar to this: - -```yaml -models: - staging: - intermediate: - marts: -``` - -Your marts folder would most likely contain your end-state models ready for business consumption. Given that metrics are meant for business consumption, we are presented with two options - staying within the same framework or representing metrics as their own level. - -We recommend Option A (metrics within marts) but recognize that some people might prefer Option B (metrics within models). - -**A. Metrics within marts** - -Create a metrics folder within marts and use this to contain all of your metric definitions. - -```yaml -models: - staging: - intermediate: - marts: - metrics: -``` - -**B. Metrics within models** - -Create a metrics folder within models and use this to contain all of your metric definitions. - -```yaml -models: - staging: - intermediate: - marts: - metrics: -``` - -### File structure - -Once you’ve decided ***where*** to put your metrics folder, you can now decide ***how*** you want to structure your metrics within this folder. Choose one of two methods for structuring metrics: - -**Option A: The all-in-one YML method** -This method follows a similar pattern to [dbt’s best practices around model structure](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). The introduction of the metrics folder is the only change from the standard best practice. - -In practice, the all-in-one YML method would look like the following: - -```yaml -## Metrics within Marts -models: - marts: - metrics: - - metrics.yml ------- -## Metrics within Models -models: - metrics: - - metrics.yml -``` - -**Option B: The single-metric-per-file method** -In this method, you create *one* yml file for *each* metric*.* Although this is an opinionated stance that differs from [dbt’s best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), here are some reasons why this **could** be useful: - -- Individual files are more easily discovered by new analytics engineers as your organization expands -- Individual files can more easily define specific code owners that may not be part of the data team. - -For example, Jerry from the Finance department is the code owner for the `revenue` metric definition and oversees it for the business. So, any change to this specific file would need Jerry’s sign-off. 
- -This can be tricky for code owners who aren’t familiar with your git flow, but it brings them into the chain of responsibility for the metric definition. It also helps them take ownership for reporting on this metric and creates a responsible party when definitions need to change. - -The single-file-code-owner method would look like this: - -```yaml -models: - metrics: - marts: - - revenue.yml - - average_order_value.yml - - some_other_metric_name.yml -``` - -### Folder and file structure is a preference, not a hard rule - -In the end, all of the structuring information above is just a recommendation. Your project probably has a defined convention in how nodes are organized, whether or not it follows dbt best practices, and you should continue to follow your own organizational practices. That said, we do recommend that metrics should be separate from model yml files. The reason? - -**Metrics are important business objects unto themselves and should live separate from the model definitions.** - -## A call to action - -This is just the beginning of dbt metrics and the Semantic Layer. We have a number of exciting ideas for expanding capabilities that we plan to begin work on in the coming months. However, we can’t do that without you. - -This semantic layer is a fundamental change to what it means to interact with dbt and ultimately most of the best practices will come from the dbt Community - folks like you. It does not matter if you consider yourself an "expert" on this - we want to talk to you and hear how you are using or would like to use metrics and the semantic layer. Y’all are going to be our guiding light to help us make sure that all the functionality we add helps **you** serve the needs of your business. - -If your experience with the Semantic Layer match what we’ve written in this post, and especially if they don’t, please share [comments and feedback in this Discourse Discussion](https://discourse.getdbt.com/t/how-to-design-and-structure-metrics/5040)! - -Additionally, I would invite you to join us over at #dbt-core-metrics on the dbt Slack where we’ll be posting updates, answering questions, discussing usage, and hopefully responding with the best emojis. diff --git a/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md b/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md index 95988e75f04..bab92000a16 100644 --- a/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md +++ b/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md @@ -4,8 +4,7 @@ description: "Going polyglot is a major next step in the journey of dbt Core. 
It slug: polyglot-dbt-python-dataframes-sql authors: [doug_beatty] - -tags: [dbt product updates] +tags: [dbt tutorials] hide_table_of_contents: false date: 2022-10-18 diff --git a/website/blog/2023-04-26-deprecating-dbt-metrics.md b/website/blog/2023-04-26-deprecating-dbt-metrics.md index 1041f75eb2b..bf23bb992ad 100644 --- a/website/blog/2023-04-26-deprecating-dbt-metrics.md +++ b/website/blog/2023-04-26-deprecating-dbt-metrics.md @@ -5,7 +5,6 @@ slug: deprecating-dbt-metrics authors: [callum_mccann] -tags: [dbt product updates] hide_table_of_contents: false date: 2023-04-26 diff --git a/website/blog/2023-05-01-evolving-data-engineer-craft.md b/website/blog/2023-05-01-evolving-data-engineer-craft.md index 339d0ac380e..a3113240227 100644 --- a/website/blog/2023-05-01-evolving-data-engineer-craft.md +++ b/website/blog/2023-05-01-evolving-data-engineer-craft.md @@ -5,7 +5,6 @@ slug: evolving-data-engineer-craft authors: [sung_chung, kira_furuichi] -tags: [dbt product updates] hide_table_of_contents: false date: 2023-05-01 diff --git a/website/blog/2023-05-02-modeling-ragged-time-varying-hierarchies.md b/website/blog/2023-05-02-modeling-ragged-time-varying-hierarchies.md new file mode 100644 index 00000000000..d436df2628a --- /dev/null +++ b/website/blog/2023-05-02-modeling-ragged-time-varying-hierarchies.md @@ -0,0 +1,448 @@ +--- +title: "Modeling ragged time-varying hierarchies" +description: "Learn how to maximize the utility of complex hierarchical data in your analytics warehouse." +slug: modeling-ragged-time-varying-hierarchies + +authors: [sterling_paramore] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-05-02 +is_featured: true +--- + +This article covers an approach to handling time-varying ragged hierarchies in a dimensional model. These kinds of data structures are commonly found in manufacturing, where components of a product have both parents and children of arbitrary depth and those components may be replaced over the product's lifetime. The strategy described here simplifies many common types of analytical and reporting queries. + +To help visualize this data, we're going to pretend we are a company that manufactures and rents out eBikes in a ride share application. When we build a bike, we keep track of the serial numbers of the components that make up the bike. Any time something breaks and needs to be replaced, we track the old parts that were removed and the new parts that were installed. We also precisely track the mileage accumulated on each of our bikes. Our primary analytical goal is to be able to report on the expected lifetime of each component, so we can prioritize improving that component and reduce costly maintenance. + +## Data model + +Obviously, a real bike could have a hundred or more separate components. To keep things simple for this article, let's just consider the bike, the frame, a wheel, the wheel rim, tire, and tube. Our component hierarchy looks like: + + + +This hierarchy is *ragged* because different paths through the hierarchy terminate at different depths. It is *time-varying* because specific components can be added and removed. + +Now let's take a look at how this data is represented in our source data systems and how it can be transformed to make analytics queries easier. + +### Transactional model + +Our ERP system (Enterprise Resource Planning) contains records that log when a specific component serial number (`component_id`) was installed in or removed from a parent assembly component (`assembly_id`). 
The top-most assembly component is the eBike itself, which has no parent assembly. So when an eBike (specifically, the eBike with serial number "Bike-1") is originally constructed, the ERP system would contain records that look like the following. + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| | Bike-1 | 2023-01-01 | | +| Bike-1 | Frame-1 | 2023-01-01 | | +| Bike-1 | Wheel-1 | 2023-01-01 | | +| Wheel-1 | Rim-1 | 2023-01-01 | | +| Wheel-1 | Tire-1 | 2023-01-01 | | +| Tire-1 | Tube-1 | 2023-01-01 | | + +Now let's suppose this bike has been ridden for a while, and on June 1, the user of the bike reported a flat tire. A service technician then went to the site, replaced the tube that was in the wheel, and installed a new one. They logged this in the ERP system, causing one record to be updated with a `removed_at` date, and another record to be created with the new tube `component_id`. + + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| ... | ... | ... | ... | +| Tire-1 | Tube-1 | 2023-01-01 | 2023-06-01 | +| Tire-1 | Tube-2 | 2023-06-01 | | +| ... | ... | ... | ... | + +After a few more months, there is a small crash. Don't worry, everyone's OK! However, the wheel (`Wheel-1`)is totally broken and must be replaced (with `Wheel-2`). When the technician updates the ERP, the entire hierarchy under the replaced wheel is also updated, as shown below. + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| Bike-1 | Wheel-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Rim-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Tire-1 | 2023-01-01 | 2023-08-01 | +| Tire-1 | Tube-2 | 2023-06-01 | 2023-08-01 | # Note that this part has different install date +| Bike-1 | Wheel-2 | 2023-08-01 | | +| Wheel-2 | Rim-2 | 2023-08-01 | | +| Wheel-2 | Tire-2 | 2023-08-01 | | +| Tire-2 | Tube-3 | 2023-08-01 | | + + +After all of the above updates and additions, our ERP data looks like the following. + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| | Bike-1 | 2023-01-01 | | +| Bike-1 | Frame-1 | 2023-01-01 | | +| Bike-1 | Wheel-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Rim-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Tire-1 | 2023-01-01 | 2023-08-01 | +| Tire-1 | Tube-1 | 2023-01-01 | 2023-06-01 | +| Tire-1 | Tube-2 | 2023-06-01 | 2023-08-01 | +| Bike-1 | Wheel-2 | 2023-08-01 | | +| Wheel-2 | Rim-2 | 2023-08-01 | | +| Wheel-2 | Tire-2 | 2023-08-01 | | +| Tire-2 | Tube-3 | 2023-08-01 | | + +So that's all fine and good from the perspective of the ERP system. But this data structure can be difficult to work with if we want to generate reports that calculate the total mileage accumulated on various components, or the average mileage of a particular component type, or how one component type might affect the lifetime of another component. + +### Multivalued dimensional model + +In dimensional modeling, we have *fact* tables that contain measurements and *dimension* tables that contain the context for those measurements (attributes). In our eBike data warehouse, we have a fact table that contains one record for each eBike for each day it is ridden and the measured mileage accumulated during rides that day. This fact table contains *surrogate key* columns, indicated by the `_sk` suffix. 
These are usually system-generated keys used to join to other tables in the database; the specific values of these keys are not important. + +**`fct_daily_mileage`:** + +| `bike_sk` | `component_sk` | `ride_at` | `miles` | +| - | - | - | - | +| bsk1 | csk1 | 2023-01-01 | 3 | +| bsk1 | csk1 | 2023-01-02 | 2 | +| bsk1 | csk1 | 2023-01-03 | 0 | +| bsk1 | csk1 | 2023-01-04 | 0 | +| ... | ... | ... | ... | +| bsk1 | csk3 | 2023-08-01 | 7 | +| bsk1 | csk3 | 2023-08-02 | 8 | +| bsk1 | csk3 | 2023-08-03 | 4 | + +One of the dimension tables is a simple table containing information about the individual bikes we have manufactured. + +**`dim_bikes`:** + +| `bike_sk` | `bike_id` | `color` | `model_name` | +| - | - | - | - | +| bsk1 | Bike-1 | Orange | Wyld Stallyn | + + +There is a simple many-to-one relationship between `fct_daily_mileage` and `dim_bikes`. If we need to calculate the total mileage accumulated for each bike in our entire fleet of eBikes, we just join the two tables and aggregate on the `miles` measurement. + +```sql +select + dim_bikes.bike_id, + sum(fct_daily_mileage.miles) as miles +from + fct_daily_mileage +inner join + dim_bikes + on + fct_daily_mileage.bike_sk = dim_bikes.bike_sk +group by + 1 +``` + +Extending this to determine if orange bikes get more use than red bikes or whether certain models are preferred are similarly straightforward queries. + +Dealing with all of the components is more complicated because there are many components installed on the same day. The relationship between days when the bikes are ridden and the components is thus *multivalued*. In `dim_bikes`, there is one record per bike and surrogate key. In our components dimension will have multiple records with the same surrogate key and will therefore be a *multivalued dimension*. Of course, to make things even more complicated, the components can change from day to day. To construct the multivalued dimension table, we break down the time-varying component hierarchy into distinct ranges of time where all of the components in a particular bike remain constant. At specific points in time where the components are changed, a new surrogate key is created. The final dimension table for our example above looks like the following, where the `valid_from_at` and `valid_to_at` represent the begin and end of a range of time where all the components of an eBike remain unchanged. 
+ + +**`mdim_components`:** + +| `component_sk` | `assembly_id` | `component_id` | `depth` | `installed_at` | `removed_at` | `valid_from_at` | `valid_to_at` | +| - | - | - | - | - | - | - | - | +| csk1 | | Bike-1 | 0 | 2023-01-01 | | 2023-01-01 | 2023-06-01 | +| csk1 | Bike-1 | Frame-1 | 1 | 2023-01-01 | | 2023-01-01 | 2023-06-01 | +| csk1 | Bike-1 | Wheel-1 | 1 | 2023-01-01 | 2023-08-01 | 2023-01-01 | 2023-06-01 | +| csk1 | Wheel-1 | Rim-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-01-01 | 2023-06-01 | +| csk1 | Wheel-1 | Tire-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-01-01 | 2023-06-01 | +| csk1 | Tire-1 | Tube-1 | 3 | 2023-01-01 | 2023-06-01 | 2023-01-01 | 2023-06-01 | +| csk2 | | Bike-1 | 0 | 2023-01-01 | | 2023-06-01 | 2023-08-01 | +| csk2 | Bike-1 | Frame-1 | 1 | 2023-01-01 | | 2023-06-01 | 2023-08-01 | +| csk2 | Bike-1 | Wheel-1 | 1 | 2023-01-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk2 | Wheel-1 | Rim-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk2 | Wheel-1 | Tire-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk2 | Tire-1 | Tube-2 | 3 | 2023-06-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk3 | | Bike-1 | 0 | 2023-01-01 | | 2023-08-01 | | +| csk3 | Bike-1 | Frame-1 | 1 | 2023-01-01 | | 2023-08-01 | | +| csk3 | Bike-1 | Wheel-2 | 1 | 2023-08-01 | | 2023-08-01 | | +| csk3 | Wheel-2 | Rim-2 | 2 | 2023-08-01 | | 2023-08-01 | | +| csk3 | Wheel-2 | Tire-2 | 2 | 2023-08-01 | | 2023-08-01 | | +| csk3 | Tire-2 | Tube-3 | 3 | 2023-08-01 | | 2023-08-01 | | + +Now, let's look at how this structure can help in writing queries. In a later section of this article, we'll examine the SQL code that can take our ERP table and convert it into this dimensional model. + +### Mileage for a component + +Suppose we wanted to know the total mileage accumulated on "Wheel-1". The SQL code for determining this is very similar to that for determining the mileage for a given bike. + +```sql +select + mdim_components.component_id, + sum(fct_daily_mileage.miles) as miles +from + fct_daily_mileage +inner join + mdim_components + on + fct_daily_mileage.component_sk = mdim_components.component_sk +group by + 1 +where + component_id = 'Wheel-1' +``` + +:::caution + +One thing to be *very cautious* about when working with multivalued dimensions is that you need to be careful interpreting aggregations. For example, suppose we chose to aggregate on `top_assembly_id` (to reduce clutter, this field is not shown in the data model above because it is just "Bike-1" for each record). For this aggregation, we would be over-counting the total mileage on that top assembly because the join would result in a Cartesian product and thus we'd get a ["fan-out" situation](https://community.looker.com/technical-tips-tricks-1021/the-problem-of-sql-fanouts-30232). +::: + +### Bonus: Finding components installed at the same time as other components + +This structure simplifies other kinds of interesting analysis. Suppose we wanted to start exploring how one component affects another, like whether certain brands of tube needed to be replaced more often if they were in a new brand of tire. We can do this by partitioning the data into the segments of time where the components are not changing and looking for other components installed at the same time. For example, to find all of the components that were ever installed at the same time "Tube-3" was installed, we can collect them with a simple window function. 
We could then use the results of this query in a regression or other type of statistical analysis. + +```sql +select distinct + component_id +from + mdim_components +qualify + sum(iff(component_id = 'Tube-3', 1, 0)) over (partition by valid_from_at, valid_to_at) > 0 +``` + + +## SQL code to build the dimensional model + +Now we get to the fun part! This section shows how to take the ERP source data and turn it into the multivalued dimensional model. This SQL code was written and tested using Snowflake, but should be adaptable to other dialects. + +### Traversing the hierarchy + +The first step will be to traverse the hierarchy of components to find all components that belong to the same top assembly. In our example above, we only had one bike and thus just one top assembly; in a real system, there will be many (and we may even swap components between different top assemblies!). + +The key here is to use a [recursive join](https://docs.snowflake.com/en/sql-reference/constructs/with#recursive-clause) to move from the top of the hierarchy to all children and grandchildren. The top of the hierarchy is easy to identify because they are the only records without any parents. + +```sql +with recursive +-- Contains our source data with records that link a child to a parent +components as ( + select + *, + -- Valid dates start as installed/removed, but may be modified as we traverse the hierarchy below + installed_at as valid_from_at, + removed_at as valid_to_at + from + erp_components +), + +-- Get all the source records that are at the top of hierarchy +top_assemblies as ( + select * from components where assembly_id is null +), + +-- This is where the recursion happens that traverses the hierarchy +traversal as ( + -- Start at the top of hierarchy + select + -- Keep track of the depth as we traverse down + 0 as component_hierarchy_depth, + -- Flag to determine if we've entered a circular relationship + false as is_circular, + -- Define an array that will keep track of all of the ancestors of a component + [component_id] as component_trace, + -- At the top of the hierarchy, the component is the top assembly + component_id as top_assembly_id, + + assembly_id, + component_id, + + installed_at, + removed_at, + valid_from_at, + valid_to_at + from + top_assemblies + + union all + + -- Join the current layer of the hierarchy with the next layer down by linking + -- the current component id to the assembly id of the child + select + traversal.component_hierarchy_depth + 1 as component_hierarchy_depth, + -- Check for any circular dependencies + array_contains(components.component_id::variant, traversal.component_trace) as is_circular, + -- Append trace array + array_append(traversal.component_trace, components.component_id) as component_trace, + -- Keep track of the top of the assembly + traversal.top_assembly_id, + + components.assembly_id, + components.component_id, + + components.installed_at, + components.removed_at, + -- As we recurse down the hierarchy, only want to consider time ranges where both + -- parent and child are installed; so choose the latest "from" timestamp and the earliest "to". 
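+      -- (Empty intersections, where the computed valid_from_at ends up on or after valid_to_at,
+      -- are filtered out later by the final CTE's valid_from_at < valid_to_at check.)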
+ greatest(traversal.valid_from_at, components.valid_from_at) as valid_from_at, + least(traversal.valid_to_at, components.valid_to_at) as valid_to_at + from + traversal + inner join + components + on + traversal.component_id = components.assembly_id + and + -- Exclude component assemblies that weren't installed at the same time + -- This may happen due to source data quality issues + ( + traversal.valid_from_at < components.valid_to_at + and + traversal.valid_to_at >= components.valid_from_at + ) + where + -- Stop if a circular hierarchy is detected + not array_contains(components.component_id::variant, traversal.component_trace) + -- There can be some bad data that might end up in hierarchies that are artificially extremely deep + and traversal.component_hierarchy_depth < 20 +), + +final as ( + -- Note that there may be duplicates at this point (thus "distinct"). + -- Duplicates can happen when a component's parent is moved from one grandparent to another. + -- At this point, we only traced the ancestry of a component, and fixed the valid/from dates + -- so that all child ranges are contained in parent ranges. + + select distinct * + from + traversal + where + -- Prevent zero-time (or less) associations from showing up + valid_from_at < valid_to_at +) + +select * from final +``` + +At the end of the above step, we have a table that looks very much like the `erp_components` that it used as the source, but with a few additional valuable columns: + +* `top_assembly_id` - This is the most important output of the hierarchy traversal. It ties all sub components to a their common parent. We'll use this in the next step to chop up the hierarchy into all the distinct ranges of time where the components that share a common top assembly are constant (and each distict range of time and `top_assembly_id` getting their own surrogate key). +* `component_hierarchy_depth` - Indicates how far removed a component is from the top assembly. +* `component_trace` - Contains an array of all the components linking this component to the top assembly. +* `valid_from_at`/`valid_to_at` - If you have really high-quality source data, these will be identical to `installed_at`/`removed_at`. However, in the real world, we've found cases where the installed and removal dates are not consistent between parent and child, either due to a data entry error or a technician forgetting to note when a component was removed. So for example, we may have a parent assembly that was removed along with all of its children, but only the parent assembly has `removed_at` populated. At this point, the `valid_from_at` and `valid_to_at` tidy up these kinds of scenarios. + +### Temporal range join + +The last step is perform a [temporal range join](https://discourse.getdbt.com/t/joining-snapshot-tables-time-range-based-joins/3226) between the top assembly and all of its descendents. This is what splits out all of the time-varying component changes into distinct ranges of time where the component hierarchy is constant. This range join makes use of [the dbt macro in this gist](https://gist.github.com/gnilrets/48886b4c8945dde1da13547c2373df73), the operation of which is out-of-scope for this article, but you are encouraged to investigate it and the discourse post mentioned earlier. 
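+
+Before looking at the model code, it may help to see what a temporal range join does at its core: it matches a parent record to every child record whose validity window overlaps the parent's, and narrows each match to the intersection of the two windows. The sketch below is purely illustrative (the table and column names are made up, and the `trange_join` macro does quite a bit more, such as splitting the results into distinct non-overlapping segments and assigning surrogate keys), but it captures the underlying idea.
+
+```sql
+-- Conceptual sketch only: "parent" and "child" are hypothetical tables,
+-- each with valid_from_at / valid_to_at columns (null = still valid).
+select
+    parent.id as parent_id,
+    child.id as child_id,
+    -- the overlapping portion of the two validity windows
+    greatest(parent.valid_from_at, child.valid_from_at) as valid_from_at,
+    least(
+        coalesce(parent.valid_to_at, '9999-12-31'::timestamp),
+        coalesce(child.valid_to_at, '9999-12-31'::timestamp)
+    ) as valid_to_at
+from parent
+inner join child
+    on child.parent_id = parent.id
+    -- keep only pairs whose validity windows overlap
+    and child.valid_from_at < coalesce(parent.valid_to_at, '9999-12-31'::timestamp)
+    and coalesce(child.valid_to_at, '9999-12-31'::timestamp) > parent.valid_from_at
+```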
+ +```sql +-- Start with all of the assemblies at the top (hierarchy depth = 0) +with l0_assemblies as ( + select + top_assembly_id, + component_id, + -- Prep fields required for temporal range join + {{ dbt_utils.surrogate_key(['component_id', 'valid_from_at']) }} as dbt_scd_id, + valid_from_at as dbt_valid_from, + valid_to_at as dbt_valid_to + from + component_traversal + where + component_hierarchy_depth = 0 +), + +components as ( + select + top_assembly_id, + component_hierarchy_depth, + component_trace, + assembly_id, + component_id, + installed_at, + removed_at, + -- Prep fields required for temporal range join + {{ dbt_utils.surrogate_key(['component_trace', 'valid_from_at'])}} as dbt_scd_id, + valid_from_at as dbt_valid_from, + valid_to_at as dbt_valid_to + from + component_traversal +), + +-- Perform temporal range join +{{ + trange_join( + left_model='l0_assemblies', + left_fields=[ + 'top_assembly_id', + ], + left_primary_key='top_assembly_id', + right_models={ + 'components': { + 'fields': [ + 'component_hierarchy_depth', + 'component_trace', + 'assembly_id', + 'component_id', + 'installed_at', + 'removed_at', + ], + 'left_on': 'component_id', + 'right_on': 'top_assembly_id', + } + } + ) +}} + +select + surrogate_key, + top_assembly_id, + component_hierarchy_depth, + component_trace, + assembly_id, + component_id, + installed_at, + removed_at, + valid_from_at, + valid_to_at +from + trange_final +order by + top_assembly_id, + valid_from_at, + component_hierarchy_depth +``` + +## Bonus: component swap + +Before we go, let's investigate one other interesting scenario. Suppose we have two bikes, "Bike-1" and "Bike-2". While performing service, a technician notices that the color on the rim of "Bike-2" matches with the frame of "Bike-1" and vice-versa. Perhaps there was a mistake made during the initial assembly process? The technician decides to swap the wheels between the two bikes. The ERP system then shows that "Wheel-1" was removed from "Bike-1" on the service date and that "Wheel-1" was installed in "Bike-2" on the same date (similarly for "Wheel-2"). To reduce clutter below, we'll ignore Frames and Tubes. + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| | Bike-1 | 2023-01-01 | | +| Bike-1 | Wheel-1 | 2023-01-01 | 2023-06-01 | +| Wheel-1 | Rim-1 | 2023-01-01 | | +| Wheel-1 | Tire-1 | 2023-01-01 | | +| | Bike-2 | 2023-02-01 | | +| Bike-2 | Wheel-2 | 2023-02-01 | 2023-06-01 | +| Wheel-2 | Rim-2 | 2023-02-01 | | +| Wheel-2 | Tire-2 | 2023-02-01 | | +| Bike-2 | Wheel-1 | 2023-06-01 | | +| Bike-1 | Wheel-2 | 2023-06-01 | | + +When this ERP data gets converted into the multivalued dimension, we get the table below. In the ERP data, only one kind of component assembly, the wheel, was removed/installed, but in the dimensional model all of the child components come along for the ride. In the table below, we see that "Bike-1" and "Bike-2" each have two distinct ranges of valid time, one prior to the wheel swap, and one after. 
+ +**`mdim_components`:** + +| `component_sk` | `top_assembly_id` | `assembly_id` | `component_id` | `valid_from_at` | `valid_to_at` | +| - | - | - | - | - | - | +| sk1 | Bike-1 | | Bike-1 | 2023-01-01 | 2023-06-01 | +| sk1 | Bike-1 | Bike-1 | Wheel-1 | 2023-01-01 | 2023-06-01 | +| sk1 | Bike-1 | Wheel-1 | Rim-1 | 2023-01-01 | 2023-06-01 | +| sk1 | Bike-1 | Wheel-1 | Tire-1 | 2023-01-01 | 2023-06-01 | +| sk2 | Bike-1 | | Bike-1 | 2023-06-01 | | +| sk2 | Bike-1 | Bike-1 | Wheel-2 | 2023-06-01 | | +| sk2 | Bike-1 | Wheel-2 | Rim-2 | 2023-06-01 | | +| sk2 | Bike-1 | Wheel-2 | Tire-2 | 2023-06-01 | | +| sk3 | Bike-2 | | Bike-2 | 2023-02-01 | 2023-06-01 | +| sk3 | Bike-2 | Bike-2 | Wheel-2 | 2023-02-01 | 2023-06-01 | +| sk3 | Bike-2 | Wheel-2 | Rim-2 | 2023-02-01 | 2023-06-01 | +| sk3 | Bike-2 | Wheel-2 | Tire-2 | 2023-02-01 | 2023-06-01 | +| sk4 | Bike-2 | | Bike-2 | 2023-06-01 | | +| sk4 | Bike-2 | Bike-2 | Wheel-1 | 2023-06-01 | | +| sk4 | Bike-2 | Wheel-1 | Rim-1 | 2023-06-01 | | +| sk4 | Bike-2 | Wheel-1 | Tire-1 | 2023-06-01 | | + +## Summary + +In this article, we've explored a strategy for creating a dimensional model for ragged time-varying hierarchies. We used a simple toy system involving one or two eBikes. In the real world, there would be many more individual products, deeper hierarchies, more component attributes, and the install/removal dates would likely be captured with a timestamp component as well. The model described here works very well even in these messier real world cases. + +If you have any questions or comments, please reach out to me by commenting on this post or contacting me on dbt slack (@Sterling Paramore). diff --git a/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md b/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md index e1351034f66..2a4879ac98d 100644 --- a/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md +++ b/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md @@ -97,7 +97,7 @@ dbt Cloud includes **built-in Git** with accessible features directly from its I The biggest boon to Data Vault developer productivity in dbt Cloud are the **DataOps** and **Data Warehouse Automation** features of dbt Cloud. Each Data Vault developer gets their own development environment to work in and there is no complicated set up process to go through. -Commit your work, create a pull request, and have automated code review enabled by dbt Cloud [**jobs**](https://docs.getdbt.com/docs/deploy/dbt-cloud-job) that can be defined for each environment separately (e.g., testing, QA, production). Together with dbt [**tags**](https://docs.getdbt.com/reference/resource-configs/tags), the feature allows you to orchestrate your project in an efficient and powerful way. +Commit your work, create a pull request, and have automated code review enabled by dbt Cloud [**jobs**](https://docs.getdbt.com/docs/deploy/jobs) that can be defined for each environment separately (e.g., testing, QA, production). Together with dbt [**tags**](https://docs.getdbt.com/reference/resource-configs/tags), the feature allows you to orchestrate your project in an efficient and powerful way. ### Auditable data @@ -115,7 +115,9 @@ In terms of the implementation of the Data Vault itself, we recommend familiariz ### AutomateDV (formerly known as dbtvault) -AutomateDV is the most popular open source Data Vault package for dbt, with some users having over 5000 Data Vault components in their project. 
Here in Infinite Lambda, we’ve been using this package for quite some time now, even building on top of it (depending on the specifics of the project). This mature system provides a great way to start your Data Vault with dbt Cloud journey as the learning curve is quite manageable, it is well documented and even comes with tutorials and working examples built on top of Snowflake’s TPCH standard dataset. There is one limitation to using the package and that is _AutomateDV _expects your source data to contain only one delta load. In order to work around this issue, owners of the package came up with custom dbt materializations to help you with the initial load of your system, however, the performance of such load is in our experience not acceptable. +AutomateDV is the most popular open source Data Vault package for dbt, with some users having over 5000 Data Vault components in their project. Here in Infinite Lambda, we’ve been using this package for quite some time now, even building on top of it (depending on the specifics of the project). This mature system provides a great way to start your Data Vault with dbt Cloud journey as the learning curve is quite manageable, it is well documented and even comes with tutorials and working examples built on top of Snowflake’s TPCH standard dataset. There is one limitation to using the package and that is _AutomateDV_ expects your source data to contain only one delta load. In order to work around this issue, owners of the package came up with custom dbt materializations to help you with the initial load of your system, however, the performance of such load is in our experience not acceptable. + +_(Editor's note: As of AutomateDV v0.10.0, this performance issue has been resolved and users may use the standard incremental configuration.)_ ### datavault4dbt diff --git a/website/blog/2023-07-17-GPT-and-dbt-test.md b/website/blog/2023-07-17-GPT-and-dbt-test.md new file mode 100644 index 00000000000..84f756919a5 --- /dev/null +++ b/website/blog/2023-07-17-GPT-and-dbt-test.md @@ -0,0 +1,213 @@ +--- +title: "Create dbt Documentation and Tests 10x faster with ChatGPT" +description: "You can use ChatGPT to infer the context of verbosely named fields from database table schemas." +slug: create-dbt-documentation-10x-faster-with-ChatGPT + +authors: [pedro_brito_de_sa] + +tags: [analytics craft, data ecosystem] +hide_table_of_contents: true + +date: 2023-07-18 +is_featured: true +--- + +Whether you are creating your pipelines into dbt for the first time or just adding a new model once in a while, **good documentation and testing should always be a priority** for you and your team. Why do we avoid it like the plague then? Because it’s a hassle having to write down each individual field, its description in layman terms and figure out what tests should be performed to ensure the data is fine and dandy. How can we make this process faster and less painful? + +By now, everyone knows the wonders of the GPT models for code generation and pair programming so this shouldn’t come as a surprise. But **ChatGPT really shines** at inferring the context of verbosely named fields from database table schemas. So in this post I am going to help you 10x your documentation and testing speed by using ChatGPT to do most of the leg work for you. + + + +As a one-person Analytics team at [Sage](http://www.hellosage.com/) I had to create our dbt pipelines from the ground up. 
This meant 30+ tables of internal facts and dimensions + external data into a Staging Layer, plus all of the following layers of augmented models and Mart tables. After the fact, we are talking about 3500+ lines of YAML that I was NOT excited to get started on. Fortunately for me, this was February 2023 and ChatGPT had just come out. And boy, was I glad to have it. After a good dose of “prompt engineering” I managed to get most of my documentation and tests written out, only needing a few extra tweaks. + +Writing this article as of July 2023, and now powered by GPT-4 and not GPT 3.5, it is already easier to get the same results I did, so here are my learnings that I hope everyone can replicate. + +## Use verbose tables with verbose fields + +ChatGPT can only infer so much, so tables with names and fields that resemble encryption keys are unlikely to be good for this approach. In this example we are going to use this table: + +```sql +create or replace TRANSIENT TABLE STAGING.BASE.STG_STAFF_MEMBER ( + ID NUMBER(38,0), + CREATEDATETIME TIMESTAMP_NTZ(9), + UPDATEDATETIME TIMESTAMP_NTZ(9), + VERSION NUMBER(38,0), + FIRSTNAME VARCHAR(16777216), + JOBTITLE VARCHAR(16777216), + LASTNAME VARCHAR(16777216), + MIDDLENAME VARCHAR(16777216), + ISCAREADMIN BOOLEAN, + ISARCHIVED BOOLEAN, + ADDRESSID VARCHAR(16777216), + ENTERPRISEID VARCHAR(16777216), + ISDELETED BOOLEAN +); + +``` + +--- + +I copied this definition from Snowflake, and I would recommend that you always include both the name and type of the fields to get better results. This is especially helpful if you want to make sure ChatGPT correctly distinguishes between text, number, boolean or array fields, which may not be clear from the name alone. + +## Start out with simple prompts + +We all know how ChatGPT can digest very complex prompts, but as this is a tool for you to get the best results for your use case, I would recommend starting with simple, but specific, prompts. + +Opening ChatGPT with GPT4, my first prompt is usually along these lines: + + + +And the output of this simple prompt is nothing short of amazing: + +```yaml +version: 2 + +models: + - name: STAFF_MEMBER + description: This table contains information about the staff members. + columns: + - name: ID + description: The unique identifier for the staff member. + tests: + - unique + - not_null + - name: CREATEDATETIME + description: The timestamp when the record was created. + tests: + - not_null + - name: UPDATEDATETIME + description: The timestamp when the record was last updated. + - name: VERSION + description: Version number of the record. + - name: FIRSTNAME + description: The first name of the staff member. + tests: + - not_null + - name: JOBTITLE + description: The job title of the staff member. + tests: + - not_null + - name: LASTNAME + description: The last name of the staff member. + tests: + - not_null + - name: MIDDLENAME + description: The middle name of the staff member. + - name: ISCARADMIN + description: Boolean value indicating if the staff member is a care administrator. + tests: + - accepted_values: + values: ['true', 'false'] + - name: ISARCHIVED + description: Boolean value indicating if the staff member record is archived. + tests: + - accepted_values: + values: ['true', 'false'] + - name: COMMUNITYID + description: Identifier for the community of the staff member. + - name: ENTERPRISEID + description: Identifier for the enterprise of the staff member. + - name: ISDELETED + description: Boolean value indicating if the staff member record is deleted. 
+ tests: + - accepted_values: + values: ['true', 'false'] +``` + +Back in my day (5 months ago), ChatGPT with GPT 3.5 didn’t have much context on what a dbt-style YAML file was, so the result was a funky, non-dbt YAML format. If you are using GPT 3.5 and the results you get from the first prompt don’t compile in dbt, I would recommend an extra step I took which is to pass your understanding of what a dbt-style YAML file is. You can do this simply by copy-pasting a piece of a publicly available YAML file like [this one from the Jaffle Shop example](https://github.com/dbt-labs/jaffle_shop/blob/main/models/schema.yml), or a piece of YAML from your own code base. + +A prompt for it would look something like: + + + +## Specify details on generic tests in your prompts + +dbt has four built-in generic tests: unique, not_null, accepted_values and relationships. My approach to adding details on these to the prompts is one of ‘more is better’, as in it’s faster to just delete tests that I don’t need in certain fields than having to add them manually. + +ChatGPT will throw unique and not_null tests for mostly all columns that have ‘id’ in their name, and in my experience it adds them to a few more columns than it should, which is usually quick to fix. + +Accepted_values and relationships are slightly trickier but the model can be adjusted to account for these in fairly simple ways. My tips are: + +- accepted_values: add extra detail to the schema like an extra keyword like ‘CATEGORICAL’ after the data type +- relationships: bind the relationships to fields with specific names. For better results, pass on a list of valid table names from your schema that it can refer to. + +One way of doing this is with a prompt like this: + + + +Which results in the following output: + +```yaml +version: 2 + +models: + - name: STAFF_MEMBER + description: This table contains information about the staff members. + columns: + - name: ID + description: The unique identifier for the staff member. + tests: + - unique + - not_null + - name: CREATEDATETIME + description: The timestamp when the record was created. + tests: + - not_null + - name: UPDATEDATETIME + description: The timestamp when the record was last updated. + - name: VERSION + description: Version number of the record. + - name: FIRSTNAME + description: The first name of the staff member. + tests: + - not_null + - name: JOBTITLE + description: The job title of the staff member. This is a categorical field. + tests: + - not_null + - accepted_values: + values: ['Job Title 1', 'Job Title 2', 'Job Title 3'] # replace these with actual job titles + - name: LASTNAME + description: The last name of the staff member. + tests: + - not_null + - name: MIDDLENAME + description: The middle name of the staff member. + - name: ISCARADMIN + description: Boolean value indicating if the staff member is a care administrator. + tests: + - accepted_values: + values: ['true', 'false'] + - name: ISARCHIVED + description: Boolean value indicating if the staff member record is archived. + tests: + - accepted_values: + values: ['true', 'false'] + - name: COMMUNITYID + description: Identifier for the community of the staff member. + tests: + - relationships: + to: STAGING.COMMUNITY.ID # replace with actual reference table + field: ID + - name: ENTERPRISEID + description: Identifier for the enterprise of the staff member. 
+ tests: + - relationships: + to: STAGING.ENTERPRISE.ID # replace with actual reference table + field: ID + - name: ISDELETED + description: Boolean value indicating if the staff member record is deleted. + tests: + - accepted_values: + values: ['true', 'false'] +``` + +Even though it doesn’t magically guess the categories of your variables or your relationships, it is definitely helpful to have some placeholders in the right places. + +As an add-on, giving the model a short description of the data models and the tables you are working with will help it fine tune your results. + +## Wrap-Up + +Creating documentation is still a very manual job, and this approach only works for one table at a time (maybe you can be the one leveraging the OpenAI API and creating a webapp that processes multiple tables at once?). However, ChatGPT can clearly cut a lot of time in these tasks. + +I hope that these simple tips help you be more motivated and efficient in creating documentation and tests for your data models. And remember: verbosity in - verbosity out! diff --git a/website/blog/2023-08-01-announcing-materialized-views.md b/website/blog/2023-08-01-announcing-materialized-views.md new file mode 100644 index 00000000000..3917e3f192c --- /dev/null +++ b/website/blog/2023-08-01-announcing-materialized-views.md @@ -0,0 +1,213 @@ +--- +title: "Optimizing Materialized Views with dbt" +description: "In dbt v1.6, we introduce support for materialized views. In this blog post, Amy will review how to use them in your workflow" +slug: announcing-materialized-views + +authors: [amy_chen] + +tags: [analytics craft, dbt product updates, data ecosystem] +hide_table_of_contents: false + +date: 2023-08-03 +is_featured: true +--- + +## Introduction + +The year was 2020. I was a kitten-only household, and dbt Labs was still Fishtown Analytics. A enterprise customer I was working with, Jetblue, asked me for help running their dbt models every 2 minutes to meet a 5 minute SLA. + +After getting over the initial terror, we talked through the use case and soon realized there was a better option. Together with my team, I created [lambda views](https://discourse.getdbt.com/t/how-to-create-near-real-time-models-with-just-dbt-sql/1457%20?) to meet the need. + +Flash forward to 2023. I’m writing this as my giant dog snores next to me (don’t worry the cats have multiplied as well). Jetblue has outgrown lambda views due to performance constraints (a view can only be so performant) and we are at another milestone in dbt’s journey to support streaming. What. a. time. + +Today we are announcing that we now support Materialized Views in dbt. So, what does that mean? + + + +Materialized views are now an out of the box materialization in your dbt project once you upgrade to the latest version of dbt v1.6 on these following adapters: + +- dbt-postgres +- dbt-redshift +- dbt-snowflake +- dbt-databricks +- dbt-materialize* +- dbt-trino* +- dbt-bigquery** + +*These adapters have supported materialized views in their adapter prior 1.6. +**dbt-bigquery support will be coming in 1.7. + +Just like you would materialize your sql model as  `table` or `view`  today, you can use `materialized_view` in your model configuration, dbt_project.yml, and resources.yml files. At release, python models will not be supported. 
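+
+If you would rather configure an entire folder of models at once, the project-level equivalent is a minimal sketch like the one below (the project and folder names are hypothetical; on Snowflake you would use `dynamic_table` instead, as shown in the model-level examples that follow):
+
+```yaml
+# dbt_project.yml -- illustrative sketch only
+models:
+  my_project:
+    near_real_time:
+      +materialized: materialized_view
+```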
+ + + +For Postgres/Redshift/Databricks + +```sql +{{ +config( + materialized = 'materialized_view', +) +}} + +``` + +For Snowflake: +```sql +{{ +config( + materialized = 'dynamic_table', +) +}} +``` + +:::note +We are only supporting dynamic tables on Snowflake, not Snowflake’s materialized views (for a comparison between Snowflake Dynamic Tables and Materialized Views, refer [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-comparison#dynamic-tables-compared-to-materialized-views). Dynamic tables are better suited for continuous transformations due to functionality like the ability to join, union, and aggregate on base tables, views , and other dynamic tables. Due to those features, they are also more aligned with what other data platforms are calling Materialized Views. For the sake of simplicity, when I refer to materialized views in this blog, I mean dynamic tables in Snowflake. +::: + +Now that we support materialized views: how do you fit them into your dbt workflow? It’s easy to imagine a world of unregulated computation because you didn’t put in proper guardrails and now you have materialized views running rampant unbeknownst to you in your data platform. + +Materialized views, just like any other materialization, fit a need and you should utilize them while taking into consideration the additional complexity they will add to your project. They are a tool in your analytics engineering toolbox, one of many. + +In this blog, we will go over when to pull this tool out of your toolbox, how to wield it successfully, and how to promote materialized views with governance in mind. Now this is a new functionality and I expect this to be the first of many posts to come, defining our best practices (or even redefining them). Also I will not be discussing dbt’s interactions upstream from the data platform like how to manage your Kafka topics using dbt, but would highly recommend [this post from Charlie Summers](https://docs.getdbt.com/blog/demystifying-event-streams) if that’s something you’re interested in. + +Additionally, if you want to get a more detailed understanding of your data platform’s support of materialized views, I recommend checking out dbt’s and your data platform’s documentation site. This blog post is intended to be a high level, platform agnostic overview to get you started. + +## What are Materialized Views? + +Starting out with, **what are materialized views (MVs)?** While specific features will vary by data platform, materialized views at their core are database objects that have stored the results of a query as a physically materialized table. What makes them distinct from a regular table is that the data in a materialized view is periodically refreshed to reflect the latest changes in the underlying table. Because they’re precomputed and the results are stored, you have faster query times when accessing them because you aren’t recomputing the data from scratch. This is great when you have low latency requirements for your data pipelines. + +Now you might have noticed that MVs sound a lot like incremental models, and you are not wrong! It can be worthwhile to think of materialized views as a successor of sorts to incremental models. In fact, depending on your needs and data platform of choice, you might wish to replace all of your incremental dbt models with materialized view models. By doing this, you will no longer have to manually craft specific incremental strategies, detailing how dbt should update the underlying table. Awesome, right? 
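+
+To make that concrete, here is a rough sketch of what the swap can look like. The model and column names below are made up, but the pattern is the familiar one: the incremental version spells out how to find new rows, while the materialized view version leaves the refresh logic to the platform.
+
+```sql
+-- a hypothetical incremental model: you define what counts as "new" rows
+{{ config(
+    materialized = 'incremental',
+    unique_key = 'event_id'
+) }}
+
+select * from {{ ref('stg_events') }}
+{% if is_incremental() %}
+where event_loaded_at > (select max(event_loaded_at) from {{ this }})
+{% endif %}
+```
+
+```sql
+-- the same model as a materialized view: the platform handles the refresh
+{{ config(
+    materialized = 'materialized_view'
+) }}
+
+select * from {{ ref('stg_events') }}
+```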
+ +The tradeoff (outside of any data platform specific ones) is that you will have less fine-grained control over the incremental logic and orchestration. This is because you are handing defining the logic of what and how to update the existing table over to the data platform to perform for you. + +Other factors to consider when deciding on when/how to use a materialized view: +- What are the costs associated with running the materialized view versus a batched incremental model? (this will vary depending on your data platform as some will require different compute nodes) +- Does your data platform support joins, aggregations, and window functions on MVs if you need them? +- What are the latency needs of your development environment? In production? (If not near real time, you can make the choice between a batch incremental model or a MV with a longer refresh schedule.) +- How often do your upstream dependencies update? If your answer is `not frequent`, you may not need a MV. +- How large is your dataset?(It might be cheaper to use MVs for extremely large datasets) +- How often do you need your query refreshed? What are your downstream dependencies and their stakeholders? (If near real time is important, MVs might be the right choice). +- Do you have real time machine learning models training or applications using your transformed dataset? + +## Materialized Views in the dbt Workflow + +### Development + +When we talk about using materialized views in development, the question to think about is not so much “should you execute your dbt models as materialized views in your sandbox?,” but rather “should you schedule them to refresh in your sandbox?”. For development, you do need to create them and test them out in your sandbox but how do you do this in a way that doesn’t drive up your cloud bill unnecessarily? Or keeping a post-it note on your laptop as a reminder to drop all of the running materialized views in your sandbox before you sign off? Let’s talk about it! + +Outside of the scheduling part, development will be pretty standard. Your pipeline is likely going to look something like this: + + + +This is assuming you have a near real time pipeline where you are pulling from a streaming data source like a Kafka Topic via an ingestion tool of your choice like Snowpipe for Streaming into your data platform. After your data is in the data platform, you will: + +1. Create the dbt model with the SQL transformation logic that you need. +2. Look at the logic and answer these questions: + 1. Does my data platform support the functionality I need in materialized views? + 2. How often do you need the data refreshed? Do you need any flexibility in that? + 3. How am I promoting this into production? Either you will run the transformation logic in the production environment (recommended) and create a separate object or promote the object created from development. + + +Depending on your answer, this will decide if you want a materialized view in the first place (versus a view, table, or incremental model). If you have decided on a materialized view as meeting your needs, by default do not schedule a refresh. You can run manual refreshes as needed. Why’s that? In your development environment, you are likely validating three things: the dependencies, the SQL logic, and the transformation output. All of those can be tested by creating a materialized view without scheduling and running manually refreshes. 
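+
+To make step 1 concrete, the model itself usually looks like any other dbt model, for example a light transformation over the table your ingestion tool lands data into. The source and column names below are hypothetical:
+
+```sql
+-- models/staging/stg_orders_stream.sql (illustrative example)
+select
+    order_id,
+    customer_id,
+    order_status,
+    ordered_at
+from {{ source('kafka_landing', 'orders_raw') }}
+```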
+
+Your configuration during development:
+
+For Postgres:
+
+Every time you execute `dbt run`, it results in a manual refresh, unless you set `on_configuration_change` to `continue`, which skips running the model.
+
+```sql
+{{
+config(
+    materialized = 'materialized_view',
+    on_configuration_change = 'apply',
+)
+}}
+```
+
+For Redshift:
+
+```sql
+{{
+config(
+    materialized = 'materialized_view',
+    on_configuration_change = 'apply',
+    auto_refresh = False
+)
+}}
+```
+
+For Databricks:
+
+```sql
+{{
+config(
+    materialized='materialized_view',
+)
+}}
+```
+
+By default, materialized views are not refreshed on a schedule on Databricks in this materialization. To set up scheduling, you can use a post-hook to alter the MV with a cron schedule that will run in Databricks Workflows. That could look something like this:
+
+```sql
+post_hook = 'ALTER MATERIALIZED VIEW {{this}} ADD SCHEDULE CRON "0 0 0 * * ? *" AT TIME ZONE "America/Los_Angeles";'
+```
+
+For Snowflake:
+
+```sql
+{{
+config(
+    materialized = 'dynamic_table',
+    snowflake_warehouse = '',
+    target_lag = '',
+    on_configuration_change = 'apply',
+)
+}}
+```
+
+If you do need to build out your development pipeline more fully (making sure scheduled refreshes and syncs actually happen), you can schedule them, but make sure to drop the materialized views when you are done with them. I encourage you to invest in an operations macro (like the one sketched above) that drops all MVs in your sandbox schema, and run it as needed. You could even create a dbt Cloud job to manage that. This way, you don't have any stray MVs running in your sandbox, consuming credits unnecessarily.
+
+### Testing
+
+Now let's dive into the second question: how do you test? In development and QA, this will look the same as any batch-run tests: you can run `dbt build` or `dbt test` and have the tests run after execution as validation. But in production, what can you do to test continually? Your options are:
+
+- Continue to do batch testing while we wait for [materialized tests](https://github.com/dbt-labs/dbt-core/issues/6914)
+- Override the `--store-failures` behavior, as Materialize has done [here](https://materialize.com/blog/real-time-data-quality-tests-using-dbt-and-materialize/) for their adapter, to materialize failed rows as a materialized view. This is not a great long-term solution, but if you urgently need to put this into production, it is an option.
+
+To promote materialized views into production, the process will look very much like it did with your incremental models. Using Slim CI, you can build new MVs in your QA environment and, for existing MVs without changes, skip them and defer to the production objects.
+
+### Production
+
+When you are satisfied with your development and testing, for data platforms that offer scheduling via our dbt configurations, you have two options: hardcode the refresh cadence, or write conditional logic that sets the refresh cadence based on the environment. I recommend using the latter.
+ +The code for having a conditional in your config block looks like this if you want to include in a macro for either the lag or other fields (snowflake_warehouse, auto_refresh,etc): + +```sql +{% macro target_lag_environment() %} +{% set lag = '1 minute' if target.name == "prod" else '35 days' %} +{{ return(lag) }} +{% endmacro %} +``` + +```sql +{{ +config( + materialized = 'dynamic_table', + snowflake_warehouse = 'transforming', + target_lag = target_lag_environment(), + on_configuration_change = 'apply', +) +}} +``` + +You will want a very long lag for development; I recommend the cadence you drop and refresh your development environment. Here I just chose my two favorite numbers. + +For orchestration, if your materialized views aren’t able to auto refresh, you can use dbt cloud to orchestrate your refreshes. The beauty of materialized views is that dbt will be able to provide the dependency/testing/documentation but also skip or rerun the models as configured, enabling you to version control your logic. Reasonable guardrails for the modern data stack. ✨ + +Depending on how you orchestrate your materialized views, you can either run the testing in production as part of a scheduled job (with dbt test or dbt build). + +## Conclusion + +Well, I’m excited for everyone to remove the lines in your packages.yml that installed your experimental package (at least if you’re using it for MVs) and start to get your hands dirty. We are still new in our journey and I look forward to hearing all the things you are creating and how we can better our best practices in this. \ No newline at end of file diff --git a/website/blog/authors.yml b/website/blog/authors.yml index 72e747cc577..2e554ffc814 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,6 +1,6 @@ amy_chen: image_url: /img/blog/authors/achen.png - job_title: Senior Partner Engineer + job_title: Staff Partner Engineer links: - icon: fa-linkedin url: https://www.linkedin.com/in/yuanamychen/ @@ -186,7 +186,7 @@ emily_riederer: url: https://twitter.com/emilyriederer - icon: fa-readme url: https://emilyriederer.com - + grace_goheen: image_url: /img/blog/authors/grace-goheen.jpeg job_title: Analytics Engineer @@ -373,6 +373,16 @@ pat_kearns: name: Pat Kearns organization: dbt Labs +pedro_brito_de_sa: + image_url: /img/blog/authors/pedro_brito.jpeg + job_title: Product Analyst + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/pbritosa/ + name: Pedro Brito de Sa + organization: Sage + + rastislav_zdechovan: image_url: /img/blog/authors/rastislav-zdechovan.png job_title: Analytics Engineer @@ -451,6 +461,17 @@ simon_podhajsky: name: Simon Podhajsky organization: iLife Technologies +sterling_paramore: + description: | + Sterling Paramore started his career in theoretical and computation biophysics and learned that working with data was way more fun than being locked in the ivory tower. He loves solving data engineering and data analytics problems and has been a long time evangelist for dbt. 
+ image_url: /img/blog/authors/sterling-paramore.png + job_title: Sr Staff Data Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/sterlingparamore/ + name: Sterling Paramore + organization: Mainspring Energy + sung_chung: image_url: /img/blog/authors/sung.jpeg job_title: Solutions Architect diff --git a/website/blog/categories.yml b/website/blog/categories.yml index 2a45e6529e2..8103f58cc33 100644 --- a/website/blog/categories.yml +++ b/website/blog/categories.yml @@ -15,10 +15,6 @@ display_title: dbt tutorials description: Best practices in the usage of our favorite data transformation tool. is_featured: true -- name: dbt updates - display_title: dbt product updates - description: An archive of monthly product updates from the dbt Labs team. - is_featured: true - name: SQL magic display_title: SQL magic description: Stories of dbt developers making SQL sing across warehouses. diff --git a/website/blog/2021-11-23-on-the-importance-of-naming.md b/website/blog/src.md similarity index 100% rename from website/blog/2021-11-23-on-the-importance-of-naming.md rename to website/blog/src.md diff --git a/website/dbt-versions.js b/website/dbt-versions.js index 01d1bf5d128..3eff99e7f98 100644 --- a/website/dbt-versions.js +++ b/website/dbt-versions.js @@ -1,8 +1,12 @@ exports.versions = [ + { + version: "1.7", + EOLDate: "2024-07-31", + isPrerelease: "true" + }, { version: "1.6", EOLDate: "2024-07-31", - isPrerelease: true }, { version: "1.5", @@ -20,17 +24,61 @@ exports.versions = [ version: "1.2", EOLDate: "2023-07-26", }, +] + +exports.versionedPages = [ { - version: "1.1", - EOLDate: "2023-04-28", + "page": "docs/build/build-metrics-intro", + "firstVersion": "1.6", }, { - version: "1.0", - EOLDate: "2022-12-03" + "page": "docs/build/sl-getting-started", + "firstVersion": "1.6", + }, + { + "page": "docs/build/about-metricflow", + "firstVersion": "1.6", + }, + { + "page": "docs/build/join-logic", + "firstVersion": "1.6", + }, + { + "page": "docs/build/validation", + "firstVersion": "1.6", + }, + { + "page": "docs/build/semantic-models", + "firstVersion": "1.6", + }, + { + "page": "docs/build/group-by", + "firstVersion": "1.6", + }, + { + "page": "docs/build/entities", + "firstVersion": "1.6", + }, + { + "page": "docs/build/metrics-overview", + "firstVersion": "1.6", + }, + { + "page": "docs/build/cumulative", + "firstVersion": "1.6", + }, + { + "page": "docs/build/derived", + "firstVersion": "1.6", + }, + { + "page": "docs/build/measure-proxy", + "firstVersion": "1.6", + }, + { + "page": "docs/build/ratio", + "firstVersion": "1.6", }, -] - -exports.versionedPages = [ { "page": "reference/commands/clone", "firstVersion": "1.6", @@ -122,71 +170,7 @@ exports.versionedPages = [ { "page": "reference/resource-configs/grants", "firstVersion": "1.2", - }, - { - "page": "docs/contributing/testing-a-new-adapter", - "firstVersion": "1.1", - }, - { - "page": "reference/dbt-jinja-functions/selected_resources", - "firstVersion": "1.1", - }, - { - "page": "reference/dbt-jinja-functions/print", - "firstVersion": "1.1", - }, - { - "page": "docs/build/build-metrics-intro", - "firstVersion": "1.6", - }, - { - "page": "docs/build/sl-getting-started", - "firstVersion": "1.6", - }, - { - "page": "docs/build/about-metricflow", - "firstVersion": "1.6", - }, - { - "page": "docs/build/join-logic", - "firstVersion": "1.6", - }, - { - "page": "docs/build/validation", - "firstVersion": "1.6", - }, - { - "page": "docs/build/semantic-models", - "firstVersion": "1.6", - }, - { - "page": 
"docs/build/group-by", - "firstVersion": "1.6", - }, - { - "page": "docs/build/entities", - "firstVersion": "1.6", - }, - { - "page": "docs/build/metrics-overview", - "firstVersion": "1.6", - }, - { - "page": "docs/build/cumulative", - "firstVersion": "1.6", - }, - { - "page": "docs/build/derived", - "firstVersion": "1.6", - }, - { - "page": "docs/build/measure-proxy", - "firstVersion": "1.6", - }, - { - "page": "docs/build/ratio", - "firstVersion": "1.6", - }, + } ] exports.versionedCategories = [ diff --git a/website/docs/community/resources/jobs-terms-and-conditions.md b/website/docs/community/resources/jobs-terms-and-conditions.md new file mode 100644 index 00000000000..f2f2134f847 --- /dev/null +++ b/website/docs/community/resources/jobs-terms-and-conditions.md @@ -0,0 +1,16 @@ +--- +title: "dbt Labs Community #jobs Channels Terms and Conditions" +id: "jobs-terms-and-conditions" +description: "Before posting a job in the dbt Community or submitting an application, review these terms and conditions." +--- + +I agree to abide by the [dbt Community Code of Conduct](community/resources/code-of-conduct) and all laws applicable to me in my use of the dbt Community's #jobs channels. I further agree: + +- dbt Labs is not responsible for not does it warrant or guarantee the validity, accuracy, completeness, legality, or reliability of any functionality of any #jobs channel, any posting's content, or any application and/or solicitation of any kind of employment. +- dbt Labs does not review and approve job-related content. +- dbt Labs disclaims liability of any kind whatsoever for any type of damage that occurs while using the community Slack for job-related reasons, and I waive any type of claim (including actual, special or consequential damages) to the maximum extent permitted by law. +- Without limitation, dbt Labs disclaims liability for quality, performance, merchantability, and fitness for a particular purpose, express or implied, that may arise out of my use of the community Slack for job-related content, my reliance on such information, and/or my provision/receipt of job-related information. +- I understand that no internet-based site is without risk, and my use is at my own risk. +- My use of any job-posting template (or other forum for providing job-related information) confirms my consent to provide the data posted, confirms that I have permission to post such data, and is subject to the terms of the [dbt Labs privacy policy](https://www.getdbt.com/cloud/privacy-policy). + +For further information, please contact [legal@dbtlabs.com](mailto:legal@dbtlabs.com). diff --git a/website/docs/community/resources/oss-expectations.md b/website/docs/community/resources/oss-expectations.md index 7bcc79cac9e..649a9dea94f 100644 --- a/website/docs/community/resources/oss-expectations.md +++ b/website/docs/community/resources/oss-expectations.md @@ -82,8 +82,8 @@ In some cases, the right resolution to an open issue might be tangential to the | `triage` | This is a new issue which has not yet been reviewed by a maintainer. This label is removed when a maintainer reviews and responds to the issue. | | `bug` | This issue represents a defect or regression from the behavior that's documented, or that you reasonably expect | | `enhancement` | This issue represents net-new functionality, including an extension of an existing capability | -| `good first issue` | This issue does not require deep knowledge of the codebase to implement. This issue is appropriate for a first-time contributor. 
| -| `help wanted` | This issue is trickier than a "good first issue." The required changes are scattered across the codebase, or more difficult to test. The maintainers are happy to help an experienced community contributor; they aren't planning to prioritize this issue themselves. | +| `good_first_issue` | This issue does not require deep knowledge of the codebase to implement. This issue is appropriate for a first-time contributor. | +| `help_wanted` | This issue is trickier than a "good first issue." The required changes are scattered across the codebase, or more difficult to test. The maintainers are happy to help an experienced community contributor; they aren't planning to prioritize this issue themselves. | | `duplicate` | This issue is functionally identical to another open issue. The maintainers will close this issue and encourage community members to focus conversation on the other one. | | `stale` | This is an old issue which has not recently been updated. In repositories with a lot of activity, stale issues will periodically be closed. | | `wontfix` | This issue does not require a code change in the repository, or the maintainers are unwilling to merge a change which implements the proposed behavior. | diff --git a/website/docs/community/spotlight/alan-cruickshank.md b/website/docs/community/spotlight/alan-cruickshank.md new file mode 100644 index 00000000000..74ef95a2b61 --- /dev/null +++ b/website/docs/community/spotlight/alan-cruickshank.md @@ -0,0 +1,43 @@ +--- +id: alan-cruickshank +title: Alan Cruickshank +description: | + I've been around in the dbt community, especially the London dbt Meetup, since early 2019—around the time that we started using dbt at tails.com. My background is the startup/scaleup space and building data teams in a context where there is a lot of growth going on but there isn't a lot of money around to support that. That's a topic that I've written and spoken about on several occasions on podcasts, blogposts and even at Coalesce 2020 and 2021! + + Aside from my work at tails.com, my other main focus at the moment is SQLFluff, the open source SQL linter which I started developing as part of a hackday at tails.com in late 2019 and now is the most starred SQL linter on Github with almost 1M downloads a month. +image: /img/community/spotlight/alan-cruickshank.jpg +pronouns: he/him +location: London, UK +jobTitle: Insights Director +companyName: tails.com +organization: Author & Maintainer of SQLFluff +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/amcruickshank/ + - name: SQLFluff + link: https://sqlfluff.com +dateCreated: 2023-06-30 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I [joined the community](https://www.getdbt.com/community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) in 2019 and it's been an invaluable source of advice and wisdom, especially operating on the bleeding edge of open source data tooling. It's been a place to meet like-minded people, even find new colleagues and certainly one of the places I look to when thinking about how to approach hairy data problems. + +In London it's also been one of the most vibrant meetup groups in person, compared to many others which are either very, very specialized or more focussed on larger organisations. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? 
+ +I just want to be useful 😁. I've learned a lot from the community over the years, and now I want to be able to give back to it. My primary vehicle for that is SQLFluff - both as a tool for the community to use, but also as a way of encouraging a wider group of people to feel welcome and able to contribute to open source software and build the tools of the future. + +I also see SQLFluff as a vehicle to drive more consistency in the way we write SQL, and through that drive better communication and lower the barrier for new people to enter this field and find their own success. + +## What have you learned from community members? What do you hope others can learn from you? + +For better or worse, I spend most of my day job on people and organisational things, less on how to solve individual problems, and more on how to enable and support groups of people in being able to make great decisions themselves. In some ways, if I have to touch the keyboard too much, it's a sign that I've failed in that calling. dbt itself is a tool which enables better collaboration—and the community is full of people with great ideas on how to better enable other people around us. I hope that I'm able to pass some of that knowledge and the experience of applying it in a scaleup environment back to others also treading this path. + +More specifically from the dbt community, if I were to pick one recommendation, it would be Emilie Schario’s talk from Coalesce 2022 on [“Data Led is Dumb”](https://www.youtube.com/watch?v=WsMHPALc8Vg&t=1s). I think should be essential watching for anyone who’s hearing “Data Led” a lot, and wants to turn that excitement into practical action. + +## Anything else interesting you want to tell us? + +If you're not using SQLFluff on your dbt project, you probably should be: https://github.com/sqlfluff/sqlfluff diff --git a/website/docs/community/spotlight/fabiyi-opeyemi.md b/website/docs/community/spotlight/fabiyi-opeyemi.md new file mode 100644 index 00000000000..f26ee27910b --- /dev/null +++ b/website/docs/community/spotlight/fabiyi-opeyemi.md @@ -0,0 +1,41 @@ +--- +id: fabiyi-opeyemi +title: Opeyemi Fabiyi +description: | + I'm an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. I've also got a background in financial services and supply chain. I'm passionate about helping organizations to become data-driven and I majorly use dbt for data modeling, while the other aspect of the stack is largely dependent on the client infrastructure I'm working for, so I often say I'm tool-agnostic. 😀 + + I'm the founder of Nigeria's Young Data Professional Community. I'm also the organizer of the Lagos dbt Meetup which I started, and one of the organizers of the DataFest Africa Conference. I became an active member of the dbt Community in 2021 & spoke at Coalesce 2022. +image: /img/community/spotlight/fabiyi-opeyemi.jpg +pronouns: he/him +location: Lagos, Nigeria +jobTitle: Senior Analytics Engineer +companyName: Data Culture +organization: Young Data Professionals (YDP) +socialLinks: + - name: Twitter + link: https://twitter.com/Opiano_1 + - name: LinkedIn + link: https://www.linkedin.com/in/opeyemifabiyi/ +dateCreated: 2023-07-02 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? 
+ +I joined the [dbt Slack community](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) in 2021, and it has been an experience getting to learn from thought leaders in the space and stay in touch with cutting-edge innovation in the data space. The community has helped me become a better engineer by reading different responses to questions on Slack, and seeing genuine support from community members help other members tackle and solve their difficult problems is inspiring and has allowed me to model my community (YDP & the Lagos dbt Meetup) through that lens. I randomly enter the dbt Slack daily to read and learn from different channels. I love the sense of community that resonates in the dbt Slack channel, and the good news is that I got my current role from the #jobs channel from a post from Data Culture Co-Founder. So you can stay glued to that page if you are looking for a job role. + +The dbt community greatly impacted my previous role as a one-person data team. The community became the team I didn't have, providing all the necessary support and guidance I needed to deliver great value for the company excellently, and my experience with the community was the inspiration for my Coalesce talk in 2022 on how to leverage the dbt community as a data team of one. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Many great leaders inspire me in the dbt community; Joel Labes for constantly interacting with new folks and providing that safe space for everyone to ask any question, no matter how dumb you may think your question may be. He will give a response that will solve your problem; Benn Stancil for his vast experience and how he communicates it well with humour in his Friday night Substack, a newsletter I look forward to, which helps me stay current with recent trends in the global data space. + +Both of them resonate with the kind of leader I want to grow in the dbt Community; to be vast, experienced and readily available to provide support and guidance and help people solve problems and grow their careers. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned how to show empathy as a data professional and be a great engineer from various best practices around working with data. I also want others to know that irrespective of their current level of expertise or maturity in their data career, they can make an impact by getting involved in the community and helping others grow. + +## Anything else interesting you want to tell us? + +Maybe, I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I'm a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss😃 your community engagement efforts, please feel free to reach out. diff --git a/website/docs/community/spotlight/faith-lierheimer.md b/website/docs/community/spotlight/faith-lierheimer.md new file mode 100644 index 00000000000..3edb839bb1d --- /dev/null +++ b/website/docs/community/spotlight/faith-lierheimer.md @@ -0,0 +1,47 @@ +--- +id: faith-lierheimer +title: Faith Lierheimer +description: | + I've been a dbt Community member for around a year and a half. I come to the data world from teaching and academic research. 
Working in data fuses the aspects of those careers that I like the most, which are technical problem solving, and helping non-technical audiences understand data and what they can do with it. I have a dream stack with Databricks, dbt, and Looker. + + Professionally, I help shippers of perishable goods (everything from blueberries to childhood vaccinations) understand the risks their goods face in transit and how to mitigate them.This reduces food and medical waste worldwide. + + You can read more about these interests at faithfacts.substack.com. +image: /img/community/spotlight/faith-lierheimer.jpg +pronouns: she/her +location: Denver, CO, USA +jobTitle: Data Analyst II +companyName: Parsyl +organization: Data Angels +socialLinks: + - name: Twitter + link: https://twitter.com/FaithLierheimer + - name: LinkedIn + link: https://www.linkedin.com/in/faithlierheimer/ + - name: Substack + link: https://faithfacts.substack.com/ + - name: Data Folks + link: https://data-folks.masto.host/@faithlierheimer +dateCreated: 2023-06-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community early in 2022 hoping to find technical help with dbt, and instead found a wide support network of career-minded data professionals. Being in the dbt community has helped me find my niche in the data world, and has helped me discover ways I can grow my career and technical acumen. Being in this community has been huge in easing my career transition from teaching into data. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I primarily conceptualize of leadership as raising the floor beneath everyone, rather than enabling a few to touch its vaulted ceiling. As I gain more experience, I'd be delighted to be a resource for fellow career changers and teachers in transition. + +And, I love to goof in #roast-my-graph in the dbt Slack. [Come join](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) that channel, it's a hoot and a holler. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned a lot from community members, but most notably and concretely, I've actually gotten excellent visualization advice in #roast-my-graph. I've taken graphs there several times where I felt stuck on the presentation and have learned a lot about effective vizzes from my peers there. + +As I continue to gain experience, I hope others can learn from me what a successful career change looks like. And, ultimately, to take the work seriously but to not take ourselves that seriously. + +## Anything else interesting you want to tell us? + +I have a black cat with one eye named Gus and my purpose is now to give him the best existence possible. diff --git a/website/docs/community/spotlight/jing-yu-lim.md b/website/docs/community/spotlight/jing-yu-lim.md new file mode 100644 index 00000000000..a3d1784293f --- /dev/null +++ b/website/docs/community/spotlight/jing-yu-lim.md @@ -0,0 +1,41 @@ +--- +id: jing-yu-lim +title: Jing Yu Lim +description: | + For ~3 years, I was a Product Analyst at Grab, a ride-hailing and food delivery app in Southeast Asia, before taking on an Analytics Engineering role in Spenmo, a B2B Fintech startup. I joined a tech company as an analyst in June 2023, but was recently impacted by a layoff. 
I'm also one of the co-organisers of the Singapore dbt Meetup! + + My story with dbt started in Jan 2022, when I joined Spenmo where I taught myself dbt, mainly via dbt's documentation and Slack community. We used Snowflake as our data warehouse, and Holistics for BI. I spoke about data self-serve and Spenmo's journey with dbt at multiple meetups. +image: /img/community/spotlight/jing-lim.jpg +pronouns: she/her +location: Singapore, Singapore +jobTitle: I'm open to work! +companyName: "" +organization: "" +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/limjingyu/ +dateCreated: 2023-07-01 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community late January 2022, while setting up Spenmo's first dbt project. I was completely new to dbt, and relied heavily on the #advice-dbt-help channel in dbt Slack whenever I got stuck. I have learnt so much from reading discussions in other channels as well (e.g. #leading-data-teams, #advice-mock-interviews, #db-snowflake, #tools-holistics). + +The dbt community also helped me expand my professional network, where I met so many amazing individuals! It all started with #local-singapore which was created by community member Jolanda Zwagemaker sometime in April 2022. We organised dinners to connect with one another, which eventually led to an opportunity to run Singapore dbt Meetup (HUGE thank you to dbt) - it is heartwarming to see connections forged between many attendees of the meetup, where we also learn from one another. It really does feel like a community! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Claire Carroll and Mila Page! My very first touchpoint with dbt was their articles in [The Analytics Engineering Guide](https://www.getdbt.com/analytics-engineering/). I remember relating to it so much that I was saying "YES" to every other line I read, and sending text snippets to my friends. + +To me, Analytics Engineering could help overcome certain challenges I face as an analyst, and make the job feels less like a "hamster wheel." As the concept of analytics engineering is fairly new in Singapore, I feel the need to spread the word and bring about a mindset shift among not just data teams, but anyone who needs to work with a data team. + +## What have you learned from community members? What do you hope others can learn from you? + +One of my favourite presentations from the Singapore dbt Meetup was ["How would the ideal Semantic Layer look like?"](https://docs.google.com/presentation/d/1t1ts04b7qA-BVlV3qbNZ4fI-MSZn0iL6_FhsaWhJk_0/edit?usp=sharing ) by fellow community member Thanh Dinh from Holistics. It taught me a new perspective on metrics: they could be like dbt models, where dependencies can be set up between metric models. + +I definitely have so much more to learn as an individual, but I hope to share some of my tips and lessons in terms of data modelling with others. + +## Anything else interesting you want to tell us? + +Thank you dbt for enabling us to run meetups! It has been critical for ensuring a great experience for the Singapore community. Also a huge shoutout to Amada, the Global Community Development Lead, for always being super helpful and supportive despite the 12-hour time difference! 
diff --git a/website/docs/community/spotlight/josh-devlin.md b/website/docs/community/spotlight/josh-devlin.md new file mode 100644 index 00000000000..1a1db622209 --- /dev/null +++ b/website/docs/community/spotlight/josh-devlin.md @@ -0,0 +1,39 @@ +--- +id: josh-devlin +title: Josh Devlin +description: | + After "discovering" dbt in early 2020, I joined the community and used it as a learning tool while I tried to get dbt introduced at my company. By helping others, I learned about common pitfalls, best practices, and the breadth of the tool. When it came time to implement it months later, I already felt like an expert! + + In December 2020 I attended the first virtual Coalesce conference, attending all 4 days across 3 time zones! I found my quirky-nerdy-purple-people, and felt at home. + + 3 years later I had the pleasure of presenting at my first dbt Meetup in Sydney, and then at the first in-person Coalesce in New Orleans. My passion is helping people, and I'm glad that the dbt community gives me a place to do that! +image: /img/community/spotlight/josh-devlin.jpg +pronouns: he/him +location: Melbourne, Australia (but spent most of the last decade in Houston, USA) +jobTitle: Senior Analytics Engineer +companyName: Canva +organization: "" +socialLinks: + - name: Twitter + link: https://twitter.com/JayPeeDevlin + - name: LinkedIn + link: https://www.linkedin.com/in/josh-devlin/ +dateCreated: 2023-06-27 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I have been a subscriber to 'The Data Science Roundup' (now ['The Analytics Engineering Roundup'](https://roundup.getdbt.com/)) since its inception, so I knew that dbt existed from the very beginning, since the time that dbt Labs was still called Fishtown Analytics. Despite that, I never really understood what the tool was or how it fit in until early 2020 when I first started experimenting with the tool. I immediately joined the community and found it warm and welcoming, so I started to help people where I could and never stopped! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I like to think I represent the warm, helpful vibes of the early days of the Community, where folks like Claire Carroll warmly welcomed myself and others! + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned that the more you give, the more you get. I've put hundreds of hours into helping other people in the community, but I've gotten all that back and much more. I hope I can encourage others to give of themselves and reap the rewards later! + +## Anything else interesting you want to tell us? + +In a previous life I was an orchestral musician! diff --git a/website/docs/community/spotlight/owen-prough.md b/website/docs/community/spotlight/owen-prough.md new file mode 100644 index 00000000000..cc8ce37221e --- /dev/null +++ b/website/docs/community/spotlight/owen-prough.md @@ -0,0 +1,41 @@ +--- +id: owen-prough +title: Owen Prough +description: | + Well met, data adventurer! My professional data history is mostly USA healthcare-related (shout out to ANSI X12 claim files) while working with large (10k+ employee) software companies and small (but growing!) startups. My constant companion for the last decade has been SQL of various flavors https://xkcd.com/927/, and these days I mostly work with PostgreSQL, AWS Athena, and Snowflake. 
I think SQL is a great tool to solve interesting problems. + + Oh and also dbt. I haven't done anything too fancy with dbt, but I have contributed to the dbt-athena adapter and a few different packages. Mostly I lurk on Slack, cleverly disguised as a duck. It's a professional goal of mine to someday attend Coalesce. +image: /img/community/spotlight/owen-prough.jpg +pronouns: he/him +location: Milwaukee, USA +jobTitle: Data Engineer +companyName: Sift Healthcare +organization: "" +socialLinks: + - name: LinkedIn + link: https://linkedin.com/in/owen-prough +dateCreated: 2023-06-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I committed dbt_project.yml to the company git repo in July 2021 so I've been hanging out with all of you for about 2 years. What I love the most about dbt is how easy it is to write data tests. Writing data tests without dbt was painful, but now with all the tests we have in dbt I have a dramatically improved confidence in our data quality. + +The wider dbt community is also a reliable and constant source of education. I only interact in a few Slack channels, but I read *many* Slack channels to see what others are doing in the Analytics Engineering space and to get ideas about how to improve the processes/pipelines at my company. Y'all are great. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +This is an interesting question. I think I most identify with or am inspired by [Josh Devlin](./josh-devlin), who seems to be everywhere on Slack and very knowledgeable/helpful. I also want to know things and pay it forward. + +Also shout out to [Faith Lierheimer](./faith-lierheimer), whose contributions to [#roast-my-graph](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) always make me laugh and/or weep. + +## What have you learned from community members? What do you hope others can learn from you? + +The [public documentation for dbt](https://docs.getdbt.com/docs/introduction) is quite good. You should bookmark it and make it a personal goal to read through it all. There are a lot of cool things that dbt can do. + +Also I think it's really cool to see newcomers asking questions on Slack/[Discourse](https://discourse.getdbt.com/) and then see those same people answering others' questions. It speaks to the value we all get from dbt that folks want to give back to the community. + +## Anything else interesting you want to tell us? + +Did you notice how I avoided starting a sentence with "dbt"? That's because I know the standard is lowercase, but starting a sentence with a lowercase word looks weird to my eyes. diff --git a/website/docs/docs/build/about-metricflow.md b/website/docs/docs/build/about-metricflow.md index 5a42fcd7b3e..68879911597 100644 --- a/website/docs/docs/build/about-metricflow.md +++ b/website/docs/docs/build/about-metricflow.md @@ -10,9 +10,9 @@ This guide introduces MetricFlow's fundamental ideas for new users. MetricFlow, :::info -MetricFlow is a new way to define metrics in dbt and one of the key components of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer). It handles SQL query construction and defines the specification for dbt semantic models and metrics. 
+MetricFlow is a new way to define metrics and one of the key components of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). It handles SQL query construction and defines the specification for dbt semantic models and metrics. -To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/). +MetricFlow is currently available on dbt v1.6 or higher for all users. dbt Core users can use the MetricFlow CLI to define metrics in their local dbt Core project. However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. ::: @@ -33,7 +33,9 @@ There are a few key principles: - MetricFlow, as a part of the dbt Semantic Layer, allows organizations to define company metrics logic through YAML abstractions, as described in the following sections. -- You can install MetricFlow via PyPI as an extension of your [dbt adapter](/docs/supported-data-platforms) in the CLI. To install the adapter, run `pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `pip install "dbt-metricflow[snowflake]"`. +- You can install MetricFlow using PyPI as an extension of your [dbt adapter](/docs/supported-data-platforms) in the CLI. To install the adapter, run `pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. For example, for a Snowflake adapter run `pip install "dbt-metricflow[snowflake]"`. + +- To query metrics dimensions, dimension values, and validate your configurations; install the [MetricFlow CLI](/docs/build/metricflow-cli). ### Semantic graph @@ -53,7 +55,6 @@ For a semantic model, there are three main pieces of metadata: * [Dimensions](/docs/build/dimensions) — These are the ways you want to group or slice/dice your metrics. * [Measures](/docs/build/measures) — The aggregation functions that give you a numeric result and can be used to create your metrics. - ### Metrics Metrics, which is a key concept, are functions that combine measures, constraints, or other mathematical functions to define new quantitative indicators. MetricFlow uses measures and various aggregation types, such as average, sum, and count distinct, to create metrics. Dimensions add context to metrics and without them, a metric is simply a number for all time. You can define metrics in the same YAML files as your semantic models, or create a new file. @@ -64,27 +65,24 @@ MetricFlow supports different metric types: - [Derived](/docs/build/derived) — An expression of other metrics, which allows you to do calculations on top of metrics. - [Ratio](/docs/build/ratio) — Create a ratio out of two measures, like revenue per customer. - [Simple](/docs/build/simple) — Metrics that refer directly to one measure. + ## Use case In the upcoming sections, we'll show how data practitioners currently calculate metrics and compare it to how MetricFlow makes defining metrics easier and more flexible. -The following example data schema image shows a number of different types of data tables: +The following example data is based on the Jaffle Shop repo. You can view the complete [dbt project](https://github.com/dbt-labs/jaffle-sl-template). 
The tables we're using in our example model are: -- `transactions` is a production data platform export that has been cleaned up and organized for analytical consumption -- `visits` is a raw event log -- `stores` is a cleaned-up and fully normalized dimensional table from a daily production database export -- `products` is a dimensional table that came from an external source such as a wholesale vendor of the goods this store sells. -- `customers` is a partially denormalized table in this case with a column derived from the transactions table through some upstream process +- `orders` is a production data platform export that has been cleaned up and organized for analytical consumption +- `customers` is a partially denormalized table in this case with a column derived from the orders table through some upstream process -![MetricFlow-SchemaExample](/img/docs/building-a-dbt-project/MetricFlow-SchemaExample.jpeg) + -To make this more concrete, consider the metric `revenue`, which is defined using the SQL expression: +To make this more concrete, consider the metric `order_total`, which is defined using the SQL expression: -`select sum(price * quantity) as revenue from transactions` - -This expression calculates the total revenue by multiplying the price and quantity for each transaction and then adding up all the results. In business settings, the metric `revenue` is often calculated according to different categories, such as: -- Time, for example `date_trunc(created_at, 'day')` -- Product, using `product_category` from the `product` table. +`select sum(order_total) as order_total from orders` +This expression calculates the revenue from each order by summing the order_total column in the orders table. In a business setting, the metric order_total is often calculated according to different categories, such as" +- Time, for example `date_trunc(ordered_at, 'day')` +- Order Type, using `is_food_order` dimension from the `orders` table. ### Calculate metrics @@ -93,21 +91,21 @@ Next, we'll compare how data practitioners currently calculate metrics with mult -The following example displays how data practitioners typically would calculate the revenue metric aggregated. It's also likely that analysts are asked for more details on a metric, like how much revenue came from bulk purchases. +The following example displays how data practitioners typically would calculate the `order_total` metric aggregated. It's also likely that analysts are asked for more details on a metric, like how much revenue came from new customers. Using the following query creates a situation where multiple analysts working on the same data, each using their own query method — this can lead to confusion, inconsistencies, and a headache for data management. ```sql select - date_trunc(transactions.created_at, 'day') as day - , products.category as product_category - , sum(transactions.price * transactions.quantity) as revenue + date_trunc('day',orders.ordered_at) as day, + case when customers.first_ordered_at is not null then true else false end as is_new_customer, + sum(orders.order_total) as order_total from - transactions + orders left join - products + customers on - transactions.product_id = products.product_id + orders.customer_id = customers.customer_id group by 1, 2 ``` @@ -116,126 +114,127 @@ group by 1, 2 > Introducing MetricFlow, a key component of the dbt Semantic Layer 🤩 - simplifying data collaboration and governance. 
-In the following three example tabs, use MetricFlow to define a semantic model that uses revenue as a metric and a sample schema to create consistent and accurate results — eliminating confusion, code duplication, and streamlining your workflow. +In the following three example tabs, use MetricFlow to define a semantic model that uses order_total as a metric and a sample schema to create consistent and accurate results — eliminating confusion, code duplication, and streamlining your workflow. -In this example, a measure named revenue is defined based on two columns in the `schema.transactions` table. The time dimension `ds` provides daily granularity and can be aggregated to weekly or monthly time periods. Additionally, a categorical dimension called `is_bulk_transaction` is specified using a case statement to capture bulk purchases. +In this example, a measure named `order_total` is defined based on the order_total column in the `orders` table. + +The time dimension `metric_time` provides daily granularity and can be aggregated into weekly or monthly time periods. Additionally, a categorical dimension called `is_new_customer` is specified in the `customers` semantic model. ```yaml semantic_models: - - name: transactions - description: "A record for every transaction that takes place. Carts are considered multiple transactions for each SKU." - owners: support@getdbt.com - model: (ref('transactions')) + - name: orders # The name of the semantic model + description: | + A model containing order data. The grain of the table is the order id. + model: ref('orders') #The name of the dbt model and schema defaults: - agg_time_dimension: metric_time - - # --- entities --- - entities: - - name: transaction_id + agg_time_dimension: metric_time + entities: # Entities, which usually correspond to keys in the table. + - name: order_id type: primary - - name: customer_id - type: foreign - - name: store_id + - name: customer type: foreign - - name: product_id - type: foreign - - # --- measures --- - measures: - - name: revenue - description: - expr: price * quantity - agg: sum - - name: quantity - description: Quantity of products sold - expr: quantity - agg: sum - - name: active_customers - description: A count of distinct customers completing transactions expr: customer_id - agg: count_distinct - - # --- dimensions --- - dimensions: + measures: # Measures, which are the aggregations on the columns in the table. + - name: order_total + agg: sum + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. - name: metric_time + expr: cast(ordered_at as date) type: time - expr: date_trunc('day', ts) type_params: time_granularity: day - - name: is_bulk_transaction + - name: customers # The name of the second semantic model + description: > + Customer dimension table. The grain of the table is one row per + customer. + model: ref('customers') #The name of the dbt model and schema + defaults: + agg_time_dimension: first_ordered_at + entities: # Entities, which usually correspond to keys in the table. + - name: customer + type: primary + expr: customer_id + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. 
+ - name: is_new_customer type: categorical - expr: case when quantity > 10 then true else false end + expr: case when first_ordered_at is not null then true else false end + - name: first_ordered_at + type: time + type_params: + time_granularity: day + ``` - - -Similarly, you could then add a `products` semantic model on top of the `products` model to incorporate even more dimensions to slice and dice your revenue metric. + -Notice the identifiers present in the semantic models `products` and `transactions`. MetricFlow does the heavy-lifting for you by traversing the appropriate join keys to identify the available dimensions to slice and dice your `revenue` metric. +Similarly, you could then add additional dimensions like `is_food_order` to your semantic models to incorporate even more dimensions to slice and dice your revenue order_total. ```yaml semantic_models: - - name: products - description: A record for every product available through our retail stores. - owners: support@getdbt.com - model: ref('products') - - # --- identifiers --- - entities: - - name: product_id + - name: orders + description: | + A model containing order data. The grain of the table is the order id. + model: ref('orders') #The name of the dbt model and schema + defaults: + agg_time_dimension: metric_time + entities: # Entities, which usually correspond to keys in the table + - name: order_id type: primary - - # --- dimensions --- - dimensions: - - name: category - type: categorical - - name: brand - type: categorical - - name: is_perishable + - name: customer + type: foreign + expr: customer_id + measures: # Measures, which are the aggregations on the columns in the table. + - name: order_total + agg: sum + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. + - name: metric_time + expr: cast(ordered_at as date) + type: time + type_params: + time_granularity: day + - name: is_food_order type: categorical - expr: | - category in ("vegetables", "fruits", "dairy", "deli") ``` -Imagine an even more difficult metric is needed, like the amount of money earned each day by selling perishable goods per active customer. Without MetricFlow the data practitioner's original SQL might look like this: +Imagine an even more complex metric is needed, like the amount of money earned each day from food orders from returning customers. Without MetricFlow the data practitioner's original SQL might look like this: ```sql select - date_trunc(transactions.created_at, 'day') as day - , products.category as product_category - , sum(transactions.price * transactions.quantity) as revenue - , count(distinct customer_id) as active_customers - , sum(transactions.price * transactions.quantity)/count(distinct customer_id) as perishable_revenues_per_active_customer + date_trunc('day',orders.ordered_at) as day, + sum(case when is_food_order = true then order_total else null end) as food_order, + sum(orders.order_total) as sum_order_total, + food_order/sum_order_total from - transactions + orders left join - products + customers on - transactions.product_id = products.product_id -where - products.category in ("vegetables", "fruits", "dairy", "deli") -group by 1, 2 + orders.customer_id = customers.customer_id +where + case when customers.first_ordered_at is not null then true else false end = true +group by 1 ``` MetricFlow simplifies the SQL process via metric YAML configurations as seen below. 
You can also commit them to your git repository to ensure everyone on the data and business teams can see and approve them as the true and only source of information.
+
+```yaml
+metrics:
+  - name: food_order_pct_of_order_total
+    description: The percentage of total order revenue that comes from food orders in each store
+    label: "Food % of Order Total"
+    type: ratio
+    type_params:
+      numerator: food_order
+      denominator: order_total
+    filter: |
+      {{ Dimension('customer__is_new_customer') }} = true
 ```
@@ -266,7 +265,7 @@ metrics:
How does the Semantic Layer handle joins?
-
MetricFlow builds joins based on the types of keys and parameters that are passed to entities. To better understand how joins are constructed see our documentations on join types.

Rather than capturing arbitrary join logic, MetricFlow captures the types of each identifier and then helps the user to navigate to appropriate joins. This allows us to avoid the construction of fan out and chasm joins as well as generate legible SQL.
+
MetricFlow builds joins based on the types of keys and parameters that are passed to entities. To better understand how joins are constructed, see our documentation on join types.

Rather than capturing arbitrary join logic, MetricFlow captures the types of each identifier and then helps the user navigate to appropriate joins. This allows us to avoid constructing fan-out and chasm joins and to generate legible SQL.
diff --git a/website/docs/docs/build/build-metrics-intro.md b/website/docs/docs/build/build-metrics-intro.md index e98ee013d0b..a6fab61d576 100644 --- a/website/docs/docs/build/build-metrics-intro.md +++ b/website/docs/docs/build/build-metrics-intro.md @@ -7,49 +7,61 @@ tags: [Metrics, Semantic Layer, Governance] hide_table_of_contents: true --- -Use MetricFlow in dbt to centrally define your metrics. MetricFlow is a key component of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer) and is responsible for SQL query construction and defining specifications for dbt semantic models and metrics. +Use MetricFlow in dbt to centrally define your metrics. As a key component of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), MetricFlow is responsible for SQL query construction and defining specifications for dbt semantic models and metrics. Use familiar constructs like semantic models and metrics to avoid duplicative coding, optimize your development workflow, ensure data governance for company metrics, and guarantee consistency for data consumers. :::info -MetricFlow is currently available on dbt Core v1.6 beta for [command line (CLI)](/docs/core/about-the-cli) users, with support for dbt Cloud and integrations coming soon. MetricFlow, a BSL package (code is source available), is a new way to define metrics in dbt and will replace the dbt_metrics package. +MetricFlow is currently available on dbt v1.6 or higher and allows users to define metrics in their dbt project whether in dbt Cloud or dbt Core. dbt Core users can use the MetricFlow CLI to define metrics in their local dbt Core project. However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. -To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/). ::: -Before you start, keep the following considerations in mind: -- Use the CLI to define metrics in YAML and query them using the [new metric specifications](https://github.com/dbt-labs/dbt-core/discussions/7456). -- You must be on dbt Core v1.6 beta or higher to use MetricFlow. [Upgrade your dbt version](/docs/core/pip-install#change-dbt-core-versions) to get started. - * Note: Support for dbt Cloud and querying via external integrations coming soon. -- MetricFlow currently only supports Snowflake and Postgres. - * Note: Support for BigQuery, Databricks, and Redshift coming soon. -- dbt Labs is working with [integration partners](https://www.getdbt.com/product/semantic-layer-integrations) to develop updated integrations for the new Semantic Layer, powered by MetricFlow, in addition to introducing other consumption methods like Python and JDBC.

+Before you start, consider the following guidelines: -
+- Define metrics in YAML and query them using these [new metric specifications](https://github.com/dbt-labs/dbt-core/discussions/7456). +- You must be on dbt v1.6 or higher to use MetricFlow. [Upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to get started. +- Use MetricFlow with Snowflake, BigQuery, Databricks, Postgres (CLI only), or Redshift. (dbt Cloud Postgres support coming soon) +- Unlock insights and query your metrics using the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and its diverse range of [available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations). + + +
- - + icon="dbt-bit"/> + icon="dbt-bit"/> + icon="dbt-bit"/> + + + + + + +

diff --git a/website/docs/docs/build/cumulative-metrics.md b/website/docs/docs/build/cumulative-metrics.md
index efdde600635..3104fd7578a 100644
--- a/website/docs/docs/build/cumulative-metrics.md
+++ b/website/docs/docs/build/cumulative-metrics.md
@@ -6,26 +6,70 @@ sidebar_label: Cumulative
 tags: [Metrics, Semantic Layer]
 ---
-Cumulative metrics aggregate a measure over a given window. If no window is specified, the window is considered infinite and accumulates values over all time.
+Cumulative metrics aggregate a measure over a given accumulation window. If no window is specified, the window is considered infinite and accumulates values over all time. You will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics.
-:::info MetricFlow time spine required
+This metric is common for calculating things like weekly active users or month-to-date revenue. The parameters, description, and type for cumulative metrics are:
+
+| Parameter | Description | Type |
+| --------- | ----------- | ---- |
+| `name` | The name of the metric. | Required |
+| `description` | The description of the metric. | Optional |
+| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required |
+| `label` | The value that will be displayed in downstream tools. | Required |
+| `type_params` | The type parameters of the metric. | Required |
+| `measure` | The measure you are referencing. | Required |
+| `window` | The accumulation window, such as 1 month, 7 days, or 1 year. This can't be used with `grain_to_date`. | Optional |
+| `grain_to_date` | Sets the accumulation grain; for example, `month` will accumulate data for one month and then restart at the beginning of the next. This can't be used with `window`. | Optional |
+
+The following displays the complete specification for cumulative metrics, along with an example:
+
+```yaml
+metrics:
+  - name: The metric name # Required
+    description: The metric description # Optional
+    type: cumulative # Required
+    label: The value that will be displayed in downstream tools # Required
+    type_params: # Required
+      measure: The measure you are referencing # Required
+      window: The accumulation window, such as 1 month, 7 days, or 1 year. # Optional. Cannot be used with grain_to_date.
+      grain_to_date: Sets the accumulation grain, such as month, which will accumulate data for one month and then restart at the beginning of the next. # Optional. Cannot be used with window.
+
+```
+
+## Cumulative metrics example
+
+
+:::tip MetricFlow time spine required
 You will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics.
 :::
+Cumulative metrics measure data over a given window and consider the window infinite when no window parameter is passed, accumulating the data over all time.
+
 ```yaml
-# Cumulative metrics aggregate a measure over a given window.
The window is considered infinite if no window parameter is passed (accumulate the measure over all time) + metrics: -- name: wau_rolling_7 - owners: - - support@getdbt.com - type: cumulative - type_params: - measures: - - distinct_users - #Omitting window will accumulate the measure over all time - window: 7 days + - name: cumulative_order_total + label: Cumulative Order total (All Time) + description: The cumulative value of all orders + type: cumulative + type_params: + measure: order_total + - name: cumulative_order_total_l1m + label: Cumulative Order total (L1M) + description: Trailing 1 month cumulative order amount + type: cumulative + type_params: + measure: order_total + window: 1 month + - name: cumulative_order_total_mtd + label: Cumulative Order total (MTD) + description: The month to date value of all orders + type: cumulative + type_params: + measure: order_total + grain_to_date: month ``` ### Window options @@ -38,28 +82,25 @@ This section details examples of when you specify and don't specify window optio If a window option is specified, the MetricFlow framework applies a sliding window to the underlying measure. -Suppose the underlying measure `distinct_users` is configured as such to reflect a count of distinct users by user_id and user_status. +Suppose the underlying measure `customers` is configured to count the unique customers making orders at the Jaffle shop. ```yaml measures: - - name: distinct_users - description: The number of distinct users creating mql queries - expr: case when user_status in ('PENDING','ACTIVE') then user_id else null end - agg: count_distinct + - name: customers + expr: customer_id + agg: count_distinct + ``` -We can write a cumulative metric `wau_rolling_7` as such: +We can write a cumulative metric `weekly_customers` as such: ``` yaml metrics: - name: wau_rolling_7 - # Define the measure and the window. + - name: weekly_customers # Define the measure and the window. type: cumulative type_params: - measures: - - distinct_users - # the default window is infinity - omitting window will accumulate the measure over all time - window: 7 days + measure: customers + window: 7 days # Setting the window to 7 days since we want to track weekly active ``` From the sample YAML above, note the following: @@ -67,7 +108,7 @@ From the sample YAML above, note the following: * `type`: Specify cumulative to indicate the type of metric. * `type_params`: Specify the measure you want to aggregate as a cumulative metric. You have the option of specifying a `window`, or a `grain to date`. -For example, in the `wau_rolling_7` cumulative metric, MetricFlow takes a sliding 7-day window of relevant users and applies a count distinct function. +For example, in the `weekly_customers` cumulative metric, MetricFlow takes a sliding 7-day window of relevant customers and applies a count distinct function. If you omit the `window`, the measure will accumulate over all time. Otherwise, you can choose from granularities like day, week, quarter, or month, and describe the window using phrases like "7 days" or "1 month." @@ -86,32 +127,32 @@ Suppose you (a subscription-based company for the sake of this example) have an * `event_type`: (integer) a column that populates with +1 to indicate an added subscription, or -1 to indicate a deleted subscription. * `revenue`: (integer) a column that multiplies `event_type` and `subscription_revenue` to depict the amount of revenue added or lost for a specific date. 
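To make that table shape concrete, a hypothetical staging model could derive the `revenue` column exactly as described above — the `source()` reference and column layout here are illustrative, not part of the docs source:

```sql
-- Illustrative staging model for the subscription event stream described above.
select
    subscription_date,
    user_id,
    subscription_revenue,
    event_type,                                   -- +1 for an added subscription, -1 for a deleted one
    event_type * subscription_revenue as revenue  -- revenue added or lost for that date
from {{ source('billing', 'subscription_events') }}
```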
-Using cumulative metrics without specifying a window, you can calculate running totals for metrics like the count of active subscriptions and revenue at any point in time. The following configuration YAML displays creating such cumulative metrics to obtain current revenue or total number of active subscriptions as a cumulative sum: +Using cumulative metrics without specifying a window, you can calculate running totals for metrics like the count of active subscriptions and revenue at any point in time. The following configuration YAML displays creating such cumulative metrics to obtain current revenue or the total number of active subscriptions as a cumulative sum: ```yaml -measures: - - name: revenue - description: Total revenue - agg: sum - expr: revenue - - name: subscription_count - description: Count of active subscriptions - agg: sum +measures: + - name: revenue + description: Total revenue + agg: sum + expr: revenue + - name: subscription_count + description: Count of active subscriptions + agg: sum expr: event_type +metrics: + - name: current_revenue + description: Current revenue + label: Current Revenue + type: cumulative + type_params: + measure: revenue + - name: active_subscriptions + description: Count of active subscriptions + label: Active Subscriptions + type: cumulative + type_params: + measure: subscription_count -metrics: -- name: current_revenue - description: Current revenue - type: cumulative - type_params: - measures: - - revenue -- name: active_subscriptions - description: Count of active subscriptions - type: cumulative - type_params: - measures: - - subscription_count ``` @@ -122,38 +163,32 @@ metrics: You can choose to specify a grain to date in your cumulative metric configuration to accumulate a metric from the start of a grain (such as week, month, or year). When using a window, such as a month, MetricFlow will go back one full calendar month. However, grain to date will always start accumulating from the beginning of the grain, regardless of the latest date of data. -For example, let's consider an underlying measure of `total_revenue.` +For example, let's consider an underlying measure of `order_total.` ```yaml -measures: - - name: total_revenue - description: Total revenue (summed) - agg: sum - expr: revenue + measures: + - name: order_total + agg: sum ``` We can compare the difference between a 1-month window and a monthly grain to date. The cumulative metric in a window approach applies a sliding window of 1 month, whereas the grain to date by month resets at the beginning of each month. 
```yaml -metrics: - name: revenue_monthly_window #For this metric, we use a window of 1 month - description: Monthly revenue using a window of 1 month (think of this as a sliding window of 30 days) - type: cumulative - type_params: - measures: - - total_revenue - window: 1 month -``` - -```yaml -metrics: - name: revenue_monthly_grain_to_date #For this metric, we use a monthly grain to date - description: Monthly revenue using grain to date of 1 month (think of this as a monthly resetting point) - type: cumulative - type_params: - measures: - - total_revenue - grain_to_date: month +metrics: + - name: cumulative_order_total_l1m #For this metric, we use a window of 1 month + label: Cumulative Order total (L1M) + description: Trailing 1 month cumulative order amount + type: cumulative + type_params: + measure: order_total + window: 1 month + - name: cumulative_order_total_mtd #For this metric, we use a monthly grain to date + label: Cumulative Order total (MTD) + description: The month to date value of all orders + type: cumulative + type_params: + measure: order_total + grain_to_date: month ``` ### Implementation diff --git a/website/docs/docs/build/custom-aliases.md b/website/docs/docs/build/custom-aliases.md index 589d64f8510..326434ea922 100644 --- a/website/docs/docs/build/custom-aliases.md +++ b/website/docs/docs/build/custom-aliases.md @@ -168,8 +168,8 @@ New in v1.5 **Related documentation:** -- [Model versions](govern/model-versions) -- [`versions`](resource-properties/versions#alias) +- [Model versions](/docs/collaborate/govern/model-versions) +- [`versions`](/reference/resource-properties/versions#alias) By default, dbt will create versioned models with the alias `_v`, where `` is that version's unique identifier. You can customize this behavior just like for non-versioned models by configuring a custom `alias` or re-implementing the `generate_alias_name` macro. diff --git a/website/docs/docs/build/custom-databases.md b/website/docs/docs/build/custom-databases.md index 300fd3147f1..dd54d6998e8 100644 --- a/website/docs/docs/build/custom-databases.md +++ b/website/docs/docs/build/custom-databases.md @@ -54,8 +54,6 @@ select * from ... ### generate_database_name -New in v0.16.0 - The database name generated for a model is controlled by a macro called `generate_database_name`. This macro can be overridden in a dbt project to change how dbt generates model database names. This macro works similarly to the [generate_schema_name](/docs/build/custom-schemas#advanced-custom-schema-configuration) macro. To override dbt's database name generation, create a macro named `generate_database_name` in your own dbt project. The `generate_database_name` macro accepts two arguments: diff --git a/website/docs/docs/build/custom-schemas.md b/website/docs/docs/build/custom-schemas.md index b8dbb9a0846..ad9fe997483 100644 --- a/website/docs/docs/build/custom-schemas.md +++ b/website/docs/docs/build/custom-schemas.md @@ -103,7 +103,7 @@ To modify how dbt generates schema names, you should add a macro named `generate If you're modifying how dbt generates schema names, don't just replace ```{{ default_schema }}_{{ custom_schema_name | trim }}``` with ```{{ custom_schema_name | trim }}``` in the ```generate_schema_name``` macro. -Removing ```{{ default_schema }}``` causes developers to overriding each other's models when custom schemas are defined. This can also cause issues during development and continuous integration (CI). 
+If you remove ```{{ default_schema }}```, it causes developers to override each other's models if they create their own custom schemas. This can also cause issues during development and continuous integration (CI). ❌ The following code block is an example of what your code _should not_ look like: ```sql @@ -180,13 +180,6 @@ The following context methods _are_ available in the `generate_schema_name` macr ### Which vars are available in generate_schema_name? - - -Variable semantics have changed in dbt v0.17.0. See the [migration guide](/guides/migration/versions) -for more information on these changes. - - - Globally-scoped variables and variables defined on the command line with [--vars](/docs/build/project-variables) are accessible in the `generate_schema_name` context. diff --git a/website/docs/docs/build/derived-metrics.md b/website/docs/docs/build/derived-metrics.md index 0ca14d1c6f2..2ad1c3e368c 100644 --- a/website/docs/docs/build/derived-metrics.md +++ b/website/docs/docs/build/derived-metrics.md @@ -6,35 +6,154 @@ sidebar_label: Derived tags: [Metrics, Semantic Layer] --- -Derived metrics in MetricFlow refer to metrics that are created by defining an expression using other metrics. Derived metrics allow for calculations on top of metrics. For example, you can define a metric called "Net Sales Per User" by using other metrics in the calculation. +In MetricFlow, derived metrics are metrics created by defining an expression using other metrics. They enable you to perform calculations with existing metrics. This is helpful for combining metrics and doing math functions on aggregated columns, like creating a profit metric. + + The parameters, description, and type for derived metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `expr` | The derived expression. | Required | +| `metrics` | The list of metrics used in the derived metrics. | Required | +| `alias` | Optional alias for the metric that you can use in the expr. | Optional | +| `filter` | Optional filter to apply to the metric. | Optional | +| `offset_window` | Set the period for the offset window, such as 1 month. This will return the value of the metric one month from the metric time. | Required | + +The following displays the complete specification for derived metrics, along with an example. ```yaml metrics: - - name: net_sales_per_user + - name: the metric name # Required + description: the metric description # Optional + type: derived # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + expr: the derived expression # Required + metrics: # The list of metrics used in the derived metrics # Required + - name: the name of the metrics. must reference a metric you have already defined # Required + alias: optional alias for the metric that you can use in the expr # Optional + filter: optional filter to apply to the metric # Optional + offset_window: set the period for the offset window, such as 1 month. This will return the value of the metric one month from the metric time. 
# Required +``` + +## Derived metrics example + +```yaml +metrics: + - name: order_gross_profit + description: Gross profit from each order. + type: derived + label: Order Gross Profit + type_params: + expr: revenue - cost + metrics: + - name: order_total + alias: revenue + - name: order_cost + alias: cost + - name: food_order_gross_profit + label: Food Order Gross Profit + description: "The gross profit for each food order." type: derived type_params: - expr: gross_sales - cogs / active_users + expr: revenue - cost metrics: - - name: gross_sales # these are all metrics (can be a derived metric, meaning building a derived metric with derived metrics) - - name: cogs - - name: users - filter: | # Optional additional constraint - {{dimension('filter')}} is_active - alias: active_users # Optional alias to use in the expr + - name: order_total + alias: revenue + filter: | + {{ Dimension('order__is_food_order') }} = True + - name: order_cost + alias: cost + filter: | + {{ Dimension('order__is_food_order') }} = True + - name: order_total_growth_mom + description: "Percentage growth of orders total completed to 1 month ago" + type: derived + label: Order Total Growth % M/M + type_params: + expr: (order_total - order_total_prev_month)*100/order_total_prev_month + metrics: + - name: order_total + - name: order_total + offset_window: 1 month + alias: order_total_prev_month ``` ## Derived metric offset -You may want to use an offset value of a metric in the definition of a derived metric. For example, if you define retention rate as (active customers at the end of the month/active customers at the beginning of the month)-1 you can model this using a derived metric with an offset. +To perform calculations using a metric's value from a previous time period, you can add an offset parameter to a derived metric. For example, if you want to calculate period-over-period growth or track user retention, you can use this metric offset. + +**Note:** You must include the [`metric_time` dimension](/docs/build/dimensions#time) when querying a derived metric with an offset window. + +The following example displays how you can calculate monthly revenue growth using a 1-month offset window: ```yaml -metrics: -- name: user_retention - type: derived +- name: customer_retention + description: Percentage of customers that are active now and those active 1 month ago + label: customer_retention type_params: - expr: active_customers/active_customers_t1m + expr: (active_customers/ active_customers_prev_month) metrics: - - name: active_customers # these are all metrics (can be a derived metric, meaning building a derived metric with derived metrics) + - name: active_customers + alias: current_active_customers - name: active_customers offset_window: 1 month - alias: active_customers_t1m + alias: active_customers_prev_month +``` + +### Offset windows and granularity + +You can query any granularity and offset window combination. The following example queries a metric with a 7-day offset and a monthly grain: + +```yaml +- name: d7_booking_change + description: Difference between bookings now and 7 days ago + type: derived + label: d7 Bookings Change + type_params: + expr: bookings - bookings_7_days_ago + metrics: + - name: bookings + alias: current_bookings + - name: bookings + offset_window: 7 days + alias: bookings_7_days_ago +``` + +When you run the query `mf query --metrics d7_booking_change --group-by metric_time__month` for the metric, here's how it's calculated: + +1. 
We retrieve the raw, unaggregated dataset with the specified measures and dimensions at the smallest level of detail, which is currently 'day'. +2. Then, we perform an offset join on the daily dataset, followed by performing a date trunc and aggregation to the requested granularity. + For example, to calculate `d7_booking_change` for July 2017: + - First, we sum up all the booking values for each day in July to calculate the bookings metric. + - The following table displays the range of days that make up this monthly aggregation. + +| | Orders | Metric_time | +| - | ---- | -------- | +| | 330 | 2017-07-31 | +| | 7030 | 2017-07-30 to 2017-07-02 | +| | 78 | 2017-07-01 | +| Total | 7438 | 2017-07-01 | + +3. Next, we calculate July's bookings with a 7-day offset. The following table displays the range of days that make up this monthly aggregation. Note that the month begins 7 days later (offset by 7 days) on 2017-07-24. + +| | Orders | Metric_time | +| - | ---- | -------- | +| | 329 | 2017-07-24 | +| | 6840 | 2017-07-23 to 2017-06-30 | +| | 83 | 2017-06-24 | +| Total | 7252 | 2017-07-01 | + +4. Lastly, we calculate the derived metric and return the final result set: + +```bash +bookings - bookings_7_days_ago would be compile as 7438 - 7252 = 186. +``` + +| d7_booking_change | metric_time__month | +| ----------------- | ------------------ | +| 186 | 2017-07-01 | diff --git a/website/docs/docs/build/dimensions.md b/website/docs/docs/build/dimensions.md index abe769e54a1..49ae9045021 100644 --- a/website/docs/docs/build/dimensions.md +++ b/website/docs/docs/build/dimensions.md @@ -8,13 +8,35 @@ tags: [Metrics, Semantic Layer] Dimensions is a way to group or filter information based on categories or time. It's like a special label that helps organize and analyze data. -In a data platform, dimensions is part of a larger structure called a semantic model. It's created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures), and used to add more details to your data that can't be easily added up or combined. In SQL, dimensions is typically included in the `dimensions` clause of your SQL query. +In a data platform, dimensions is part of a larger structure called a semantic model. It's created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures), and used to add more details to your data that can't be easily added up or combined. In SQL, dimensions is typically included in the `group by` clause of your SQL query. + -Refer to the following semantic model example: +All dimensions require a `name`, `type` and in some cases, an `expr` parameter. + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | Refers to the name of the group that will be visible to the user in downstream tools. It can also serve as an alias if the column name or SQL query reference is different and provided in the `expr` parameter.

Dimension names should be unique within a semantic model, but they can be non-unique across different models as MetricFlow uses [joins](/docs/build/join-logic) to identify the right dimension. | Required | +| `type` | Specifies the type of group created in the semantic model. There are three types:

- **Categorical**: Group rows in a table by categories like geography, color, and so on.
- **Time**: Point to a date field in the data platform. Must be of type TIMESTAMP or equivalent in the data platform engine.
- **Slowly-changing dimensions**: Analyze metrics over time and slice them by groups that change over time, like sales trends by a customer's country. | Required | +| `type_params` | Specific type params such as if the time is primary or used as a partition | Required | +| `description` | A clear description of the dimension | Optional | +| `expr` | Defines the underlying column or SQL query for a dimension. If no `expr` is specified, MetricFlow will use the column with the same name as the group. You can use column name itself to input a SQL expression. | Optional | + +Refer to the following for the complete specification for dimensions: + +```yaml +dimensions: + - name: name of the group that will be visible to the user in downstream tools + type: Categorical or Time + type_params: specific type params such as if the time is primary or used as a partition + description: same as always + expr: the column name or expression. If not provided the default is the dimension name +``` + +Refer to the following example to see how dimensions are used in a semantic model: ```yaml semantic_models: @@ -26,7 +48,6 @@ semantic_models: # --- entities --- entities: ... - # --- measures --- measures: ... @@ -40,21 +61,28 @@ semantic_models: expr: case when quantity > 10 then true else false end ``` -All dimensions require a `name`, `type` and in most cases, an `expr` parameter. +MetricFlow requires that all dimensions have a primary entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. -| Name | Parameter | Field type | -| --- | --- | --- | -| `name` | Refers to the name of the group that will be visible to the user in downstream tools. It can also serve as an alias if the column name or SQL query reference is different and provided in the `expr` parameter.

— dimensions names should be unique within a semantic model, but they can be non-unique across different models as MetricFlow uses [joins](/docs/build/join-logic) to identify the right dimension. | Required | -| `type` | Specifies the type of group created in the semantic model. There are three types:

— Categorical: Group rows in a table by categories like geography, product type, color, and so on.
— Time: Point to a date field in the data platform, and must be of type TIMESTAMP or equivalent in the data platform engine.
— Slowly-changing dimensions: Analyze metrics over time and slice them by groups that change over time, like sales trends by a customer's country. | Required | -| `expr` | Defines the underlying column or SQL query for a dimension. If no `expr` is specified, MetricFlow will use the column with the same name as the group. You can use column name itself to input a SQL expression. | Optional | +```yaml +semantic_model: + name: bookings_monthly_source + description: bookings_monthly_source + defaults: + agg_time_dimension: ds + model: ref('bookings_monthly_source') + measures: + - name: bookings_monthly + agg: sum + create_metric: true + primary_entity: booking_id +``` ## Dimensions types -Dimensions has three types. This section further explains the definitions and provides examples. +Dimensions have 2 types. This section further explains the definitions and provides examples. 1. [Categorical](#categorical) 1. [Time](#time) -1. [Slowly changing](#scd-type-ii) ### Categorical @@ -69,62 +97,35 @@ dimensions: ### Time -Time has additional parameters specified under the `type_params` section. :::tip use datetime data type if using BigQuery To use BigQuery as your data platform, time dimensions columns need to be in the datetime data type. If they are stored in another type, you can cast them to datetime using the `expr` property. Time dimensions are used to group metrics by different levels of time, such as day, week, month, quarter, and year. MetricFlow supports these granularities, which can be specified using the `time_granularity` parameter. ::: - - - - -To specify the default time dimensions for a measure or metric in MetricFlow, set the `is_primary` parameter to True. If you have multiple time dimensions in your semantic model, the non-primary ones should have `is_primary` set to False. To assign a non-primary time dimensions to a measure, use the `agg_time_dimension` parameter and refer to the time dimensions defined in the section. +Time has additional parameters specified under the `type_params` section. When you query one or more metrics in MetricFlow using the CLI, the default time dimension for a single metric is the primary time dimension, which you can refer to as `metric_time` or use the dimensions' name. -In the provided example, the semantic model has two time groups, `created_at` and `deleted_at`, with `created_at` being the primary time dimensions through `is_primary: True`. The `users_created` measure defaults to the primary time dimensions, while the `users_deleted` measure uses `deleted_at` as its time group. +You can use multiple time groups in separate metrics. 
For example, the `users_created` metric uses `created_at`, and the `users_deleted` metric uses `deleted_at`: -```yaml -dimensions: - - name: created_at - type: time - expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table - is_partition: True - type_params: - is_primary: True - time_granularity: day - - name: deleted_at - type: time - expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table - is_partition: True - type_params: - is_primary: False - time_granularity: day -measures: - - name: users_deleted - expr: 1 - agg: sum - agg_time_dimension: deleted_at - - name: users_created - expr: 1 - agg: sum +```bash +mf query --metrics users_created,users_deleted --dimensions metric_time --order metric_time ``` -When querying one or more metrics in MetricFlow using the CLI, the default time dimensions for a single metric is the primary time dimension, which can be referred to as metric_time or the dimensions's name. Multiple time groups can be used in separate metrics, such as users_created which uses created_at, and users_deleted which uses deleted_at. +You can set `is_partition` for time or categorical dimensions to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation detail (like daily, weekly, and so on): - ``` - mf query --metrics users_created,users_deleted --dimensions metric_time --order metric_time - ``` + - + - +Use `is_partition: True` to show that a dimension exists over a specific time window. For example, a date-partitioned dimensional table. When you query metrics from different tables, the dbt Semantic Layer uses this parameter to ensure that the correct dimensional values are joined to measures. -`time_granularity` specifies the smallest level of detail that a measure or metric should be reported at, such as daily, weekly, monthly, quarterly, or yearly. Different granularity options are available, and each metric must have a specified granularity. For example, a metric that is specified with weekly granularity couldn't be aggregated to a daily grain. +You can also use `is_partition` for [categorical](#categorical) dimensions as well. -The current options for time granularity are day, week, month, quarter, and year. +MetricFlow enables metric aggregation during query time. For example, you can aggregate the `messages_per_month` measure. If you originally had a `time_granularity` for the time dimensions `metric_time`, you can specify a yearly granularity for aggregation in your CLI query: -Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the highest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. +```bash +mf query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year +``` ```yaml dimensions: @@ -133,14 +134,12 @@ dimensions: expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table is_partition: True type_params: - is_primary: True time_granularity: day - name: deleted_at type: time expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table is_partition: True type_params: - is_primary: False time_granularity: day measures: @@ -155,16 +154,13 @@ measures: - - -Use `is_partition: True` to indicate that a dimension exists over a specific time window. 
For example, a date-partitioned dimensional table. When you query metrics from different tables, the Semantic Layer will use this parameter to ensure that the correct dimensional values are joined to measures. + -In addition, MetricFlow allows for easy aggregation of metrics at query time. For example, you can aggregate the `messages_per_month` measure, where the original `time_granularity` of the time dimensions `metrics_time`, at a yearly granularity by specifying it in the query in the CLI. +`time_granularity` specifies the smallest level of detail that a measure or metric should be reported at, such as daily, weekly, monthly, quarterly, or yearly. Different granularity options are available, and each metric must have a specified granularity. For example, a metric that is specified with weekly granularity couldn't be aggregated to a daily grain. -``` -mf query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year -``` +The current options for time granularity are day, week, month, quarter, and year. +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the highest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. ```yaml dimensions: @@ -173,20 +169,18 @@ dimensions: expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table is_partition: True type_params: - is_primary: True time_granularity: day - name: deleted_at type: time expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table is_partition: True type_params: - is_primary: False time_granularity: day measures: - name: users_deleted expr: 1 - agg: sum + agg: sum agg_time_dimension: deleted_at - name: users_created expr: 1 @@ -197,14 +191,13 @@ measures: - ### SCD Type II :::caution Currently, there are limitations in supporting SCD's. ::: -MetricFlow, supports joins against dimensions values in a semantic model built on top of an SCD Type II table (slowly changing dimension) Type II table. This is useful when you need a particular metric sliced by a group that changes over time, such as the historical trends of sales by a customer's country. +MetricFlow supports joins against dimensions values in a semantic model built on top of an SCD Type II table (slowly changing dimension) Type II table. This is useful when you need a particular metric sliced by a group that changes over time, such as the historical trends of sales by a customer's country. As their name suggests SCD Type II are groups that change values at a coarser time granularity. This results in a range of valid rows with different dimensions values for a given metric or measure. MetricFlow associates the metric with the first (minimum) available dimensions value within a coarser time window, such as month. By default, MetricFlow uses the group that is valid at the beginning of the time granularity. @@ -219,7 +212,7 @@ The following basic structure of an SCD Type II data platform table is supported **Note**: The SCD dimensions table must have `valid_to` and `valid_from` columns. -This is an example of SQL code that shows how a sample metric called `num_events` is joined with versioned dimensions data (stored in a table called `scd_dimensions`) using a natural key made up of the `entity_key` and `timestamp` columns. 
+This is an example of SQL code that shows how a sample metric called `num_events` is joined with versioned dimensions data (stored in a table called `scd_dimensions`) using a primary key made up of the `entity_key` and `timestamp` columns. ```sql @@ -252,7 +245,7 @@ Take note of the extra arguments under `validity_params`: `is_start` and `is_end ```yaml semantic_models: - name: sales_person_tiers - description: SCD Type II table of tiers for sales people + description: SCD Type II table of tiers for salespeople model: {{ref(sales_person_tiers)}} defaults: agg_time_dimension: tier_start @@ -277,7 +270,7 @@ semantic_models: entities: - name: sales_person - type: natural + type: primary expr: sales_person_id ``` @@ -337,7 +330,7 @@ In the sales tier example, For instance, if a salesperson was Tier 1 from 2022- -This example shows how to create slowly changing dimensions (SCD) using a semantic model. The SCD table contains information about sales persons' tier and the time length of that tier. Suppose you have the underlying SCD table: +This example shows how to create slowly changing dimensions (SCD) using a semantic model. The SCD table contains information about salespersons' tier and the time length of that tier. Suppose you have the underlying SCD table: | sales_person_id | tier | start_date | end_date | |-----------------|------|------------|----------| diff --git a/website/docs/docs/build/entities.md b/website/docs/docs/build/entities.md index 1e7f2ff878d..464fa2c3b8c 100644 --- a/website/docs/docs/build/entities.md +++ b/website/docs/docs/build/entities.md @@ -28,6 +28,17 @@ MetricFlow's join logic depends on the entity `type` you use, and it also determ * **Foreign —** A foreign key can include zero, one, or multiple instances of the same record. Null values may also be present. * **Natural —** Natural keys are column or combination of columns in a table that uniquely identify a record based on real-world data. For instance, in a sales_person_department dimension table, the sales_person_id can serve as a natural key. +The complete spec for entities is below: +```yaml +entities: + - name: transaction ## Required + type: primary or natural or foreign or unique ## Required + description: a description of the field or role the entity takes in this table ## Optional + expr: the field that denotes that entity (transaction_id). ## Optional + If not specified will default to name + +``` + Here's an example of how to define entities in a semantic model: ``` yaml diff --git a/website/docs/docs/build/exposures.md b/website/docs/docs/build/exposures.md index f58903a9726..65c0792e0a0 100644 --- a/website/docs/docs/build/exposures.md +++ b/website/docs/docs/build/exposures.md @@ -4,13 +4,6 @@ sidebar_label: "Exposures" id: "exposures" --- - - -* **v0.18.1**: Exposures are new! -* **v0.20.0**: Exposures support `tags` and `meta` properties - - - Exposures make it possible to define and describe a downstream use of your dbt project, such as in a dashboard, application, or data science pipeline. 
By defining exposures, you can then: - run, test, and list resources that feed into your exposure - populate a dedicated page in the auto-generated [documentation](/docs/collaborate/documentation) site with context relevant to data consumers diff --git a/website/docs/docs/build/groups.md b/website/docs/docs/build/groups.md index aa33db07ccc..7ac5337ba0d 100644 --- a/website/docs/docs/build/groups.md +++ b/website/docs/docs/build/groups.md @@ -1,6 +1,6 @@ --- title: "Add groups to your DAG" -sidebar_title: "Groups" +sidebar_label: "Groups" id: "groups" description: "When you define groups in dbt projects, you turn implicit relationships into an explicit grouping." keywords: diff --git a/website/docs/docs/build/hooks-operations.md b/website/docs/docs/build/hooks-operations.md index 1abc5657bad..85378498a36 100644 --- a/website/docs/docs/build/hooks-operations.md +++ b/website/docs/docs/build/hooks-operations.md @@ -4,6 +4,8 @@ description: "Read this tutorial to learn how to use hooks and operations when b id: "hooks-operations" --- +import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md'; + ## Related documentation * [pre-hook & post-hook](/reference/resource-configs/pre-hook-post-hook) * [on-run-start & on-run-end](/reference/project-configs/on-run-start-on-run-end) @@ -33,8 +35,8 @@ dbt provides hooks and operations so you can version control and execute these s Hooks are snippets of SQL that are executed at different times: * `pre-hook`: executed _before_ a model, seed or snapshot is built. * `post-hook`: executed _after_ a model, seed or snapshot is built. - * `on-run-start`: executed at the _start_ of `dbt run`, `dbt test`, `dbt seed` or `dbt snapshot` - * `on-run-end`: executed at the _end_ of `dbt run`, `dbt test`, `dbt seed` or `dbt snapshot` + * `on-run-start`: executed at the _start_ of + * `on-run-end`: executed at the _end_ of Hooks are a more-advanced capability that enable you to run custom SQL, and leverage database-specific actions, beyond what dbt makes available out-of-the-box with standard materializations and configurations. @@ -68,127 +70,6 @@ You can use hooks to provide database-specific functionality not available out-o -
- - - -### Examples using hooks - -Here's a minimal example of using hooks to grant privileges. For more information, see [`on-run-start` & `on-run-end` hooks](/reference/project-configs/on-run-start-on-run-end) and [`pre-hook` & `post-hook`](/reference/resource-configs/pre-hook-post-hook) reference sections. - - - -```yml -on-run-end: - - "grant usage on {{ target.schema }} to role reporter" - -models: - +post-hook: - - "grant select on {{ this }} to role reporter" - -``` - - - -You can also apply the `post-hook` to individual models using a `config` block: - - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to role reporter" - ] -) }} - -select ... - -``` - - - -You should use database-specific syntax when appropriate: - - - -
- - - -```sql -{{ config( - post_hook=[ - 'grant `roles/bigquery.dataViewer` on {{ this.type }} {{ this }} to "user:someone@yourcompany.com"' - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to `someone@yourcompany.com`" - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to reporter" - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to role reporter" - ] -) }} - -select ... - -``` - - - -
- -
-
### Calling a macro in a hook diff --git a/website/docs/docs/build/incremental-models.md b/website/docs/docs/build/incremental-models.md index 29c7c8c585f..07a571cd4db 100644 --- a/website/docs/docs/build/incremental-models.md +++ b/website/docs/docs/build/incremental-models.md @@ -79,14 +79,6 @@ A `unique_key` enables updating existing rows instead of just appending new rows Not specifying a `unique_key` will result in append-only behavior, which means dbt inserts all rows returned by the model's SQL into the preexisting target table without regard for whether the rows represent duplicates. - - -The optional `unique_key` parameter specifies a field that can uniquely identify each row within your model. You can define `unique_key` in a configuration block at the top of your model. If your model doesn't contain a single field that is unique, but rather a combination of columns, we recommend that you create a single column that can serve as a unique identifier (by concatenating and hashing those columns), and pass it into your model's configuration. - - - - - The optional `unique_key` parameter specifies a field (or combination of fields) that define the grain of your model. That is, the field(s) identify a single unique row. You can define `unique_key` in a configuration block at the top of your model, and it can be a single column name or a list of column names. The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](/terms/surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). @@ -101,14 +93,12 @@ When you pass a list in this way, please ensure that each column does not contai Alternatively, you can define a single-column [surrogate key](/terms/surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). ::: - - When you define a `unique_key`, you'll see this behavior for each row of "new" data returned by your dbt model: * If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. The exact mechanics of how that update/replace takes place will vary depending on your database, [incremental strategy](#about-incremental_strategy), and [strategy specific configs](#strategy-specific-configs). * If the `unique_key` is _not_ present in the "old" data, dbt will insert the entire row into the table. -Please note that if there's a `unique_key` with more than one row in either the existing target table or the new incremental rows, the incremental model run will fail. Your database and [incremental strategy](#about-incremental_strategy) will determine the specific error that you see, so if you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](/terms/surrogate-key). 
+Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](#about-incremental_strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](/terms/surrogate-key). :::info While common incremental strategies, such as`delete+insert` + `merge`, might use `unique_key`, others don't. For example, the `insert_overwrite` strategy does not use `unique_key`, because it operates on partitions of data rather than individual rows. For more information, see [About incremental_strategy](#about-incremental_strategy). @@ -395,7 +385,7 @@ models: cluster_by: ['session_start'] incremental_strategy: merge # this limits the scan of the existing table to the last 7 days of data - incremental_predicates: ["DBT_INTERNAL_DEST.session_start > datediff(day, -7, current_date)"] + incremental_predicates: ["DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date)"] # `incremental_predicates` accepts a list of SQL statements. # `DBT_INTERNAL_DEST` and `DBT_INTERNAL_SOURCE` are the standard aliases for the target table and temporary table, respectively, during an incremental run using the merge strategy. ``` @@ -412,7 +402,7 @@ Alternatively, here are the same same configurations configured within a model f cluster_by = ['session_start'], incremental_strategy = 'merge', incremental_predicates = [ - "DBT_INTERNAL_DEST.session_start > datediff(day, -7, current_date)" + "DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date)" ] ) }} @@ -430,7 +420,7 @@ merge into DBT_INTERNAL_DEST DBT_INTERNAL_DEST.id = DBT_INTERNAL_SOURCE.id and -- custom predicate: limits data scan in the "old" data / existing table - DBT_INTERNAL_DEST.session_start > datediff(day, -7, current_date) + DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date) when matched then update ... when not matched then insert ... ``` diff --git a/website/docs/docs/build/jinja-macros.md b/website/docs/docs/build/jinja-macros.md index 538a3a5e4c6..44bc85872f5 100644 --- a/website/docs/docs/build/jinja-macros.md +++ b/website/docs/docs/build/jinja-macros.md @@ -126,7 +126,7 @@ from app_data.payments ### Using a macro from a package -A number of useful macros have also been grouped together into [packages](docs/build/packages) — our most popular package is [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). +A number of useful macros have also been grouped together into [packages](/docs/build/packages) — our most popular package is [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). 
After installing a package into your project, you can use any of the macros in your own project — make sure you qualify the macro by prefixing it with the [package name](/reference/dbt-jinja-functions/project_name): diff --git a/website/docs/docs/build/join-logic.md b/website/docs/docs/build/join-logic.md index eb4e02ed423..9039822c9fd 100644 --- a/website/docs/docs/build/join-logic.md +++ b/website/docs/docs/build/join-logic.md @@ -60,7 +60,7 @@ semantic_models: - name: average_purchase_price agg: avg expr: purchase_price - - name: user_signup + - name: user_signup entities: - name: user type: primary @@ -114,8 +114,7 @@ semantic_models: - name: metric_time type: time type_params: - is_primary: true - - name: user_signup + - name: user_signup entities: - name: user_id type: primary diff --git a/website/docs/docs/build/materializations.md b/website/docs/docs/build/materializations.md index 70c7878bd69..463651ccc77 100644 --- a/website/docs/docs/build/materializations.md +++ b/website/docs/docs/build/materializations.md @@ -5,12 +5,13 @@ id: "materializations" --- ## Overview -Materializations are strategies for persisting dbt models in a warehouse. There are four types of materializations built into dbt. They are: +Materializations are strategies for persisting dbt models in a warehouse. There are five types of materializations built into dbt. They are: - - - incremental - ephemeral +- materialized view ## Configuring materializations @@ -82,7 +83,7 @@ When using the `table` materialization, your model is rebuilt as a +Options: + --search TEXT Filter available metrics by this search term + --show-all-dimensions Show all dimensions associated with a metric. + --help Show this message and exit. +``` + +## List dimensions + +This command lists all unique dimensions for a metric or multiple metrics. It displays only common dimensions when querying multiple metrics: + +```bash +mf list dimensions --metrics +Options: + --metrics SEQUENCE List dimensions by given metrics (intersection). Ex. + --metrics bookings,messages + --help Show this message and exit. +``` + +## List dimension-values + +This command lists all dimension values with the corresponding metric: + +```bash +mf list dimension-values --metrics --dimension +Options: + --dimension TEXT Dimension to query values from [required] + --metrics SEQUENCE Metrics that are associated with the dimension + [required] + --end-time TEXT Optional iso8601 timestamp to constraint the end time of + the data (inclusive) + --start-time TEXT Optional iso8601 timestamp to constraint the start time + of the data (inclusive) + --help Show this message and exit. +``` +## List entities + +This command lists all unique entities: + +```bash +mf list entities --metrics +Options: + --metrics SEQUENCE List entities by given metrics (intersection). Ex. + --metrics bookings,messages + --help Show this message and exit. +``` + +## Validate-configs + +This command performs validations against the defined semantic model configurations: + +```bash +mf validate-configs +Options: + --dw-timeout INTEGER Optional timeout for data warehouse + validation steps. Default None. + --skip-dw If specified, skips the data warehouse + validations + --show-all If specified, prints warnings and future- + errors + --verbose-issues If specified, prints any extra details + issues might have + --semantic-validation-workers INTEGER + Optional. Uses the number of workers + specified to run the semantic validations. 
+ Should only be used for exceptionally large + configs + --help Show this message and exit. +``` + +## Health checks + +This command performs a health check against the data platform you provided in the configs: + +```bash +mf health-checks +``` + +## Tutorial + +Follow the dedicated MetricFlow tutorial to help you get started: + +```bash +mf tutorial +``` + +## Query + +Create a new query with MetricFlow, execute that query against the user's data platform, and return the result: + +```bash +mf query --metrics --group-by + +Options: + + --metrics SEQUENCE Metrics to query for: syntax is --metrics bookings + or for multiple metrics --metrics bookings, messages. + + --group-by SEQUENCE Dimensions and/or entities to group by: syntax is + --group-by ds or for multiple group bys --group-by + ds, org. + + --end-time TEXT Optional iso8601 timestamp to constraint the end + time of the data (inclusive) + + --start-time TEXT Optional iso8601 timestamp to constraint the start + time of the data (inclusive) + + --where TEXT SQL-like where statement provided as a string. For + example: --where "revenue > 100". To add a dimension filter to + a where filter, you have to indicate that the filter item is part of your model. + Refer to the [FAQ](#faqs) for more info on how to do this using a template wrapper. + + --limit TEXT Limit the number of rows out using an int or leave + blank for no limit. For example: --limit 100 + + --order SEQUENCE Metrics or group bys to order by ("-" prefix for + DESC). For example: --order -ds or --order + ds,-revenue + + --csv FILENAME Provide filepath for data frame output to csv + + --explain In the query output, show the query that was + executed against the data warehouse + + --show-dataflow-plan Display dataflow plan in explain output + + --display-plans Display plans (such as metric dataflow) in the browser + + --decimals INTEGER Choose the number of decimal places to round for + the numerical values + + --show-sql-descriptions Shows inline descriptions of nodes in displayed SQL + + --help Show this message and exit. + ``` + + +## Query examples + +The following tabs present various different types of query examples that you can use to query metrics and dimensions. Select the tab that best suits your needs: + + + + + +Use the example to query metrics by dimension and return the `order_total` metric by `metric_time.` + +**Query** +```bash +mf query --metrics order_total --group-by metric_time +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 1.24 seconds +| METRIC_TIME | ORDER_TOTAL | +|:--------------|---------------:| +| 2017-06-16 | 792.17 | +| 2017-06-17 | 458.35 | +| 2017-06-18 | 490.69 | +| 2017-06-19 | 749.09 | +| 2017-06-20 | 712.51 | +| 2017-06-21 | 541.65 | +``` + + + + +You can include multiple dimensions in a query. For example, you can group by the `is_food_order` dimension to confirm if orders were for food or not. + +**Query** +```bash +mf query --metrics order_total --group-by metric_time, is_food_order +``` + +**Result** +```bash + Success 🦄 - query completed after 1.70 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-06-16 | True | 499.27 | +| 2017-06-16 | False | 292.90 | +| 2017-06-17 | True | 431.24 | +| 2017-06-17 | False | 27.11 | +| 2017-06-18 | True | 466.45 | +| 2017-06-18 | False | 24.24 | +| 2017-06-19 | False | 300.98 | +| 2017-06-19 | True | 448.11 | +``` + + + + + + +You can add order and limit functions to filter and present the data in a readable format. 
The following query limits the data set to 10 records and orders them by `metric_time`, descending. + +**Query** +```bash +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 1.41 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-31 | True | 459.90 | +| 2017-08-31 | False | 327.08 | +| 2017-08-30 | False | 348.90 | +| 2017-08-30 | True | 448.18 | +| 2017-08-29 | True | 479.94 | +| 2017-08-29 | False | 333.65 | +| 2017-08-28 | False | 334.73 | +``` + + + + +You can further filter the data set by adding a `where` clause to your query. + +**Query** + +```bash +mf query --metrics order_total --group-by metric_time --where "{{Dimension('order_id__is_food_order')}} = True" +``` + +**Result** +```bash + ✔ Success 🦄 - query completed after 1.06 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-31 | True | 459.90 | +| 2017-08-30 | True | 448.18 | +| 2017-08-29 | True | 479.94 | +| 2017-08-28 | True | 513.48 | +| 2017-08-27 | True | 568.92 | +| 2017-08-26 | True | 471.95 | +| 2017-08-25 | True | 452.93 | +| 2017-08-24 | True | 384.40 | +| 2017-08-23 | True | 423.61 | +| 2017-08-22 | True | 401.91 | +``` + + + + + +To filter by time, there are dedicated start and end time options. Using these options to filter by time allows MetricFlow to further optimize query performance by pushing down the where filter when appropriate. + +**Query** +```bash + mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' +``` + + **Result** +```bash +✔ Success 🦄 - query completed after 1.53 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-27 | True | 568.92 | +| 2017-08-26 | True | 471.95 | +| 2017-08-25 | True | 452.93 | +| 2017-08-24 | True | 384.40 | +| 2017-08-23 | True | 423.61 | +| 2017-08-22 | True | 401.91 | +``` + + + + + + + +### Additional query examples + +The following tabs present additional query examples, like exporting to a CSV. Select the tab that best suits your needs: + + + + + + + +Add `--explain` to your query to view the SQL generated by MetricFlow. + +**Query** + +```bash + mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --explain +``` + + **Result** + ```bash + ✔ Success 🦄 - query completed after 0.28 seconds +🔎 SQL (remove --explain to see data or add --show-dataflow-plan to see the generated dataflow plan): +SELECT + metric_time + , is_food_order + , SUM(order_cost) AS order_total +FROM ( + SELECT + cast(ordered_at as date) AS metric_time + , is_food_order + , order_cost + FROM ANALYTICS.js_dbt_sl_demo.orders orders_src_1 + WHERE cast(ordered_at as date) BETWEEN CAST('2017-08-22' AS TIMESTAMP) AND CAST('2017-08-27' AS TIMESTAMP) +) subq_3 +WHERE is_food_order = True +GROUP BY + metric_time + , is_food_order +ORDER BY metric_time DESC +LIMIT 10 +``` + + + + + +Add the `--csv file_name.csv` flag to export the results of your query to a csv. 
+ +**Query** + +```bash +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --csv query_example.csv +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 0.83 seconds +🖨 Successfully written query output to query_example.csv +``` + + + + +## Time granularity + +Optionally, you can specify the time granularity you want your data to be aggregated at by appending two underscores and the unit of granularity you want to `metric_time`, the global time dimension. You can group the granularity by: `day`, `week`, `month`, `quarter`, and `year`. + +Below is an example for querying metric data at a monthly grain: + +```bash +mf query --metrics revenue --group-by metric_time__month +``` + +## FAQs + +
+How can I add a dimension filter to a where filter? + +To add a dimension filter to a where filter, you have to indicate that the filter item is part of your model and use a template wrapper: {{Dimension('primary_entity__dimension_name')}}. + +Here's an example query: mf query --metrics order_total --group-by metric_time --where "{{Dimension('order_id__is_food_order')}} = True".

Before using the template wrapper, however, you will need to set up your terminal to escape curly braces for the filter template to work. + +
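For reference, the same example from the answer above as a copy-pasteable command (the `order_id__is_food_order` dimension comes from the example project used throughout these docs):

```bash
mf query --metrics order_total --group-by metric_time --where "{{Dimension('order_id__is_food_order')}} = True"
```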
+How to set up your terminal to escape curly braces? To configure your `.zshrc` profile to escape curly braces, you can use the `setopt` command to enable the BRACECCL option. This option will cause the shell to treat curly braces as literals and prevent brace expansion. Refer to the following steps to set it up:
+ +1. Open your terminal. +2. Open your .zshrc file using a text editor like nano, vim, or any other text editor you prefer. You can use the following command to open it with nano: + +```bash +nano ~/.zshrc +``` +3. Add the following line to the file: + +```bash +setopt BRACECCL +``` +4. Save and exit the text editor (in `nano`, press Ctrl + O to save, and Ctrl + X to exit). + +5. Source your .zshrc file to apply the changes: + +```bash +source ~/.zshrc +``` + +6. After making these changes, your Zsh shell will treat curly braces as literal characters and will not perform brace expansion. This means that you can use curly braces without worrying about unintended expansions. + +Keep in mind that modifying your shell configuration files can have an impact on how your shell behaves. If you're not familiar with shell configuration, it's a good idea to make a backup of your .zshrc file before making any changes. If you encounter any issues or unexpected behavior, you can revert to the backup. + + +
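As a quick sanity check after sourcing your `.zshrc`, you can confirm the option is active. This is a small sketch that assumes zsh's `setopt` builtin, run with no arguments, lists the options you have enabled:

```bash
# List the shell options currently enabled and confirm the brace option you just set appears
setopt | grep -i brace
```

If the option appears in the output, the filter template examples from the previous FAQ should now run without your shell expanding the braces.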
+ +
diff --git a/website/docs/docs/build/metricflow-time-spine.md b/website/docs/docs/build/metricflow-time-spine.md index 607df692bc9..254fa3cc5f0 100644 --- a/website/docs/docs/build/metricflow-time-spine.md +++ b/website/docs/docs/build/metricflow-time-spine.md @@ -10,12 +10,45 @@ MetricFlow uses a timespine table to construct cumulative metrics. By default, M To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: + + +```sql +{{ + config( + materialized = 'table', + ) +}} + +with days as ( + + {{ + dbt_utils.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2027','mm/dd/yyyy')" + ) + }} + +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * from final +``` + + ```sql --- metricflow_time_spine.sql +-- filename: metricflow_time_spine.sql +-- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery +{{config(materialized='table')}} with days as ( - {{dbt_utils.date_spine('day' - , "to_date('01/01/2000','mm/dd/yyyy')" - , "to_date('01/01/2027','mm/dd/yyyy')" + {{dbt_utils.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2030,01,01)" ) }} ), @@ -28,5 +61,4 @@ final as ( select * from final ``` - You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but it doesn't currently support finer grains. diff --git a/website/docs/docs/build/metrics-overview.md b/website/docs/docs/build/metrics-overview.md index 351c674ca8a..e6d875386ee 100644 --- a/website/docs/docs/build/metrics-overview.md +++ b/website/docs/docs/build/metrics-overview.md @@ -10,22 +10,43 @@ Once you've created your semantic models, it's time to start adding metrics! Met The keys for metrics definitions are: -* `name`: Provide the reference name for the metric. This name must be unique amongst all metrics. -* `type`: Define the type of metric, which can be a measure (`simple`) or ratio (`ratio`)). -* `type_params`: Additional parameters used to configure metrics. `type_params` are different for each metric type. -* `constraint`: For any type of metric, you may optionally include a constraint string, which applies a dimensional filter when computing the metric. You may think of this as your WHERE clause. -* `meta`: Additional metadata you want to add to your metric. +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | Provide the reference name for the metric. This name must be unique amongst all metrics. | Required | +| `description` | Provide the description for your metric. | Optional | +| `type` | Define the type of metric, which can be `simple`, `ratio`, `cumulative`, or `derived`. | Required | +| `type_params` | Additional parameters used to configure metrics. `type_params` are different for each metric type. | Required | +| `configs` | Provide the specific configurations for your metric. | Optional | +| `label` | The display name for your metric. This value will be shown in downstream tools. | Required | +| `filter` | You can optionally add a filter string to any metric type, applying filters to dimensions, entities, or time dimensions during metric computation. Consider it as your WHERE clause. | Optional | +| `meta` | Additional metadata you want to add to your metric. 
| Optional | + + +Here's a complete example of the metrics spec configuration: + +```yaml +metrics: + - name: metric name ## Required + description: same as always ## Optional + type: the type of the metric ## Required + type_params: ## Required + - specific properties for the metric type + configs: here for `enabled` ## Optional + label: The display name for your metric. This value will be shown in downstream tools. ## Required + filter: | ## Optional + {{ Dimension('entity__name') }} > 0 and {{ Dimension(' entity__another name') }} is not + null +``` This page explains the different supported metric types you can add to your dbt project. - ### Cumulative metrics [Cumulative metrics](/docs/build/cumulative) aggregate a measure over a given window. If no window is specified, the window would accumulate the measure over all time. **Note**m, you will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics. @@ -33,15 +54,16 @@ This page explains the different supported metric types you can add to your dbt ```yaml # Cumulative metrics aggregate a measure over a given window. The window is considered infinite if no window parameter is passed (accumulate the measure over all time) metrics: -- name: wau_rolling_7 - owners: - - support@getdbt.com - type: cumulative - type_params: - measures: - - distinct_users + - name: wau_rolling_7 + owners: + - support@getdbt.com + type: cumulative + type_params: + measures: + - distinct_users #Omitting window will accumulate the measure over all time - window: 7 days + window: 7 days + ``` ### Derived metrics @@ -49,15 +71,17 @@ metrics: ```yaml metrics: - - name: net_sales_per_user + - name: order_gross_profit + description: Gross profit from each order. type: derived - type_params: - metrics: - - name: gross_sales # these are all metrics (can be a derived metric, meaning building a derived metric with derived metrics) - - name: cogs - - name: users - filter: is_active # Optional additional constraint - alias: active_users # Optional alias to use in the expr + label: Order Gross Profit + type_params: + expr: revenue - cost + metrics: + - name: order_total + alias: revenue + - name: order_cost + alias: cost ``` ```yaml -metrics: -# Define the reference name of the metric. -# This name must be unique amongst metrics and can include lowercase letters, numbers, and underscores. -# This name is used to call the metric from the dbt Semantic Layer API. - - name: cancellations - type: simple +metrics: + - name: cancellations + type: simple type_params: - # Specify the measure you are creating a proxy for. - measure: cancellations_usd - filter: | - {{dimension('value')}} > 100 and {{dimension('acquisition', entity_path=['user'])}} + measure: cancellations_usd # Specify the measure you are creating a proxy for. + filter: | + {{ Dimension('order__value')}} > 100 and {{Dimension('user__acquisition')}} ``` +## Filters + +A filter is configured using Jinja templating. Use the following syntax to reference entities, dimensions, and time dimensions in filters: +```yaml +filter: | + {{ Entity('entity_name') }} +filter: | + {{ Dimension('primary_entity__dimension_name') }} +filter: | + {{ TimeDimension('time_dimension', 'granularity') }} +``` ### Further configuration You can set more metadata for your metrics, which can be used by other tools later on. The way this metadata is used will vary based on the specific integration partner - **Description** — Write a detailed description of the metric. 
- + ## Related docs diff --git a/website/docs/docs/build/metrics.md b/website/docs/docs/build/metrics.md index 4ce7372e7d0..7a505fdad14 100644 --- a/website/docs/docs/build/metrics.md +++ b/website/docs/docs/build/metrics.md @@ -4,33 +4,35 @@ id: "metrics" description: "When you define metrics in dbt projects, you encode crucial business logic in tested, version-controlled code. The dbt metrics layer helps you standardize metrics within your organization." keywords: - dbt metrics layer +tags: [Metrics] --- - +:::caution Upgrade to access MetricFlow and the new dbt Semantic Layer -:::info dbt Metrics isn't supported +The dbt_metrics package has been deprecated and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6). If you're using the dbt_metrics package or the legacy Semantic Layer (available on v1.5 or lower), we **highly** recommend [upgrading your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to access MetricFlow and the new [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl?version=1.6). -dbt Metrics is no longer supported in v1.6 and higher. To build your semantic layer, define and query metrics, and provide data governance - refer to [Build your Semantic Layer](/docs/build/build-metrics-intro) for updated guidance. +To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. ::: - - - - -:::info dbt Metrics not recommended + + -dbt Metrics won't be supported in v1.6 and higher, and is being replaced with MetricFlow. [Defining metrics](/docs/build/build-semantic-layer-intro) with MetricFlow will help shape the future of the dbt Semantic Layer — let us know [your thoughts and join the convo](https://github.com/dbt-labs/dbt-core/discussions/7456) to help build it! +The dbt Semantic Layer has undergone a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), improving governance, introducing new APIs, and making it more efficient to define/query metrics. This revamp means the dbt_metrics package and the legacy Semantic Layer, available in dbt v1.5 or lower, are no longer supported and won't receive any code fixes. -::: +**What’s changed?**

+The dbt_metrics package has been [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. We will also remove the dbt_metrics spec and docs when it's fully deprecated. +**Who does this affect?**

+Anyone who uses the dbt_metrics package or is integrated with the legacy Semantic Layer. The new Semantic Layer is available to [Team or Enterprise](https://www.getdbt.com/pricing/) multi-tenant dbt Cloud plans [hosted in North America](/docs/cloud/about-cloud/regions-ip-addresses). You must be on dbt v1.6 or higher to access it. All users can define metrics using MetricFlow. Users on dbt Cloud Developer plans or dbt Core can only use it to define and test metrics locally, but can't dynamically query them with integrated tools. - +**What should you do?**

+If you've defined metrics using dbt_metrics or integrated with the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use MetricFlow or the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. -* **v1.3.0**: Metrics have been moved out of the experimental phase -* **v1.0.0**: Metrics are new and experimental -
+
+ + A metric is an aggregation over a that supports zero or more dimensions. Some examples of metrics include: - active users @@ -59,7 +61,7 @@ You can define metrics in `.yml` files nested under a `metrics:` key. Metric nam - begin with a letter - contain no more than 250 characters -For a short human-friendly name with title casing, spaces, and special characters, use the `label` property. More examples and guidance for how to [define and structure metrics can be found here.](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics). +For a short human-friendly name with title casing, spaces, and special characters, use the `label` property. ### Example definition @@ -218,14 +220,17 @@ Metrics can have many declared **properties**, which define aspects of your metr ### Available calculation methods + The method of calculation (aggregation or derived) that is applied to the expression. + + The type of calculation (aggregation or expression) that is applied to the sql property. -| Metric Calculation Method Metric Type | Description | +| Metric Calculation Method | Description | |----------------|----------------------------------------------------------------------------| | count | This metric type will apply the `count` aggregation to the specified field | | count_distinct | This metric type will apply the `count` aggregation to the specified field, with an additional distinct statement inside the aggregation | @@ -428,6 +433,11 @@ The following is the list of currently accepted metric configs: ## Querying Your Metric + +:::caution dbt_metrics is no longer supported +The dbt_metrics package has been deprecated and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. +::: + You can dynamically query metrics directly in dbt and verify them before running a job in the deployment environment. To query your defined metric, you must have the [dbt_metrics package](https://github.com/dbt-labs/dbt_metrics) installed. Information on how to [install packages can be found here](https://docs.getdbt.com/docs/build/packages#how-do-i-add-a-package-to-my-project). Use the following [metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/) installation code in your packages.yml file and run `dbt deps` to install the metrics package: @@ -452,16 +462,6 @@ packages:
- - -```yml -packages: - - package: dbt-labs/metrics - version: [">=0.2.0", "<0.3.0"] -``` - - - Once the package has been installed with `dbt deps`, make sure to run the `dbt_metrics_default_calendar` model as this is required for macros used to query metrics. More information on this, and additional calendar functionality, can be found in the [project README](https://github.com/dbt-labs/dbt_metrics#calendar). ### Querying metrics with `metrics.calculate` @@ -480,19 +480,6 @@ from {{ metrics.calculate( - - -```sql -select * -from {{ metrics.calculate( - metric_name='new_customers', - grain='week', - dimensions=['plan', 'country'] -) }} -``` - - - ### Supported inputs The example above doesn't display all the potential inputs you can provide to the macro. @@ -501,7 +488,7 @@ You may find some pieces of functionality, like secondary calculations, complica | Input | Example | Description | Required | | ----------- | ----------- | ----------- | -----------| -| metric_listmetric_name | `metric('some_metric)'`,
[`metric('some_metric)'`,
`metric('some_other_metric)'`]
`'metric_name'`
| The metric(s) to be queried by the macro. If multiple metrics required, provide in list format.The name of the metric | Required | +| metric_list | `metric('some_metric')`,
[`metric('some_metric')`,
`metric('some_other_metric')`]
| The metric(s) to be queried by the macro. If multiple metrics are required, provide them in a list. | Required | +| grain | `'day'`, `'week'`,
`'month'`, `'quarter'`,
`'year'`
| The time grain that the metric will be aggregated to in the returned dataset | Optional | | dimensions | [`'plan'`,
`'country'`] | The dimensions you want the metric to be aggregated by in the returned dataset | Optional | | secondary_calculations | [`metrics.period_over_period( comparison_strategy="ratio", interval=1, alias="pop_1wk")`] | Performs the specified secondary calculation on the metric results. Examples include period over period calculations, rolling calculations, and period to date calculations. | Optional | @@ -541,6 +528,7 @@ The period to date secondary calculation performs an aggregation on a defined pe #### Rolling: + The rolling secondary calculation performs an aggregation on a number of rows in metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4` then the value returned will be a rolling 4 week calculation of whatever aggregation type was selected. If the `interval` input is not provided then the rolling caclulation will be unbounded on all preceding rows. | Input | Example | Description | Required | @@ -552,6 +540,7 @@ The rolling secondary calculation performs an aggregation on a number of rows in + The rolling secondary calculation performs an aggregation on a number of rows in the metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4`, then the value returned will be a rolling 4-week calculation of whatever aggregation type was selected. | Input | Example | Description | Required | @@ -651,12 +640,6 @@ from {{ metrics.develop( - - -Functionality for `develop` is only supported in v1.2 and higher. Please navigate to those versions for information about this method of metric development. - - - #### Multiple/Derived Metrics with `metrics.develop` If you have a more complicated use case that you are interested in testing, the develop macro also supports this behavior. The only caveat is that you must include the raw tags for any provided metric yml that contains a derived metric. Example below: @@ -715,4 +698,6 @@ The above example will return a dataset that contains the metric provided in the + + diff --git a/website/docs/docs/build/packages.md b/website/docs/docs/build/packages.md index d4cebc7a6f0..74e25262994 100644 --- a/website/docs/docs/build/packages.md +++ b/website/docs/docs/build/packages.md @@ -48,11 +48,7 @@ packages: - - -- **v1.0.0:** The default [`packages-install-path`](/reference/project-configs/packages-install-path) has been updated to be `dbt_packages` instead of `dbt_modules`. - - +The default [`packages-install-path`](/reference/project-configs/packages-install-path) is `dbt_packages`. 3. Run `dbt deps` to install the package(s). Packages get installed in the `dbt_packages` directory – by default this directory is ignored by git, to avoid duplicating the source code for the package. @@ -89,13 +85,6 @@ In comparison, other package installation methods are unable to handle the dupli #### Prerelease versions - - -* `v0.20.1`: Fixed handling for prerelease versions. Introduced `install-prerelease` parameter. -* `v1.0.0`: When you provide an explicit prerelease version, dbt will install that version. - - - Some package maintainers may wish to push prerelease versions of packages to the dbt Hub, in order to test out new functionality or compatibility with a new version of dbt. A prerelease version is demarcated by a suffix, such as `a1` (first alpha), `b2` (second beta), or `rc3` (third release candidate). By default, `dbt deps` will not include prerelease versions when resolving package dependencies. 
You can enable the installation of prereleases in one of two ways: @@ -130,12 +119,6 @@ packages: - - -* `v0.20.0`: Introduced the ability to specify commit hashes as package revisions - - - Add the Git URL for the package, and optionally specify a revision. The revision can be: - a branch name - a tagged release @@ -265,12 +248,6 @@ Read more about creating a Personal Access Token [here](https://confluence.atlas #### Configure subdirectory for packaged projects - - -* `v0.20.0`: Introduced the ability to specify `subdirectory` - - - In general, dbt expects `dbt_project.yml` to be located as a top-level file in a package. If the packaged project is instead nested in a subdirectory—perhaps within a much larger mono repo—you can optionally specify the folder path as `subdirectory`. dbt will attempt a [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) of just the files located within that subdirectory. Note that you must be using a recent version of `git` (`>=2.26.0`). @@ -284,18 +261,35 @@ packages: ### Local packages -Packages that you have stored locally can be installed by specifying the path to the project, like so: +A "local" package is a dbt project accessible from your local file system. You can install it by specifying the project's path. It works best when you nest the project within a subdirectory relative to your current project's directory. ```yaml packages: - - local: /opt/dbt/redshift # use a local path + - local: relative/path/to/subdirectory ``` -Local packages should only be used for specific situations, for example, when testing local changes to a package. +Other patterns may work in some cases, but not always. For example, if you install this project as a package elsewhere, or try running it on a different system, the relative and absolute paths will yield the same results. + + + +```yaml +packages: + # not recommended - support for these patterns vary + - local: /../../redshift # relative path to a parent directory + - local: /opt/dbt/redshift # absolute path on the system +``` + + + +There are a few specific use cases where we recommend using a "local" package: +1. **Monorepo** — When you have multiple projects, each nested in a subdirectory, within a monorepo. "Local" packages allow you to combine projects for coordinated development and deployment. +2. **Testing changes** — To test changes in one project or package within the context of a downstream project or package that uses it. By temporarily switching the installation to a "local" package, you can make changes to the former and immediately test them in the latter for quicker iteration. This is similar to [editable installs](https://pip.pypa.io/en/stable/topics/local-project-installs/) in Python. +3. **Nested project** — When you have a nested project that defines fixtures and tests for a project of utility macros, like [the integration tests within the `dbt-utils` package](https://github.com/dbt-labs/dbt-utils/tree/main/integration_tests). + ## What packages are available? Check out [dbt Hub](https://hub.getdbt.com) to see the library of published dbt packages! diff --git a/website/docs/docs/build/projects.md b/website/docs/docs/build/projects.md index a7ca3638590..0d7dd889fa6 100644 --- a/website/docs/docs/build/projects.md +++ b/website/docs/docs/build/projects.md @@ -18,6 +18,7 @@ At a minimum, all a project needs is the `dbt_project.yml` project configuration | [sources](/docs/build/sources) | A way to name and describe the data loaded into your warehouse by your Extract and Load tools. 
| | [exposures](/docs/build/exposures) | A way to define and describe a downstream use of your project. | | [metrics](/docs/build/metrics) | A way for you to define metrics for your project. | +| [groups](/docs/build/groups) | Groups enable collaborative node organization in restricted collections. | | [analysis](/docs/build/analyses) | A way to organize analytical SQL queries in your project such as the general ledger from your QuickBooks. | When building out the structure of your project, you should consider these impacts on your organization's workflow: diff --git a/website/docs/docs/build/python-models.md b/website/docs/docs/build/python-models.md index 5b9222ad1c5..bff65362d06 100644 --- a/website/docs/docs/build/python-models.md +++ b/website/docs/docs/build/python-models.md @@ -16,11 +16,15 @@ We encourage you to: dbt Python (`dbt-py`) models can help you solve use cases that can't be solved with SQL. You can perform analyses using tools available in the open-source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. Python transformations defined in dbt are models in your project with all the same capabilities around testing, documentation, and lineage. + Python models are supported in dbt Core 1.3 and higher. Learn more about [upgrading your version in dbt Cloud](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions) and [upgrading dbt Core versions](https://docs.getdbt.com/docs/core-versions#upgrading-to-new-patch-versions). To read more about Python models, change the [docs version to 1.3](/docs/build/python-models?version=1.3) (or higher) in the menu bar. + + + @@ -146,7 +150,7 @@ with upstream_python_model as ( :::caution -Referencing [ephemeral](docs/build/materializations#ephemeral) models is currently not supported (see [feature request](https://github.com/dbt-labs/dbt-core/issues/7288)) +Referencing [ephemeral](/docs/build/materializations#ephemeral) models is currently not supported (see [feature request](https://github.com/dbt-labs/dbt-core/issues/7288)) ::: ## Configuring Python models @@ -711,3 +715,5 @@ You can also install packages at cluster creation time by [defining cluster prop
+ + \ No newline at end of file diff --git a/website/docs/docs/build/ratio-metrics.md b/website/docs/docs/build/ratio-metrics.md index d70815f140d..97efe0f55bf 100644 --- a/website/docs/docs/build/ratio-metrics.md +++ b/website/docs/docs/build/ratio-metrics.md @@ -6,40 +6,60 @@ sidebar_label: Ratio tags: [Metrics, Semantic Layer] --- -Ratio allows you to create a ratio between two measures. You simply specify a numerator and a denominator measure. Additionally, you can apply a dimensional filter to both the numerator and denominator using a constraint string when computing the metric. +Ratio allows you to create a ratio between two metrics. You simply specify a numerator and a denominator metric. Additionally, you can apply a dimensional filter to both the numerator and denominator using a constraint string when computing the metric. + + The parameters, description, and type for ratio metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `numerator` | The name of the metric used for the numerator, or structure of properties. | Required | +| `denominator` | The name of the metric used for the denominator, or structure of properties. | Required | +| `filter` | Optional filter for the numerator or denominator. | Optional | +| `alias` | Optional alias for the numerator or denominator. | Optional | + +The following displays the complete specification for ratio metrics, along with an example. ```yaml -# Ratio Metric - metrics: - - name: cancellation_rate - owners: - - support@getdbt.com - type: ratio # Ratio metrics create a ratio out of two measures. Define the measures from the semantic model as numerator or denominator - type_params: - numerator: cancellations_usd - denominator: transaction_amount_usd - filter: | # add optional constraint string. This applies to both the numerator and denominator - {{ dimension('country', entity_path=['customer']) }} = 'MX' +metrics: + - name: The metric name # Required + description: the metric description # Optional + type: ratio # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + numerator: The name of the metric used for the numerator, or structure of properties # Required + name: Name of metric used for the numerator # Required + filter: Filter for the numerator # Optional + alias: Alias for the numerator # Optional + denominator: The name of the metric used for the denominator, or structure of properties # Required + name: Name of metric used for the denominator # Required + filter: Filter for the denominator # Optional + alias: Alias for the denominator # Optional +``` + +## Ratio metrics example - - name: enterprise_cancellation_rate - owners: - - support@getdbt.com - type: ratio # Ratio metrics create a ratio out of two measures. Define the measures from the semantic model as numerator or denominator - type_params: - numerator: - name: cancellations_usd - filter: tier = 'enterprise' #constraint only applies to the numerator - denominator: transaction_amount_usd - filter: | # add optional constraint string. 
This applies to both the numerator and denominator - {{ dimension('country', entity_path=['customer']) }} = 'MX' +```yaml +metrics: + - name: food_order_pct + description: "The food order count as a ratio of the total order count" + label: Food Order Ratio + type: ratio + type_params: + numerator: food_orders + denominator: orders ``` -### Different semantic models +## Ratio metrics using different semantic models -If the numerator and denominator in a ratio metric come from different semantic models, the system will compute their values in subqueries and then join the result set based on common dimensions to calculate the final ratio. Here's an example of the generated SQL for such a ratio metric. +The system will simplify and turn the numerator and denominator in a ratio metric from different semantic models by computing their values in sub-queries. It will then join the result set based on common dimensions to calculate the final ratio. Here's an example of the SQL generated for such a ratio metric. -```SQL +```sql select subq_15577.metric_time as metric_time , cast(subq_15577.mql_queries_created_test as double) / cast(nullif(subq_15582.distinct_query_users, 0) as double) as mql_queries_per_active_user @@ -83,9 +103,9 @@ on ) ``` -### Add filter +## Add filter -Users can define constraints on input measures for a metric by applying a filter directly to the measure, like so: +Users can define constraints on input metrics for a ratio metric by applying a filter directly to the input metric, like so: ```yaml metrics: @@ -97,10 +117,11 @@ metrics: type_params: numerator: name: distinct_purchasers - filter: {{dimension('is_frequent_purchaser')}} + filter: | + {{Dimension('customer__is_frequent_purchaser')}} alias: frequent_purchasers denominator: name: distinct_purchasers ``` -Note the `filter` and `alias` parameters for the measure referenced in the numerator. Use the `filter` parameter to apply a filter to the measure it's attached to. The `alias` parameter is used to avoid naming conflicts in the rendered SQL queries when the same measure is used with different filters. If there are no naming conflicts, the `alias` parameter can be left out. +Note the `filter` and `alias` parameters for the metric referenced in the numerator. Use the `filter` parameter to apply a filter to the metric it's attached to. The `alias` parameter is used to avoid naming conflicts in the rendered SQL queries when the same metric is used with different filters. If there are no naming conflicts, the `alias` parameter can be left out. diff --git a/website/docs/docs/build/semantic-models.md b/website/docs/docs/build/semantic-models.md index 28fccaddb72..bb56bd212e6 100644 --- a/website/docs/docs/build/semantic-models.md +++ b/website/docs/docs/build/semantic-models.md @@ -12,21 +12,45 @@ Semantic models serve as the foundation for defining data in MetricFlow, which p Each semantic model corresponds to a dbt model in your DAG. Therefore you will have one YAML config for each semantic model in your dbt project. You can create multiple semantic models out of a single dbt model, as long as you give each semantic model a unique name. -You can configure semantic models in your dbt project directory in a `YAML` file. Depending on your project structure, you can nest semantic models under a `metrics:` folder or organize them under project sources. Semantic models have 6 components and this page explains the definitions with some examples: +You can configure semantic models in your dbt project directory in a `YAML` file. 
Depending on your project structure, you can nest semantic models under a `metrics:` folder or organize them under project sources. -1. [Name](#name) — Unique name for the semantic model. -1. [Description](#description) — Includes important details in the description. -1. [Model](#model) — Specifies the dbt model for the semantic model using the `ref` function. -1. [Entities](#entities) — Uses the columns from entities as join keys and indicate their type as primary, foreign, or unique keys with the `type` parameter. -1. [Dimensions](#dimensions) — Different ways to group or slice data for a metric, they can be `time-based` or `categorical`. -1. [Measures](#measures) — Aggregations applied to columns in your data model. They can be the final metric or used as building blocks for more complex metrics. +Semantic models have 6 components and this page explains the definitions with some examples: +| Component | Description | Type | +| --------- | ----------- | ---- | +| [Name](#name) | Unique name for the semantic model | Required | +| [Description](#description) | Includes important details in the description | Optional | +| [Model](#model) | Specifies the dbt model for the semantic model using the `ref` function | Required | +| [Defaults](#defaults) | The defaults for the model, currently only `agg_time_dimension` is supported. | Required | +| [Entities](#entities) | Uses the columns from entities as join keys and indicate their type as primary, foreign, or unique keys with the `type` parameter | Required | +| [Primary Entity](#primary-entity) | If a primary entity exists, this component is Optional. If the semantic model has no primary entity, then this property is required. | Optional | +| [Dimensions](#dimensions) | Different ways to group or slice data for a metric, they can be `time` or `categorical` | Required | +| [Measures](#measures) | Aggregations applied to columns in your data model. They can be the final metric or used as building blocks for more complex metrics | Optional | ## Semantic models components +The complete spec for semantic models is below: + +```yaml +semantic_models: + - name: the_name_of_the_semantic_model ## Required + description: same as always ## Optional + model: ref('some_model') ## Required + defaults: ## Required + agg_time_dimension: dimension_name ## Required if the model contains dimensions + entities: ## Required + - see more information in entities + measures: ## Optional + - see more information in measures section + dimensions: ## Required + - see more information in dimensions section + primary_entity: >- + if the semantic model has no primary entity, then this property is required. #Optional if a primary entity exists, otherwise Required +``` + The following example displays a complete configuration and detailed descriptions of each field: -```yml +```yaml semantic_models: - name: transaction # A semantic model with the name Transactions model: ref('fact_transactions') # References the dbt model named `fact_transactions` @@ -42,7 +66,6 @@ semantic_models: type: foreign expr: customer_id - dimensions: # dimensions are qualitative values such as names, dates, or geographical data. They provide context to metrics and allow "metric by group" data slicing. - name: transaction_date type: time @@ -94,10 +117,33 @@ Includes important details in the description of the semantic model. This descri Specify the dbt model for the semantic model using the [`ref` function](/reference/dbt-jinja-functions/ref). +### Defaults + +Defaults for the semantic model. 
Currently only `agg_time_dimension`. `agg_time_dimension` represents the default time dimensions for measures. This can be overridden by adding the `agg_time_dimension` key directly to a measure - see [Dimensions](/docs/build/dimensions) for examples. ### Entities To specify the [entities](/docs/build/entities) in your model, use their columns as join keys and indicate their `type` as primary, foreign, or unique keys with the type parameter. +### Primary entity + +MetricFlow requires that all dimensions be tied to an entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. + +You can define a primary entity using the following configs: + +```yaml +semantic_model: + name: bookings_monthly_source + description: bookings_monthly_source + defaults: + agg_time_dimension: ds + model: ref('bookings_monthly_source') + measures: + - name: bookings_monthly + agg: sum + create_metric: true + primary_entity: booking_id + ``` + @@ -117,7 +163,7 @@ This example shows a semantic model with three entities and their entity types: To reference a desired column, use the actual column name from the model in the `name` parameter. You can also use `name` as an alias to rename the column, and the `expr` parameter to refer to the original column name or a SQL expression of the column. -```yml +```yaml entity: - name: transaction type: primary @@ -140,11 +186,11 @@ You can refer to entities (join keys) in a semantic model using the `name` param MetricFlow simplifies this by allowing you to query all metric groups and construct the join during the query. To specify dimensions parameters, include the `name` (either a column or SQL expression) and `type` (`categorical` or `time`). Categorical groups represent qualitative values, while time groups represent dates of varying granularity. -dimensions are identified using the name parameter, just like identifiers. The naming of groups must be unique within a semantic model, but not across semantic models since MetricFlow, uses entities to determine the appropriate groups. +Dimensions are identified using the name parameter, just like identifiers. The naming of groups must be unique within a semantic model, but not across semantic models since MetricFlow, uses entities to determine the appropriate groups. MetricFlow requires all dimensions be tied to a primary entity. :::info For time groups -For semantic models with a measure, you must have a primary time group. +For semantic models with a measure, you must have a [primary time group](/docs/build/dimensions#time). ::: diff --git a/website/docs/docs/build/simple.md b/website/docs/docs/build/simple.md index 0092427699d..7022ca9d007 100644 --- a/website/docs/docs/build/simple.md +++ b/website/docs/docs/build/simple.md @@ -6,7 +6,33 @@ sidebar_label: Simple tags: [Metrics, Semantic Layer] --- -Simple metrics are metrics that directly reference a single measure, without any additional measures involved. +Simple metrics are metrics that directly reference a single measure, without any additional measures involved. They are aggregations over a column in your data platform and can be filtered by one or multiple dimensions. 
+ + The parameters, description, and type for simple metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `measure` | The measure you're referencing. | Required | + +The following displays the complete specification for simple metrics, along with an example. + + +```yaml +metrics: + - name: The metric name # Required + description: the metric description # Optional + type: simple # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + measure: The measure you're referencing # Required + +``` + -``` yaml -metrics: - - name: cancellations - type: simple # Pointers to a measure you created in a data source - type_params: - measure: cancellations_usd # The measure you're creating a proxy of. - # For any metric optionally include a filter string which applies a dimensional filter when computing the metric - filter: | - {{dimension('value')}} > 100 and {{dimension('acquisition', entity_path=['user'])}} +## Simple metrics example + +```yaml + metrics: + - name: customers + description: Count of customers + type: simple # Pointers to a measure you created in a semantic model + label: Count of customers + type_params: + measure: customers # The measure youre creating a proxy of. + - name: large_orders + description: "Order with order values over 20." + type: SIMPLE + label: Large Orders + type_params: + measure: orders + filter: | # For any metric you can optionally include a filter on dimension values + {{Dimension('customer__order_total_dim')}} >= 20 ``` diff --git a/website/docs/docs/build/sl-getting-started.md b/website/docs/docs/build/sl-getting-started.md index ff0e6006921..c0bf59ae0c2 100644 --- a/website/docs/docs/build/sl-getting-started.md +++ b/website/docs/docs/build/sl-getting-started.md @@ -4,129 +4,115 @@ title: Get started with MetricFlow description: "Learn how to create your first semantic model and metric." sidebar_label: Get started with MetricFlow tags: [Metrics, Semantic Layer] +meta: + api_name: dbt Semantic Layer APIs --- -This getting started page recommends a workflow to help you get started creating your first metrics. Here are the following steps you'll take: +import InstallMetricFlow from '/snippets/_sl-install-metricflow.md'; +import CreateModel from '/snippets/_sl-create-semanticmodel.md'; +import DefineMetrics from '/snippets/_sl-define-metrics.md'; +import ConfigMetric from '/snippets/_sl-configure-metricflow.md'; +import TestQuery from '/snippets/_sl-test-and-query-metrics.md'; -- [Create a semantic model](#create-a-semantic-model) -- [Create your metrics](#create-your-metrics) -- [Test and query your metrics](#test-and-query-your-metrics) +This getting started page presents a sample workflow to help you create your first metrics in dbt Cloud or the command-line interface (CLI). It uses the [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) as the project data source and is available for you to use. + +If you prefer, you can create semantic models and metrics for your own dbt project. 
This page will guide you on how to: + +- [Create a semantic model](#create-a-semantic-model) using MetricFlow +- [Define metrics](#define-metrics) using MetricFlow +- [Test and query metrics locally](#test-and-query-metrics) using MetricFlow +- [Run a production job](#run-a-production-job) in dbt Cloud +- [Set up dbt Semantic Layer](#set-up-dbt-semantic-layer) in dbt Cloud +- [Connect to and query the API](#connect-and-query-api) with dbt Cloud + + +MetricFlow allows users to define metrics in their dbt project whether in dbt Cloud or in dbt Core. dbt Core users can use the [MetricFlow CLI](/docs/build/metricflow-cli) to define metrics in their local dbt Core project. + +However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. ## Prerequisites -- Use the [command line (CLI)](/docs/core/about-the-cli) and have a dbt project and repository set up. - * Note: Support for dbt Cloud and integrations coming soon. -- Your dbt production environment must be on [dbt Core v1.6](/docs/dbt-versions/core) or higher. Support for the development environment coming soon. -- Have a dbt project connected to Snowflake or Postgres. - * Note: Support for BigQuery, Databricks, and Redshift coming soon. - Have an understanding of key concepts in [MetricFlow](/docs/build/about-metricflow), which powers the revamped dbt Semantic Layer. -- Recommended — dbt Labs recommends you install the [MetricFlow CLI package](https://github.com/dbt-labs/metricflow) to test your metrics. +- Have both your production and development environments running dbt version 1.6 or higher. Refer to [upgrade in dbt Cloud](/docs/dbt-versions/upgrade-core-in-cloud) for more info. +- Use Snowflake, BigQuery, Databricks, Redshift, or Postgres (Postgres available in the CLI only, dbt Cloud support coming soon). +- Create a successful run in the environment where you configure the Semantic Layer. + - **Note:** Semantic Layer currently supports the Deployment environment for querying. (_development querying experience coming soon_) +- Set up the [Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) in the integrated tool to import metric definitions. + - **Note:** To access the API and query metrics in downstream tools, you must have a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. dbt Core or Developer accounts can define metrics using [MetricFlow CLI](/docs/build/metricflow-cli) or the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud).
+- Understand [MetricFlow's](/docs/build/about-metricflow) key concepts, which powers the revamped dbt Semantic Layer. :::tip New to dbt or metrics? Try our [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) to help you get started! ::: -## Install MetricFlow +## Create a semantic model -Before you begin, make sure you install the `metricflow` and [dbt adapter](/docs/supported-data-platforms) via PyPI in the CLI. To install them, open the command line interface (CLI) and use the pip install command `pip install "dbt-metricflow[your_adapter_name]"`. + -Note that specifying `[your_adapter_name]` is required. This is because you must install MetricFlow as an extension of a dbt adapter. For example, for a Snowflake adapter, run `pip install "dbt-metricflow[snowflake]"`. - -Currently, the supported adapters are Snowflake and Postgres (BigQuery, Databricks, and Redshift coming soon). +## Define metrics -## Create a semantic model + -MetricFlow, which powers the dbt Semantic Layer, has two main objects: [semantic models](/docs/build/semantic-models) and [metrics](/docs/build/metrics-overview). You can think of semantic models as nodes in your semantic graph, connected via entities as edges. MetricFlow takes semantic models defined in YAML configuration files as inputs and creates a semantic graph that you can use to query metrics. - -This step will guide you through setting up your semantic models, which consists of [entities](/docs/build/entities), [dimensions](/docs/build/dimensions), and [measures](/docs/build/measures). - -1. Name your semantic model, fill in appropriate metadata, and map it to a model in your dbt project. -```yaml -semantic_models: - - name: transactions - description: | - This table captures every transaction starting July 02, 2014. Each row represents one transaction - model: ref('fact_transactions') - ``` - -2. Define your entities. These are the keys in your table that MetricFlow will use to join other semantic models. These are usually columns like `customer_id`, `transaction_id`, and so on. - -```yaml - entities: - - name: transaction - type: primary - expr: id_transaction - - name: customer - type: foreign - expr: id_customer - ``` - -3. Define your dimensions and measures. dimensions are properties of the records in your table that are non-aggregatable. They provide categorical or time-based context to enrich metrics. Measures are the building block for creating metrics. They are numerical columns that MetricFlow aggregates to create metrics. - -```yaml -measures: - - name: transaction_amount_usd - description: The total USD value of the transaction. - agg: sum - dimensions: - - name: is_large - type: categorical - expr: case when transaction_amount_usd >= 30 then true else false end -``` - -:::tip -If you're familiar with writing SQL, you can think of dimensions as the columns you would group by and measures as the columns you would aggregate. -```sql -select - metric_time_day, -- time - country, -- categorical dimension - sum(revenue_usd) -- measure -from - snowflake.fact_transactions -- sql table -group by metric_time_day, country -- dimensions - ``` -::: +## Configure the MetricFlow time spine model -## Create your metrics + -Now that you've created your first semantic model, it's time to define your first metric. MetricFlow supports different metric types like [simple](/docs/build/simple), [ratio](/docs/build/ratio), [cumulative](/docs/build/cumulative), and [derived](/docs/build/derived). 
You can define metrics in the same YAML files as your semantic models, or create a new file. +## Test and query metrics -The example metric we'll create is a simple metric that refers directly to a measure, based on the `transaction_amount_usd` measure, which will be implemented as a `sum()` function in SQL. + -```yaml ---- -metrics: - - name: transaction_amount_usd - type: simple - type_params: - measure: transaction_amount_usd -``` +## Run a production job + +Before you begin, you must have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment, hosted in North America (cloud.getdbt.com login URL). + +Once you’ve defined metrics in your dbt project, you can perform a job run in your dbt Cloud deployment environment to materialize your metrics. Only the deployment environment is supported for the dbt Semantic Layer at this moment. + +1. Go to **Deploy** in the menu bar +2. Select **Jobs** to re-run the job with the most recent code in the deployment environment. +3. Your metric should appear as a red node in the dbt Cloud IDE and dbt directed acyclic graphs (DAG). + + -Interact and test your metric using the CLI before committing it to your MetricFlow repository. +## Set up dbt Semantic Layer -## Test and query your metrics +import SlSetUp from '/snippets/_new-sl-setup.md'; -Follow these steps to test and query your metrics using MetricFlow: + -1. If you haven't done so already, make sure you [install MetricFlow](#install-metricflow). +## Connect and query API -2. Run `mf --help` to confirm you have MetricFlow installed, and to see the available commands. If you don't have the CLI installed, run `pip install --upgrade "dbt-metricflow[your_adapter_name]"`. For example, if you have a Snowflake adapter, run `pip install --upgrade "dbt-metricflow[snowflake]"`. +You can query your metrics in a JDBC-enabled tool or use existing first-class integrations with the dbt Semantic Layer. -3. Save your files and run `mf validate-configs` to validate the changes before committing them +You must have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment, hosted in North America. (Additional region support coming soon) -4. Run `mf query --metrics --group-by ` to query the metrics and dimensions you want to see in the CLI. +- To learn how to use the JDBC or GraphQL API and what tools you can query it with, refer to the {frontMatter.meta.api_name}.
-5. Verify that the metric values are what you expect. You can view the generated SQL if you enter `--explain` in the CLI. + * To authenticate, you need to [generate a service token](/docs/dbt-cloud-apis/service-tokens) with Semantic Layer Only and Metadata Only permissions. + * Refer to the [SQL query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. -6. Then commit your changes to push them to your git repo. +- To learn more about the sophisticated integrations that connect to the dbt Semantic Layer, refer to [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) for more info. - +If you're encountering some issues when defining your metrics or setting up the dbt Semantic Layer, check out a list of answers to some of the questions or problems you may be experiencing. + +
+ How do I migrate from the legacy Semantic Layer to the new one? +
+
If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
+
+
+
+How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
+
Is the dbt Semantic Layer open source? The dbt Semantic Layer is proprietary; however, some of its components, such as dbt-core and MetricFlow, are open source.

dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE or the MetricFlow CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.
-## Related docs +## Next steps -- [The dbt Semantic Layer: what’s next](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/) blog post - [About MetricFlow](/docs/build/about-metricflow) -- [Semantic models](/docs/build/semantic-models) -- [Metrics](/docs/build/metrics-overview) +- [Build your metrics](/docs/build/build-metrics-intro) +- [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) diff --git a/website/docs/docs/build/tests.md b/website/docs/docs/build/tests.md index 1a40dd42b53..fa78d0df905 100644 --- a/website/docs/docs/build/tests.md +++ b/website/docs/docs/build/tests.md @@ -1,10 +1,12 @@ --- title: "Add tests to your DAG" -sidebar_title: "Tests" +sidebar_label: "Tests" description: "Read this tutorial to learn how to use tests when building in dbt." +search_weight: "heavy" id: "tests" +keywords: + - test, tests, testing, dag --- - ## Related reference docs * [Test command](/reference/commands/test) * [Test properties](/reference/resource-properties/tests) @@ -17,11 +19,7 @@ Tests are assertions you make about your models and other resources in your dbt You can use tests to improve the integrity of the SQL in each model by making assertions about the results generated. Out of the box, you can test whether a specified column in a model only contains non-null values, unique values, or values that have a corresponding value in another model (for example, a `customer_id` for an `order` corresponds to an `id` in the `customers` model), and values from a specified list. You can extend tests to suit business logic specific to your organization – any assertion that you can make about your model in the form of a select query can be turned into a test. - - -* `v0.20.0`: Both types of tests return a set of failing records. Previously, generic/schema tests returned a numeric value representing failures. Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. - - +Both types of tests return a set of failing records. Previously, generic/schema tests returned a numeric value representing failures. Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. Like almost everything in dbt, tests are SQL queries. In particular, they are `select` statements that seek to grab "failing" records, ones that disprove your assertion. If you assert that a column is unique in a model, the test query selects for duplicates; if you assert that a column is never null, the test seeks after nulls. If the test returns zero failing rows, it passes, and your assertion has been validated. @@ -243,12 +241,6 @@ where {{ column_name }} is null ## Storing test failures - - -* `v0.20.0`: Introduced storing test failures in the database - - - Normally, a test query will calculate failures as part of its execution. If you set the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures), dbt will first save the results of a test query to a table in the database, and then query that table to calculate the number of failures. 
This workflow allows you to query and examine failing records much more quickly in development: diff --git a/website/docs/docs/build/validation.md b/website/docs/docs/build/validation.md index 808d054f021..ad485850d23 100644 --- a/website/docs/docs/build/validation.md +++ b/website/docs/docs/build/validation.md @@ -18,7 +18,7 @@ The code that handles validation [can be found here](https://github.com/dbt-labs ## Validations command -You can run validations from the CLI with the following commands: +You can run validations from the CLI with the following [MetricFlow commands](/docs/build/metricflow-cli): ```bash mf validate-configs diff --git a/website/docs/docs/building-a-dbt-project/building-models/python-models.md b/website/docs/docs/building-a-dbt-project/building-models/python-models.md deleted file mode 100644 index 1aab8ac7a92..00000000000 --- a/website/docs/docs/building-a-dbt-project/building-models/python-models.md +++ /dev/null @@ -1,719 +0,0 @@ ---- -title: "Python models" ---- - -:::info Brand new! - -dbt Core v1.3 included first-ever support for Python models. Note that only [specific data platforms](#specific-data-platforms) support dbt-py models. - -We encourage you to: -- Read [the original discussion](https://github.com/dbt-labs/dbt-core/discussions/5261) that proposed this feature. -- Contribute to [best practices for developing Python models in dbt](https://discourse.getdbt.com/t/dbt-python-model-dbt-py-best-practices/5204 ). -- Weigh in on [next steps for Python models, beyond v1.3](https://github.com/dbt-labs/dbt-core/discussions/5742). -- Join the **#dbt-core-python-models** channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). - -Below, you'll see sections entitled "❓ **Our questions**." We are excited to have released a first narrow set of functionality in v1.3, which will solve real use cases. We also know this is a first step into a much wider field of possibility. We don't pretend to have all the answers. We're excited to keep developing our opinionated recommendations and next steps for product development—and we want your help. Comment in the GitHub discussions; leave thoughts in Slack; bring up dbt + Python in casual conversation with colleagues and friends. -::: - -## About Python models in dbt - -dbt Python ("dbt-py") models will help you solve use cases that can't be solved with SQL. You can perform analyses using tools available in the open source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. By defining your Python transformations in dbt, they're just models in your project, with all the same capabilities around testing, documentation, and lineage. - - - -Python models are supported in dbt Core 1.3 and above. Learn more about [upgrading your version in dbt Cloud](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions) and [upgrading dbt Core versions](https://docs.getdbt.com/docs/core-versions#upgrading-to-new-patch-versions). - -To read more about Python models, change the docs version to 1.3 or higher in the menu above. - - - - - - - - -```python -import ... - -def model(dbt, session): - - my_sql_model_df = dbt.ref("my_sql_model") - - final_df = ... # stuff you can't write in SQL! 
- - return final_df -``` - - - - - -```yml -version: 2 - -models: - - name: my_python_model - - # Document within the same codebase - description: My transformation written in Python - - # Configure in ways that feel intuitive and familiar - config: - materialized: table - tags: ['python'] - - # Test the results of my Python transformation - columns: - - name: id - # Standard validation for 'grain' of Python results - tests: - - unique - - not_null - tests: - # Write your own validation logic (in SQL) for Python results - - [custom_generic_test](writing-custom-generic-tests) -``` - - - - - - -The prerequisites for dbt Python models include using an adapter for a data platform that supports a fully featured Python runtime. In a dbt Python model, all Python code is executed remotely on the platform. None of it is run by dbt locally. We believe in clearly separating _model definition_ from _model execution_. In this and many other ways, you'll find that dbt's approach to Python models mirrors its longstanding approach to modeling data in SQL. - -We've written this guide assuming that you have some familiarity with dbt. If you've never before written a dbt model, we encourage you to start by first reading [dbt Models](/docs/build/models). Throughout, we'll be drawing connections between Python models and SQL models, as well as making clear their differences. - -### What is a Python model? - -A dbt Python model is a function that reads in dbt sources or other models, applies a series of transformations, and returns a transformed dataset. DataFrame operations define the starting points, the end state, and each step along the way. - -This is similar to the role of CTEs in dbt SQL models. We use CTEs to pull in upstream datasets, define (and name) a series of meaningful transformations, and end with a final `select` statement. You can run the compiled version of a dbt SQL model to see the data included in the resulting view or table. When you `dbt run`, dbt wraps that query in `create view`, `create table`, or more complex DDL to save its results in the database. - -Instead of a final `select` statement, each Python model returns a final DataFrame. Each DataFrame operation is "lazily evaluated." In development, you can preview its data, using methods like `.show()` or `.head()`. When you run a Python model, the full result of the final DataFrame will be saved as a table in your data warehouse. - -dbt Python models have access to almost all of the same configuration options as SQL models. You can test them, document them, add `tags` and `meta` properties to them, grant access to their results to other users, and so on. You can select them by their name, their file path, their configurations, whether they are upstream or downstream of another model, or whether they have been modified compared to a previous project state. - -### Defining a Python model - -Each Python model lives in a `.py` file in your `models/` folder. It defines a function named **`model()`**, which takes two parameters: -- **`dbt`**: A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG. -- **`session`**: A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames, and to write DataFrames back to tables. In PySpark, by convention, the `SparkSession` is named `spark`, and available globally. 
For consistency across platforms, we always pass it into the `model` function as an explicit argument called `session`. - -The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame. Via PySpark (Databricks + BigQuery), this can be a Spark, pandas, or pandas-on-Spark DataFrame. For more about choosing between pandas and native DataFrames, see [DataFrame API + syntax](#dataframe-api--syntax). - -When you `dbt run --select python_model`, dbt will prepare and pass in both arguments (`dbt` and `session`). All you have to do is define the function. This is how every single Python model should look: - - - -```python -def model(dbt, session): - - ... - - return final_df -``` - - - - -### Referencing other models - -Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. Use the `dbt.ref()` method within a Python model to read in data from other models (SQL or Python). If you want to read directly from a raw source table, use `dbt.source()`. These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. - - - -```python -def model(dbt, session): - - # DataFrame representing an upstream model - upstream_model = dbt.ref("upstream_model_name") - - # DataFrame representing an upstream source - upstream_source = dbt.source("upstream_source_name", "table_name") - - ... -``` - - - -Of course, you can `ref()` your Python model in downstream SQL models, too: - - - -```sql -with upstream_python_model as ( - - select * from {{ ref('my_python_model') }} - -), - -... -``` - - - -### Configuring Python models - -Just like SQL models, there are three ways to configure Python models: -1. In `dbt_project.yml`, where you can configure many models at once -2. In a dedicated `.yml` file, within the `models/` directory -3. Within the model's `.py` file, using the `dbt.config()` method - -Calling the `dbt.config()` method will set configurations for your model right within your `.py` file, similar to the `{{ config() }}` macro in `.sql` model files: - - - -```python -def model(dbt, session): - - # setting configuration - dbt.config(materialized="table") -``` - - - -There's a limit to how fancy you can get with the `dbt.config()` method. It accepts _only_ literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the [`config` property](resource-properties/config) in a YAML file. - -#### Accessing project context - -dbt Python models don't use Jinja to render compiled code. Python models have limited access to global project contexts compared to SQL models. That context is made available from the `dbt` class, passed in as an argument to the `model()` function. - -Out of the box, the `dbt` class supports: -- Returning DataFrames referencing the locations of other resources: `dbt.ref()` + `dbt.source()` -- Accessing the database location of the current model: `dbt.this()` (also: `dbt.this.database`, `.schema`, `.identifier`) -- Determining if the current model's run is incremental: `dbt.is_incremental` - -It is possible to extend this context by "getting" them via `dbt.config.get()` after they are configured in the [model's config](/reference/model-configs). This includes inputs such as `var`, `env_var`, and `target`. 
If you want to use those values to power conditional logic in your model, we require setting them through a dedicated `.yml` file config: - - - -```yml -version: 2 - -models: - - name: my_python_model - config: - materialized: table - target_name: "{{ target.name }}" - specific_var: "{{ var('SPECIFIC_VAR') }}" - specific_env_var: "{{ env_var('SPECIFIC_ENV_VAR') }}" -``` - - - -Then, within the model's Python code, use the `dbt.config.get()` function to _access_ values of configurations that have been set: - - - -```python -def model(dbt, session): - target_name = dbt.config.get("target_name") - specific_var = dbt.config.get("specific_var") - specific_env_var = dbt.config.get("specific_env_var") - - orders_df = dbt.ref("fct_orders") - - # limit data in dev - if target_name == "dev": - orders_df = orders_df.limit(500) -``` - - - -### Materializations - -Python models support two materializations: -- `table` -- `incremental` - -Incremental Python models support all the same [incremental strategies](/docs/build/incremental-models#about-incremental_strategy) as their SQL counterparts. The specific strategies supported depend on your adapter. - -Python models can't be materialized as `view` or `ephemeral`. Python isn't supported for non-model resource types (like tests and snapshots). - -For incremental models, like SQL models, you will need to filter incoming tables to only new rows of data: - - - -
- - - -```python -import snowflake.snowpark.functions as F - -def model(dbt, session): - dbt.config( - materialized = "incremental", - unique_key = "id", - ) - df = dbt.ref("upstream_table") - - if dbt.is_incremental: - - # only new rows compared to max in current table - max_from_this = f"select max(updated_at) from {dbt.this}" - df = df.filter(df.updated_at > session.sql(max_from_this).collect()[0][0]) - - # or only rows from the past 3 days - df = df.filter(df.updated_at >= F.dateadd("day", F.lit(-3), F.current_timestamp())) - - ... - - return df -``` - - - -
- -
- - - -```python -import pyspark.sql.functions as F - -def model(dbt, session): - dbt.config( - materialized = "incremental", - unique_key = "id", - ) - df = dbt.ref("upstream_table") - - if dbt.is_incremental: - - # only new rows compared to max in current table - max_from_this = f"select max(updated_at) from {dbt.this}" - df = df.filter(df.updated_at > session.sql(max_from_this).collect()[0][0]) - - # or only rows from the past 3 days - df = df.filter(df.updated_at >= F.date_add(F.current_timestamp(), F.lit(-3))) - - ... - - return df -``` - - - -
- -
- -**Note:** Incremental models are supported on BigQuery/Dataproc for the `merge` incremental strategy. The `insert_overwrite` strategy is not yet supported. - -## Python-specific functionality - -### Defining functions - -In addition to defining a `model` function, the Python model can import other functions or define its own. Here's an example, on Snowpark, defining a custom `add_one` function: - - - -```python -def add_one(x): - return x + 1 - -def model(dbt, session): - dbt.config(materialized="table") - temps_df = dbt.ref("temperatures") - - # warm things up just a little - df = temps_df.withColumn("degree_plus_one", add_one(temps_df["degree"])) - return df -``` - - - -At present, Python functions defined in one dbt model can't be imported and reused in other models. See the ["Code reuse"](#code-reuse) section for the potential patterns we're considering. - -### Using PyPI packages - -You can also define functions that depend on third-party packages, so long as those packages are installed and available to the Python runtime on your data platform. See notes on "Installing Packages" for [specific data warehouses](#specific-data-warehouses). - -In this example, we use the `holidays` package to determine if a given date is a holiday in France. For simplicity and consistency across platforms, the code below uses the pandas API. The exact syntax, and the need to refactor for multi-node processing, still varies. - - - -
- - - -```python -import holidays - -def is_holiday(date_col): - # Chez Jaffle - french_holidays = holidays.France() - is_holiday = (date_col in french_holidays) - return is_holiday - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["holidays"] - ) - - orders_df = dbt.ref("stg_orders") - - df = orders_df.to_pandas() - - # apply our function - # (columns need to be in uppercase on Snowpark) - df["IS_HOLIDAY"] = df["ORDER_DATE"].apply(is_holiday) - - # return final dataset (Pandas DataFrame) - return df -``` - - - -
- -
- - - -```python -import holidays - -def is_holiday(date_col): - # Chez Jaffle - french_holidays = holidays.France() - is_holiday = (date_col in french_holidays) - return is_holiday - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["holidays"] - ) - - orders_df = dbt.ref("stg_orders") - - df = orders_df.to_pandas_on_spark() # Spark 3.2+ - # df = orders_df.toPandas() in earlier versions - - # apply our function - df["is_holiday"] = df["order_date"].apply(is_holiday) - - # convert back to PySpark - df = df.to_spark() # Spark 3.2+ - # df = session.createDataFrame(df) in earlier versions - - # return final dataset (PySpark DataFrame) - return df -``` - - - -
- -
- -#### Configuring packages - -We encourage you to explicitly configure required packages and versions so dbt can track them in project metadata. This configuration is required for the implementation on some platforms. If you need specific versions of packages, specify them. - - - -```python -def model(dbt, session): - dbt.config( - packages = ["numpy==1.23.1", "scikit-learn"] - ) -``` - - - - - -```yml -version: 2 - -models: - - name: my_python_model - config: - packages: - - "numpy==1.23.1" - - scikit-learn -``` - - - -#### UDFs - -You can use the `@udf` decorator or `udf` function to define an "anonymous" function and call it within your `model` function's DataFrame transformation. This is a typical pattern for applying more complex functions as DataFrame operations, especially if those functions require inputs from third-party packages. -- [Snowpark Python: Creating UDFs](https://docs.snowflake.com/en/developer-guide/snowpark/python/creating-udfs.html) -- [PySpark functions: udf](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html) - - - -
- - - -```python -import snowflake.snowpark.types as T -import snowflake.snowpark.functions as F -import numpy - -def register_udf_add_random(): - add_random = F.udf( - # use 'lambda' syntax, for simple functional behavior - lambda x: x + numpy.random.normal(), - return_type=T.FloatType(), - input_types=[T.FloatType()] - ) - return add_random - -def model(dbt, session): - - dbt.config( - materialized = "table", - packages = ["numpy"] - ) - - temps_df = dbt.ref("temperatures") - - add_random = register_udf_add_random() - - # warm things up, who knows by how much - df = temps_df.withColumn("degree_plus_random", add_random("degree")) - return df -``` - - - -**Note:** Due to a Snowpark limitation, it is not currently possible to register complex named UDFs within stored procedures, and therefore dbt Python models. We are looking to add native support for Python UDFs as a project/DAG resource type in a future release. For the time being, if you want to create a "vectorized" Python UDF via the Batch API, we recommend either: -- Writing [`create function`](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html) inside a SQL macro, to run as a hook or run-operation -- [Registering from a staged file](https://docs.snowflake.com/ko/developer-guide/snowpark/reference/python/_autosummary/snowflake.snowpark.udf.html#snowflake.snowpark.udf.UDFRegistration.register_from_file) within your Python model code - -
- -
- - - -```python -from pyspark.sql.types as T -import pyspark.sql.functions as F -import numpy - -# use a 'decorator' for more readable code -@F.udf(returnType=T.DoubleType()) -def add_random(x): - random_number = numpy.random.normal() - return x + random_number - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["numpy"] - ) - - temps_df = dbt.ref("temperatures") - - # warm things up, who knows by how much - df = temps_df.withColumn("degree_plus_random", add_random("degree")) - return df -``` - - - -
- -
- -#### Code reuse - -Currently, you cannot import or reuse Python functions defined in one dbt model, in other models. This is something we'd like dbt to support. There are two patterns we're considering: -1. Creating and registering **"named" UDFs**. This process is different across data platforms and has some performance limitations. (Snowpark does support ["vectorized" UDFs](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html): pandas-like functions that you can execute in parallel.) -2. Using **private Python packages**. In addition to importing reusable functions from public PyPI packages, many data platforms support uploading custom Python assets and registering them as packages. The upload process looks different across platforms, but your code’s actual `import` looks the same. - -:::note ❓ Our questions - -- Should dbt have a role in abstracting over UDFs? Should dbt support a new type of DAG node, `function`? Would the primary use case be code reuse across Python models or defining Python-language functions that can be called from SQL models? -- How can dbt help users when uploading or initializing private Python assets? Is this a new form of `dbt deps`? -- How can dbt support users who want to test custom functions? If defined as UDFs: "unit testing" in the database? If "pure" functions in packages: encourage adoption of `pytest`? - -💬 Discussion: ["Python models: package, artifact/object storage, and UDF management in dbt"](https://github.com/dbt-labs/dbt-core/discussions/5741) -::: - -### DataFrame API and syntax - -Over the past decade, most people writing data transformations in Python have adopted DataFrame as their common abstraction. dbt follows this convention by returning `ref()` and `source()` as DataFrames, and it expects all Python models to return a DataFrame. - -A DataFrame is a two-dimensional data structure (rows and columns). It supports convenient methods for transforming that data, creating new columns from calculations performed on existing columns. It also offers convenient ways for previewing data while developing locally or in a notebook. - -That's about where the agreement ends. There are numerous frameworks with their own syntaxes and APIs for DataFrames. The [pandas](https://pandas.pydata.org/docs/) library offered one of the original DataFrame APIs, and its syntax is the most common to learn for new data professionals. Most newer DataFrame APIs are compatible with pandas-style syntax, though few can offer perfect interoperability. This is true for Snowpark and PySpark, which have their own DataFrame APIs. - -When developing a Python model, you will find yourself asking these questions: - -**Why pandas?** It's the most common API for DataFrames. It makes it easy to explore sampled data and develop transformations locally. You can “promote” your code as-is into dbt models and run it in production for small datasets. - -**Why _not_ pandas?** Performance. pandas runs "single-node" transformations, which cannot benefit from the parallelism and distributed computing offered by modern data warehouses. This quickly becomes a problem as you operate on larger datasets. Some data platforms support optimizations for code written using pandas' DataFrame API, preventing the need for major refactors. For example, ["pandas on PySpark"](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html) offers support for 95% of pandas functionality, using the same API while still leveraging parallel processing. 
- -:::note ❓ Our questions -- When developing a new dbt Python model, should we recommend pandas-style syntax for rapid iteration and then refactor? -- Which open source libraries provide compelling abstractions across different data engines and vendor-specific APIs? -- Should dbt attempt to play a longer-term role in standardizing across them? - -💬 Discussion: ["Python models: the pandas problem (and a possible solution)"](https://github.com/dbt-labs/dbt-core/discussions/5738) -::: - -### Limitations - -Python models have capabilities that SQL models do not. They also have some drawbacks compared to SQL models: - -- **Time and cost.** Python models are slower to run than SQL models, and the cloud resources that run them can be more expensive. Running Python requires more general-purpose compute. That compute might sometimes live on a separate service or architecture from your SQL models. **However:** We believe that deploying Python models via dbt—with unified lineage, testing, and documentation—is, from a human standpoint, **dramatically** faster and cheaper. By comparison, spinning up separate infrastructure to orchestrate Python transformations in production and different tooling to integrate with dbt is much more time-consuming and expensive. -- **Syntax differences** are even more pronounced. Over the years, dbt has done a lot, via dispatch patterns and packages such as `dbt_utils`, to abstract over differences in SQL dialects across popular data warehouses. Python offers a **much** wider field of play. If there are five ways to do something in SQL, there are 500 ways to write it in Python, all with varying performance and adherence to standards. Those options can be overwhelming. As the maintainers of dbt, we will be learning from state-of-the-art projects tackling this problem and sharing guidance as we develop it. -- **These capabilities are very new.** As data warehouses develop new features, we expect them to offer cheaper, faster, and more intuitive mechanisms for deploying Python transformations. **We reserve the right to change the underlying implementation for executing Python models in future releases.** Our commitment to you is around the code in your model `.py` files, following the documented capabilities and guidance we're providing here. - -As a general rule, if there's a transformation you could write equally well in SQL or Python, we believe that well-written SQL is preferable: it's more accessible to a greater number of colleagues, and it's easier to write code that's performant at scale. If there's a transformation you _can't_ write in SQL, or where ten lines of elegant and well-annotated Python could save you 1000 lines of hard-to-read Jinja-SQL, Python is the way to go. - -## Specific data platforms - -In their initial launch, Python models are supported on three of the most popular data platforms: Snowflake, Databricks, and BigQuery/GCP (via Dataproc). Both Databricks and GCP's Dataproc use PySpark as the processing framework. Snowflake uses its own framework, Snowpark, which has many similarities to PySpark. - - - -
- -**Additional setup:** You will need to [acknowledge and accept Snowflake Third Party Terms](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#getting-started) to use Anaconda packages. - -**Installing packages:** Snowpark supports several popular packages via Anaconda. The complete list is at https://repo.anaconda.com/pkgs/snowflake/. Packages are installed at the time your model is being run. Different models can have different package dependencies. If you are using third-party packages, Snowflake recommends using a dedicated virtual warehouse for best performance rather than one with many concurrent users. - -**About "sprocs":** dbt submits Python models to run as "stored procedures," which some people call "sprocs" for short. By default, dbt will create a named sproc containing your model's compiled Python code, and then "call" it to execute. Snowpark has a Private Preview feature for "temporary" or "anonymous" stored procedures ([docs](https://docs.snowflake.com/en/LIMITEDACCESS/call-with.html)), which are faster and leave a cleaner query history. If this feature is enabled for your account, you can switch it on for your models by configuring `use_anonymous_sproc: True`. We plan to switch this on for all dbt + Snowpark Python models in a future release. - - - -```yml -# I asked Snowflake Support to enable this Private Preview feature, -# and now my dbt-py models run even faster! -models: - use_anonymous_sproc: True -``` - - - -**Docs:** ["Developer Guide: Snowpark Python"](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html) - -
- -
- -**Submission methods:** Databricks supports a few different mechanisms to submit PySpark code, each with relative advantages. Some are better for supporting iterative development, while others are better for supporting lower-cost production deployments. The options are: -- `all_purpose_cluster` (default): dbt will run your Python model using the cluster ID configured as `cluster` in your connection profile or for this specific model. These clusters are more expensive but also much more responsive. We recommend using an interactive all-purpose cluster for quicker iteration in development. - - `create_notebook: True`: dbt will upload your model's compiled PySpark code to a notebook in the namespace `/Shared/dbt_python_model/{schema}`, where `{schema}` is the configured schema for the model, and execute that notebook to run using the all-purpose cluster. The appeal of this approach is that you can easily open the notebook in the Databricks UI for debugging or fine-tuning right after running your model. Remember to copy any changes into your dbt `.py` model code before re-running. - - `create_notebook: False` (default): dbt will use the [Command API](https://docs.databricks.com/dev-tools/api/1.2/index.html#run-a-command), which is slightly faster. -- `job_cluster`: dbt will upload your model's compiled PySpark code to a notebook in the namespace `/Shared/dbt_python_model/{schema}`, where `{schema}` is the configured schema for the model, and execute that notebook to run using a short-lived jobs cluster. For each Python model, Databricks will need to spin up the cluster, execute the model's PySpark transformation, and then spin down the cluster. As such, job clusters take longer before and after model execution, but they're also less expensive, so we recommend these for longer-running Python models in production. To use the `job_cluster` submission method, your model must be configured with `job_cluster_config`, which defines key-value properties for `new_cluster`, as defined in the [JobRunsSubmit API](https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit). - -You can configure each model's `submission_method` in all the standard ways you supply configuration: - -```python -def model(dbt, session): - dbt.config( - submission_method="all_purpose_cluster", - create_notebook=True, - cluster_id="abcd-1234-wxyz" - ) - ... -``` -```yml -version: 2 -models: - - name: my_python_model - config: - submission_method: job_cluster - job_cluster_config: - spark_version: ... - node_type_id: ... -``` -```yml -# dbt_project.yml -models: - project_name: - subfolder: - # set defaults for all .py models defined in this subfolder - +submission_method: all_purpose_cluster - +create_notebook: False - +cluster_id: abcd-1234-wxyz -``` - -If not configured, `dbt-spark` will use the built-in defaults: the all-purpose cluster (based on `cluster` in your connection profile) without creating a notebook. The `dbt-databricks` adapter will default to the cluster configured in `http_path`. We encourage explicitly configuring the clusters for Python models in Databricks projects. - -**Installing packages:** When using all-purpose clusters, we recommend installing packages which you will be using to run your Python models. 
- -**Docs:** -- [PySpark DataFrame syntax](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) -- [Databricks: Introduction to DataFrames - Python](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) - -
- -
- -The `dbt-bigquery` adapter uses a service called Dataproc to submit your Python models as PySpark jobs. That Python/PySpark code will read from your tables and views in BigQuery, perform all computation in Dataproc, and write the final result back to BigQuery. - -**Submission methods.** Dataproc supports two submission methods: `serverless` and `cluster`. Dataproc Serverless does not require a ready cluster, which saves on hassle and cost—but it is slower to start up, and much more limited in terms of available configuration. For example, Dataproc Serverless supports only a small set of Python packages, though it does include `pandas`, `numpy`, and `scikit-learn`. (See the full list [here](https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build), under "The following packages are installed in the default image"). Whereas, by creating a Dataproc Cluster in advance, you can fine-tune the cluster's configuration, install any PyPI packages you want, and benefit from faster, more responsive runtimes. - -Use the `cluster` submission method with dedicated Dataproc clusters you or your organization manage. Use the `serverless` submission method to avoid managing a Spark cluster. The latter may be quicker for getting started, but both are valid for production. - -**Additional setup:** -- Create or use an existing [Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) -- Enable Dataproc APIs for your project + region -- If using the `cluster` submission method: Create or use an existing [Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) with the [Spark BigQuery connector initialization action](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/connectors#bigquery-connectors). (Google recommends copying the action into your own Cloud Storage bucket, rather than using the example version shown in the screenshot below.) - - - -The following configurations are needed to run Python models on Dataproc. You can add these to your [BigQuery profile](/reference/warehouse-setups/bigquery-setup#running-python-models-on-dataproc), or configure them on specific Python models: -- `gcs_bucket`: Storage bucket to which dbt will upload your model's compiled PySpark code. -- `dataproc_region`: GCP region in which you have enabled Dataproc (for example `us-central1`) -- `dataproc_cluster_name`: Name of Dataproc cluster to use for running Python model (executing PySpark job). Only required if `submission_method: cluster`. - -```python -def model(dbt, session): - dbt.config( - submission_method="cluster", - dataproc_cluster_name="my-favorite-cluster" - ) - ... -``` -```yml -version: 2 -models: - - name: my_python_model - config: - submission_method: serverless -``` - -Any user or service account that runs dbt Python models will need the following permissions, in addition to permissions needed for BigQuery ([docs](https://cloud.google.com/dataproc/docs/concepts/iam/iam)): -``` -dataproc.clusters.use -dataproc.jobs.create -dataproc.jobs.get -dataproc.operations.get -storage.buckets.get -storage.objects.create -storage.objects.delete -``` - -**Installing packages:** If you are using a Dataproc Cluster (as opposed to Dataproc Serverless), you can add third-party packages while creating the cluster. 
- -Google recommends installing Python packages on Dataproc clusters via initialization actions: -- [How initialization actions are used](https://github.com/GoogleCloudDataproc/initialization-actions/blob/master/README.md#how-initialization-actions-are-used) -- [Actions for installing via `pip` or `conda`](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/python) - -You can also install packages at cluster creation time by [defining cluster properties](https://cloud.google.com/dataproc/docs/tutorials/python-configuration#image_version_20): `dataproc:pip.packages` or `dataproc:conda.packages`. - - - -**Docs:** -- [Dataproc overview](https://cloud.google.com/dataproc/docs/concepts/overview) -- [PySpark DataFrame syntax](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) - -
- -
- -
diff --git a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md index f301dfce34b..f1d8b32cdb1 100644 --- a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md +++ b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md @@ -63,7 +63,7 @@ link="/docs/cloud/dbt-cloud-ide/develop-in-the-cloud" + +
*These features are available on [selected plans](https://www.getdbt.com/pricing/). diff --git a/website/docs/docs/cloud/about-cloud/browsers.md b/website/docs/docs/cloud/about-cloud/browsers.md index 4a04f70171b..2fc5a8b4b4d 100644 --- a/website/docs/docs/cloud/about-cloud/browsers.md +++ b/website/docs/docs/cloud/about-cloud/browsers.md @@ -22,3 +22,8 @@ You may still be able to access and use dbt Cloud even without using the latest To improve your experience using dbt Cloud, we suggest that you turn off ad blockers. ::: +### Browser sessions + +A session is a period of time during which you’re signed in to a dbt Cloud account from a browser. If you close your browser, it will end your session and log you out. You'll need to log in again the next time you try to access dbt Cloud. + +If you've logged in using [SSO](/docs/cloud/manage-access/sso-overview) or [OAuth](/docs/cloud/git/connect-github#personally-authenticate-with-github), you can customize your maximum session duration, which might vary depending on your identity provider (IdP). diff --git a/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md b/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md index bc8c180f2fd..caeb0203a5e 100644 --- a/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md +++ b/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md @@ -11,10 +11,17 @@ dbt Cloud is [hosted](/docs/cloud/about-cloud/architecture) in multiple regions | Region | Location | Access URL | IP addresses | Developer plan | Team plan | Enterprise plan | |--------|----------|------------|--------------|----------------|-----------|-----------------| -| North America [^1] | AWS us-east-1 (N. Virginia) | cloud.getdbt.com | 52.45.144.63
54.81.134.249
52.22.161.231 | ✅ | ✅ | ✅ | +| North America multi-tenant [^1] | AWS us-east-1 (N. Virginia) | cloud.getdbt.com | 52.45.144.63
54.81.134.249
52.22.161.231 | ✅ | ✅ | ✅ | +| North America Cell 1 [^1] | AWS us-east-1 (N. Virginia) | {account prefix}.us1.dbt.com | [Located in Account Settings](#locating-your-dbt-cloud-ip-addresses) | ❌ | ❌ | ❌ | | EMEA [^1] | AWS eu-central-1 (Frankfurt) | emea.dbt.com | 3.123.45.39
3.126.140.248
3.72.153.148 | ❌ | ❌ | ✅ | | APAC [^1] | AWS ap-southeast-2 (Sydney)| au.dbt.com | 52.65.89.235
3.106.40.33
13.239.155.206
| ❌ | ❌ | ✅ | | Virtual Private dbt or Single tenant | Customized | Customized | Ask [Support](/community/resources/getting-help#dbt-cloud-support) for your IPs | ❌ | ❌ | ✅ | [^1]: These regions support [multi-tenant](/docs/cloud/about-cloud/tenancy) deployment environments hosted by dbt Labs. + +### Locating your dbt Cloud IP addresses + +There are two ways to view your dbt Cloud IP addresses: +- If no projects exist in the account, create a new project, and the IP addresses will be displayed during the **Configure your environment** steps. +- If you have an existing project, navigate to **Account Settings** and ensure you are in the **Projects** pane. Click on a project name, and the **Project Settings** window will open. Locate the **Connection** field and click on the name. Scroll down to the **Settings**, and the first text block lists your IP addresses. diff --git a/website/docs/docs/cloud/billing.md b/website/docs/docs/cloud/billing.md new file mode 100644 index 00000000000..61251f6e41d --- /dev/null +++ b/website/docs/docs/cloud/billing.md @@ -0,0 +1,193 @@ +--- +title: "Billing" +id: billing +description: "dbt Cloud billing information." +sidebar_label: Billing +--- + +dbt Cloud offers a variety of [plans and pricing](https://www.getdbt.com/pricing/) to fit your organization’s needs. With flexible billing options that appeal to large enterprises and small businesses and [server availability](/docs/cloud/about-cloud/regions-ip-addresses) worldwide, dbt Cloud is the fastest and easiest way to begin transforming your data. + +## How does dbt Cloud pricing work? + +As a customer, you pay for the number of seats you have and the amount of usage consumed each month. Usage is based on the number of Successful Models Built, and seats are billed primarily on the amount of Developer licenses purchased. All billing computations are conducted in Coordinated Universal Time (UTC). + +### What counts as a Successful Model Built? + +dbt Cloud considers a Successful Model Built as any model that is successfully built via a run through dbt Cloud’s orchestration functionality in a dbt Cloud deployment environment. Models are counted when built and run. This includes any jobs run via dbt Cloud's scheduler, CI builds (jobs triggered by pull requests), runs kicked off via the dbt Cloud API, and any successor dbt Cloud tools with similar functionality. This also includes models that are successfully built even when a run may fail to complete. For example, you may have a job that contains 100 models and on one of its runs, 51 models are successfully built and then the job fails. In this situation, only 51 models would be counted. + +Any models built in a dbt Cloud development environment (for example, via the IDE) do not count towards your usage. Tests, seeds, ephemeral models, and snapshots also do not count. + + +### What counts as a seat license? + +There are three types of possible seat licenses: + +* **Developer** — for roles and permissions that require interaction with the dbt Cloud environment day-to-day. +* **Read-Only** — for access to view certain documents and reports. +* **IT** — for access to specific features related to account management (for example, configuring git integration). 
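Tying the model-counting rules above together: only successfully built models count, while tests, seeds, snapshots, ephemeral models, and failed builds do not. The sketch below is purely illustrative — dbt Cloud's internal billing logic is not public, and the field names are assumptions — but it shows how a run's results would be tallied under those rules.

```python
# Illustrative only: tally billable "Successful Models Built" from a list of
# hypothetical run results. The field names are assumptions for this sketch,
# not dbt Cloud's internal schema.
def count_successful_models_built(results):
    return sum(
        1
        for r in results
        if r["resource_type"] == "model"          # tests, seeds, snapshots don't count
        and r.get("materialized") != "ephemeral"  # ephemeral models don't count
        and r["status"] == "success"              # only successful builds count
    )

# A job whose run fails partway through: 2 of 3 models still count.
run_results = [
    {"resource_type": "model", "materialized": "table", "status": "success"},
    {"resource_type": "model", "materialized": "view", "status": "success"},
    {"resource_type": "model", "materialized": "table", "status": "error"},
    {"resource_type": "test", "materialized": None, "status": "success"},
]
print(count_successful_models_built(run_results))  # prints 2
```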
+ +### Viewing usage in the product + +Viewing usage in the product is restricted to specific roles: + +* Team plan — Owner group +* Enterprise plan — Account and billing admin roles + +For an account-level view of usage, if you have access to the **Billing** and **Usage** pages, you can see an estimate of the usage for the month. In the Billing page of the **Account Settings**, you can see how your account tracks against its usage. You can also see which projects are building the most models. + +As a Team and Developer plan user, you can see how the account is tracking against the included models built. As an Enterprise plan user, you can see how much you have drawn down from your annual commit and how much remains. + +On each Project Home page, any user with access to that project can see how many models are built each month. From there, additional details on top jobs by models built can be found on each Environment page. + +In addition, you can look at the Job Details page's Insights tab to show how many models are being built per month for that particular job and which models are taking the longest to build. + +Any usage data is only an estimate of your usage, and there may be a delay in showing usage data in the product — your final usage for the month will be visible on your monthly statements (statements applicable to Team and Enterprise plans). + + +## Plans and Billing + +dbt Cloud offers several [plans](https://www.getdbt.com/pricing) with different features that meet your needs. We may make changes to our plan details from time to time. We'll always let you know in advance, so you can be prepared. The following section explains how billing works in each plan. + +### Developer plan billing + +Developer plans are free and include one Developer license and 3,000 models each month. Models are refreshed at the beginning of each calendar month. If you exceed 3,000 models, any subsequent runs will be canceled until models are refreshed or until you upgrade to a paid plan. The rest of the dbt Cloud platform is still accessible, and no work will be lost. + +All included successful models built numbers above reflect our most current pricing and packaging. Based on your usage terms when you signed up for the Developer Plan, the included model entitlements may be different from what’s reflected above. + + +### Team plan billing + +Team customers pay monthly via credit card for seats and usage, and accounts include 15,000 models monthly. Seats are charged upfront at the beginning of the month. If you add seats during the month, seats will be prorated and charged on the same day. Seats removed during the month will be reflected on the next invoice and are not eligible for refunds. You can change the credit card information and the number of seats from the billings section anytime. Accounts will receive one monthly invoice that includes the upfront charge for the seats and the usage charged in arrears from the previous month. + +Usage is calculated and charged in arrears for the previous month. If you exceed 15,000 models in any month, you will be billed for additional usage on your next invoice. Additional use is billed at the rates on our [pricing page](https://www.getdbt.com/pricing). + + +Included models that are not consumed do not roll over to future months. You can estimate your bill with a simple formula: + +`($100 x number of developer seats) + ((models built - 15,000) x $0.01)` + +All included successful models built numbers above reflect our most current pricing and packaging. 
Based on your usage terms when you signed up for the Team Plan, the included model entitlements may be different from what’s reflected above. + +:::note Legacy pricing plans + +Customers who purchased the dbt Cloud Team plan before August 11, 2023, remain on a legacy pricing plan as long as their account is in good standing. The legacy pricing plan is based on seats and includes unlimited models subject to reasonable use. dbt Labs may institute use limits if reasonable use is exceeded. Additional features, upgrades, or updates may be subject to separate charges. Any changes to your current plan pricing will be communicated in advance according to our Terms of Use. + +::: + +### Enterprise plan billing + +As an Enterprise customer, you pay annually via invoice, monthly in arrears for additional usage (if applicable), and may benefit from negotiated usage rates. Please refer to your order form or contract for your specific pricing details, or [contact the account team](https://www.getdbt.com/contact-demo) with any questions. + +## Managing usage + +From anywhere in the dbt Cloud account, click the **gear icon** and click **Account settings**. The **Billing** option will be on the left side menu under the **Account Settings** heading. Here, you can view individual available plans and the features provided for each. + +### Usage notifications + +Every plan automatically sends email alerts when 75%, 90%, and 100% of usage estimates have been reached. In the Team plan, all users within the Owner group will receive alerts. In Enterprise plans, all users with the Account Admin and Billing Admin permission sets will receive alerts. Users cannot opt out of these emails. If you would like additional users to receive these alert emails, please provide them with the applicable permissions mentioned above. Note that your usage may already be higher than the percentage indicated in the alert due to your usage pattern and minor latency times. + +### How do I stop usage from accruing? + +There are 2 options to disable models from being built and charged: + +1. Open the **Job Settings** of every job and navigate to the **Triggers** section. Disable the **Run on Schedule** and set the **Continuous Integration** feature **Run on Pull Requests?** to **No**. Check your workflows to ensure that you are not triggering any runs via the dbt Cloud API. This option will enable you to keep your dbt Cloud jobs without building more models. +2. Alternatively, you can delete some or all of your dbt Cloud jobs. This will ensure that no runs are kicked off, but you will permanently lose your job(s). + + +## Optimize costs in dbt Cloud + +dbt Cloud offers ways to optimize your models built usage and warehouse costs. + +### Best practices for optimizing successful models built + +When thinking of ways to optimize your costs from successful models built, there are methods to reduce those costs while still adhering to best practices. To ensure that you are still utilizing tests and rebuilding views when logic is changed, it's recommended to implement a combination of the best practices that fit your needs. More specifically, if you decide to exclude views from your regularly scheduled dbt Cloud job runs, it's imperative that you set up a merge job (see [Build only changed views](#build-only-changed-views) below) to deploy updated view logic when changes are detected. + +#### Exclude views in a dbt Cloud job + +Many dbt Cloud users utilize views, which don’t always need to be rebuilt every time you run a job. 
For any jobs that contain views that _do not_ include macros that dynamically generate code (for example, case statements) based on upstream tables and also _do not_ have tests, you can implement these steps: + +1. Go to your current production deployment job in dbt Cloud. +2. Modify your command to include: `--exclude config.materialized:view`. +3. Save your job changes. + +If you have views that contain macros with case statements based on upstream tables, these will need to be run each time to account for new values. If you still need to test your views with each run, follow the [Exclude views while still running tests](#exclude-views-while-running-tests) best practice to create a custom selector. + +#### Exclude views while running tests + +Running tests for views in every job run can help keep data quality intact and save you from the need to rerun failed jobs. To exclude views from your job run while running tests, you can follow these steps to create a custom [selector](https://docs.getdbt.com/reference/node-selection/yaml-selectors) for your job command. + +1. Open your dbt project in the dbt Cloud IDE. +2. Add a file called `selectors.yml` in your top-level project folder. +3. In the file, add the following code: + + ```yaml + selectors: + - name: skip_views_but_test_views + description: > + A default selector that will exclude materializing views + without skipping tests on views. + default: true + definition: + union: + - union: + - method: path + value: "*" + - exclude: + - method: config.materialized + value: view + - method: resource_type + value: test + + ``` + +4. Save the file and commit it to your project. +5. Modify your dbt Cloud jobs to include `--selector skip_views_but_test_views`. + +#### Build only changed views + +If you want to ensure that you're building views whenever the logic is changed, create a merge job that gets triggered when code is merged into main: + +1. Ensure you have a [CI job setup](/docs/deploy/ci-jobs) in your environment. +2. Create a new [deploy job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) and call it “Merge Job”. +3. Set the **Environment** to your CI environment. Refer to [Types of environments](/docs/deploy/deploy-environments#types-of-environments) for more details. +4. Set **Commands** to: `dbt run -s state:modified+`. + Executing `dbt build` in this context is unnecessary because the CI job was used to both run and test the code that just got merged into main. +5. Under the **Execution Settings**, select the default production job to compare changes against: + - **Defer to a previous run state** — Select the “Merge Job” you created so the job compares and identifies what has changed since the last merge. +6. In your dbt project, follow the steps in [Run a dbt Cloud job on merge](/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge) to create a script to trigger the dbt Cloud API to run your job after a merge happens within your git repository or watch this [video](https://www.loom.com/share/e7035c61dbed47d2b9b36b5effd5ee78?sid=bcf4dd2e-b249-4e5d-b173-8ca204d9becb). + +The purpose of the merge job is to: + +- Immediately deploy any changes from PRs to production. +- Ensure your production views remain up-to-date with how they’re defined in your codebase while remaining cost-efficient when running jobs in production. + +The merge action will optimize your cloud data platform spend and shorten job times, but you’ll need to decide if making the change is right for your dbt project. 
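Step 6 above points to a guide for scripting the post-merge trigger. As a minimal sketch of what such a script might look like — assuming the dbt Cloud Administrative API v2 "trigger job run" endpoint, with placeholder account and job IDs — the request boils down to a single authenticated POST. Verify the exact URL, region host, and payload against the dbt Cloud API reference before relying on it.

```python
# Minimal sketch of a post-merge trigger for the "Merge Job" above.
# ACCOUNT_ID and MERGE_JOB_ID are hypothetical placeholders; confirm the
# endpoint and response shape against the dbt Cloud API reference.
import os
import requests

ACCOUNT_ID = 12345       # your dbt Cloud account ID (placeholder)
MERGE_JOB_ID = 67890     # the ID of the "Merge Job" created above (placeholder)
API_TOKEN = os.environ["DBT_CLOUD_API_TOKEN"]

response = requests.post(
    f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}/jobs/{MERGE_JOB_ID}/run/",
    headers={"Authorization": f"Token {API_TOKEN}"},
    json={"cause": "Triggered by merge to main"},
)
response.raise_for_status()
print("Queued run:", response.json()["data"]["id"])
```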
+ +### Rework inefficient models + +#### Job Insights tab + +To reduce your warehouse spend, you can identify what models, on average, are taking the longest to build in the **Job** page under the **Insights** tab. This chart looks at the average run time for each model based on its last 20 runs. Any models that are taking longer than anticipated to build might be prime candidates for optimization, which will ultimately reduce cloud warehouse spending. + +#### Model Timing tab + +To better understand how long each model takes to run within the context of a specific run, you can look at the **Model Timing** tab. Select the run of interest on the **Run History** page to find the tab. On that **Run** page, click **Model Timing**. + +Once you've identified which models could be optimized, check out these other resources that walk through how to optimize your work: +* [Build scalable and trustworthy data pipelines with dbt and BigQuery](https://services.google.com/fh/files/misc/dbt_bigquery_whitepaper.pdf) +* [Best Practices for Optimizing Your dbt and Snowflake Deployment](https://www.snowflake.com/wp-content/uploads/2021/10/Best-Practices-for-Optimizing-Your-dbt-and-Snowflake-Deployment.pdf) +* [How to optimize and troubleshoot dbt models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks) + +## FAQs + +* What happens if I need more than 8 seats on the Team plan? +_If you need more than 8 developer seats, select the Contact Sales option from the billing settings to talk to our sales team about an Enterprise plan._ + +* What if I go significantly over my included free models on the Team or Developer plan? +_Consider upgrading to a Team or Enterprise plan. Team plans include more models and allow you to exceed the monthly usage limit. Enterprise accounts are supported by a dedicated account management team and offer annual plans, custom configurations, and negotiated usage rates._ + +* I want to upgrade my plan. Will all of my work carry over? +_Yes. Your dbt Cloud account will be upgraded without impacting your existing projects and account settings._ + +* How do I determine the right plan for me? + _The best option is to consult with our sales team. They'll help you figure out what is right for your needs. We also offer a free two-week trial on the Team plan._ diff --git a/website/docs/docs/cloud/cloud-cli-installation.md b/website/docs/docs/cloud/cloud-cli-installation.md new file mode 100644 index 00000000000..68a8ef365d6 --- /dev/null +++ b/website/docs/docs/cloud/cloud-cli-installation.md @@ -0,0 +1,110 @@ +--- +title: Installing the dbt Cloud CLI (Alpha) +id: cloud-cli-installation +description: "Instructions for installing and configuring dbt Cloud CLI" +--- + +:::warning Alpha functionality + +The following installation instructions are for the dbt Cloud CLI, currently in Alpha (actively in development and being tested). + +These instructions are not intended for general audiences at this time. + +::: + +## Installing dbt Cloud CLI + +### Install and update with Brew on MacOS (recommended) + +1. Install the dbt Cloud CLI: + +```bash +brew tap dbt-labs/dbt-cli +brew install dbt-cloud-cli +``` + +2. Verify the installation by checking your Homebrew installation path (not your dbt Core install). If the `which dbt` command returns nothing, then you should modify your PATH in `~/.zshrc` or create an alias. + +```bash +which dbt +dbt --help +``` + +### Manually install (Windows and Linux) + +1. 
Download the latest release for your platform from [GitHub](https://github.com/dbt-labs/dbt-cli/releases). +2. Add the `dbt` executable to your path. +3. Move to a directory with a dbt project, and create a `dbt_cloud.yml` file containing your `project-id` from dbt Cloud. +4. Invoke `dbt --help` from your terminal to see a list of supported commands. + +#### Updating your dbt Cloud installation (Windows + Linux) + +Follow the same process in [Manually install (Windows and Linux)](#manually-install-windows-and-linux) and replace the existing `dbt` executable with the new one. You should not have to go through the security steps again. + +## Setting up the CLI + +The following instructions are for setting up the dbt Cloud CLI. + +1. Ensure that you have created a project in [dbt Cloud](https://cloud.getdbt.com/). + +2. Ensure that your personal [development credentials](https://cloud.getdbt.com/settings/profile/credentials) are set on the project. + +3. Navigate to [your profile](https://cloud.getdbt.com/settings/profile) and enable the **Beta** flag under **Experimental Features.** + +4. Create an environment variable with your [dbt Cloud API key](https://cloud.getdbt.com/settings/profile#api-access): + +```bash +vi ~/.zshrc + +# dbt Cloud CLI +export DBT_CLOUD_API_KEY="1234" # Replace "1234" with your API key +``` + +5. Load the new environment variable. Note: You may need to reactivate your Python virtual environment after sourcing your shell's dot file. Alternatively, restart your shell instead of sourcing the shell's dot file. + +```bash +source ~/.zshrc +``` + +6. Navigate to a dbt project: + +```bash +cd ~/dbt-projects/jaffle_shop +``` + +7. Create a `dbt_cloud.yml` in the root project directory. The file is required to have a `project-id` field with a valid [project ID](#glossary). Enter the following commands: + +```bash +pwd # Input +/Users/user/dbt-projects/jaffle_shop # Output +``` + +```bash +echo "project-id: ''" > dbt_cloud.yml # Input +``` + +```bash +cat dbt_cloud.yml # Input +project-id: '123456' # Output +``` + +You can find your project ID by selecting your project and clicking on **Develop** in the navigation bar. Your project ID is the number in the URL: https://cloud.getdbt.com/develop/26228/projects/PROJECT_ID. + +If `dbt_cloud.yml` already exists, edit the file, and verify the project ID field uses a valid project ID. + +#### Upgrade the CLI with Brew + +```bash +brew update +brew upgrade dbt-cloud-cli +``` + +## Using dbt Cloud CLI + +**Coming soon** + +## Glossary + +- **dbt Cloud API key:** Your API key found by navigating to the **gear icon**, clicking **Profile Settings**, and scrolling down to **API**. +- **Project ID:** The ID of the dbt project you're working with. Can be retrieved from the dbt Cloud URL after a project has been selected, for example, `https://cloud.getdbt.com/deploy/{accountID}/projects/{projectID}` +- **Development credentials:** Your personal warehouse credentials for the project you’re working with. They can be set by selecting the project and entering them in dbt Cloud. Navigate to the **gear icon**, click **Profile Settings**, and click **Credentials** from the left-side menu. 
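Before running any Cloud CLI commands, it can help to confirm that the two local prerequisites from the setup steps — the `DBT_CLOUD_API_KEY` environment variable and a `dbt_cloud.yml` with a `project-id` — are in place. The check below is a small, purely illustrative sketch and is not part of the dbt Cloud CLI itself; the file and variable names come from the steps above.

```python
# Purely illustrative pre-flight check for the setup steps above; it is not
# part of the dbt Cloud CLI. File and variable names come from those steps.
import os
import pathlib
import sys

if not os.environ.get("DBT_CLOUD_API_KEY"):
    sys.exit("DBT_CLOUD_API_KEY is not set - add it to your shell profile and re-source it.")

config = pathlib.Path("dbt_cloud.yml")
if not config.exists() or "project-id" not in config.read_text():
    sys.exit("Add a dbt_cloud.yml with a valid project-id to the project root.")

print("dbt Cloud CLI prerequisites look good.")
```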
diff --git a/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md b/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md index 72fe9e0449c..dae0ee1d178 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md @@ -47,20 +47,16 @@ To configure the SSH tunnel in dbt Cloud, you'll need to provide the hostname/IP - Verify the bastion server has its network security rules set up to accept connections from the [dbt Cloud IP addresses](/docs/cloud/about-cloud/regions-ip-addresses) on whatever port you configured. - Set up the user account by using the bastion servers instance's CLI, The following example uses the username `dbtcloud:` - `sudo groupadd dbtcloud`
- - `sudo useradd -m -g dbtcloud dbtcloud`
- - `sudo su - dbtcloud`
- - `mkdir ~/.ssh`
- - `chmod 700 ~/.ssh`
- - `touch ~/.ssh/authorized_keys`
- - `chmod 600 ~/.ssh/authorized_keys`
- +```shell +sudo groupadd dbtcloud +sudo useradd -m -g dbtcloud dbtcloud +sudo su - dbtcloud +mkdir ~/.ssh +chmod 700 ~/.ssh +touch ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys +``` + - Copy and paste the dbt Cloud generated public key, into the authorized_keys file. The Bastion server should now be ready for dbt Cloud to use as a tunnel into the Redshift environment. diff --git a/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md b/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md index 4f31c56e8aa..62a58f6e1c5 100644 --- a/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md +++ b/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md @@ -30,31 +30,34 @@ to authenticate dbt Cloud to run queries against Snowflake on behalf of a Snowfl ### Key Pair + **Available in:** Development environments, Deployment environments The `Keypair` auth method uses Snowflake's [Key Pair Authentication](https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication) to authenticate Development or Deployment credentials for a dbt Cloud project. -After [generating an encrypted key pair](https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication), be sure to set the `rsa_public_key` for the Snowflake user to authenticate in dbt Cloud: +1. After [generating an encrypted key pair](https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication), be sure to set the `rsa_public_key` for the Snowflake user to authenticate in dbt Cloud: ```sql alter user jsmith set rsa_public_key='MIIBIjANBgkqh...'; ``` -Finally, set the "Private Key" and "Private Key Passphrase" fields in the "Edit -Credentials" page to finish configuring dbt Cloud to authenticate with Snowflake -using a key pair. - -**Note:** At this time ONLY Encrypted Private Keys are supported by dbt Cloud, and the keys must be of size 4096 or smaller. +2. Finally, set the **Private Key** and **Private Key Passphrase** fields in the **Credentials** page to finish configuring dbt Cloud to authenticate with Snowflake using a key pair. + + **Note:** At this time ONLY Encrypted Private Keys are supported by dbt Cloud, and the keys must be of size 4096 or smaller. -In order to successfully fill in the Private Key field, you **must** include the commented lines below when you add the passphrase. Leaving the `PRIVATE KEY PASSPHRASE` field empty will return an error - have a look at the examples below: +3. To successfully fill in the Private Key field, you **must** include commented lines when you add the passphrase. Leaving the **Private Key Passphrase** field empty will return an error. If you're receiving a `Could not deserialize key data` or `JWT token` error, refer to [Troubleshooting](#troubleshooting) for more info. **Example:** + ```sql -----BEGIN ENCRYPTED PRIVATE KEY----- -< encrypted private key contents here > +< encrypted private key contents here - line 1 > +< encrypted private key contents here - line 2 > +< ... > -----END ENCRYPTED PRIVATE KEY----- ``` - + + ### Snowflake OAuth @@ -68,3 +71,36 @@ more information on configuring a Snowflake OAuth connection in dbt Cloud, pleas ## Configuration To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Snowflake-specific configuration](/reference/resource-configs/snowflake-configs). 
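Circling back to the key pair method described above: Snowflake's key-pair guide uses `openssl` to generate the encrypted private key and the matching public key. As a rough, hedged equivalent, here is a sketch using Python's `cryptography` package — the 2048-bit size and passphrase are placeholders, and dbt Cloud requires the private key to be encrypted and of size 4096 or smaller.

```python
# Sketch only: generate an encrypted RSA key pair for Snowflake key-pair auth
# with the `cryptography` package. Key size and passphrase are placeholders.
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa

key = rsa.generate_private_key(public_exponent=65537, key_size=2048)

# Paste this value (including the BEGIN/END lines) into the Private Key field,
# and the passphrase into the Private Key Passphrase field in dbt Cloud.
encrypted_private_key = key.private_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PrivateFormat.PKCS8,
    encryption_algorithm=serialization.BestAvailableEncryption(b"my-passphrase"),
)

# Strip the BEGIN/END lines from this value and set it as `rsa_public_key`
# on the Snowflake user (see the `alter user` statement above).
public_key = key.public_key().public_bytes(
    encoding=serialization.Encoding.PEM,
    format=serialization.PublicFormat.SubjectPublicKeyInfo,
)

print(encrypted_private_key.decode())
print(public_key.decode())
```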
+ +## Troubleshooting + + +If you're receiving a `Could not deserialize key data` or `JWT token` error, refer to the following causes and solutions: + +
+
+Error: Could not deserialize key data
+
+ - Possible cause
+
+ - This usually means the key wasn't copied over correctly, for example, missing dashes, truncated contents, or the `-----BEGIN ENCRYPTED PRIVATE KEY-----` and `-----END ENCRYPTED PRIVATE KEY-----` lines left out.
+ - Solution
+
+ - Copy the key from its source and paste it into a text editor to verify it's complete before using it in dbt Cloud.
+
+ +
+Error: JWT token
+
+ - Possible causes
+
+ - This could be a transient issue between Snowflake and dbt Cloud. When connecting to Snowflake, dbt gets a JWT token valid for only 60 seconds. If there's no response from Snowflake within this time, you might see a `JWT token is invalid` error in dbt Cloud.
+ - The public key was not entered correctly in Snowflake.
+
+ - Solutions
+
+ - Because the issue is often transient, retry the connection from dbt Cloud.
+ - Confirm that the public key was entered correctly in Snowflake. Additionally, you can reach out to Snowflake for help or refer to this Snowflake doc for more info: [Key-Based Authentication Failed with JWT token is invalid Error](https://community.snowflake.com/s/article/Key-Based-Authentication-Failed-with-JWT-token-is-invalid-Error).
+
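If you hit either error, a quick local sanity check can help narrow things down. The following is a sketch only; it assumes OpenSSL is installed and that your key files are named `rsa_key.p8` and `rsa_key.pub` (adjust to your own filenames).

```shell
# Confirm the encrypted private key parses and is internally consistent (prompts for the passphrase)
openssl rsa -in rsa_key.p8 -check -noout

# Print the SHA-256 fingerprint of your local public key
openssl rsa -pubin -in rsa_key.pub -outform DER | openssl dgst -sha256 -binary | openssl enc -base64
```

You can then compare the printed fingerprint against the `RSA_PUBLIC_KEY_FP` value returned by `DESC USER <your_user>;` in Snowflake (ignoring the `SHA256:` prefix). If they don't match, re-run the `alter user` statement with the correct public key.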
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md index a1527ebd609..582d3cbd4ba 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md @@ -6,8 +6,7 @@ sidebar_label: Develop in the IDE tags: [IDE] --- -The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. The IDE leverages the open-source [dbt-rpc](/reference/commands/rpc) plugin to recompile only the changes made in your project. - +The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. ## Prerequisites @@ -37,7 +36,7 @@ The Cloud IDE is a powerful tool that can help streamline and govern your data p All of these [features](#cloud-ide-features) work together to create a powerful editing environment that can help you write and maintain high-quality SQL code in less time. Whether you're a seasoned developer or just starting out, the Cloud IDE has everything you need to be productive, collaborative, and efficient. - + ## Cloud IDE features @@ -55,7 +54,7 @@ To stay informed on IDE updates, read [dbt Cloud IDE release notes](/tags/ide), | **File state indicators** | Ability to see when changes or actions have been made to the file. The indicators **M, D, A,** and **•** appear to the right of your file or folder name and indicate the actions performed:

- Unsaved **(•)** — The IDE detects unsaved changes to your file/folder
- Modification **(M)** — The IDE detects a modification of existing files/folders
- Added **(A)** — The IDE detects added files
- Deleted **(D)** — The IDE detects deleted files. | **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly into the IDE.

- Create or change branches
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Execute git commands using the git button
- Link to the repo directly by clicking the branch name | | **Project documentation** | Generate and view your [project documentation](/docs/collaborate/build-and-view-your-docs) for your dbt project in real-time. You can inspect and verify what your project's documentation will look like before you deploy your changes to production. | -| **Preview and Compile button** | You can run your code against your data platform by clicking the **Preview**. Use the **Compile** button in the IDE to generate executable SQL, which occurs locally within dbt. | +| **Preview and Compile button** | You can [compile or preview](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section) code, a snippet of dbt code, or one of your dbt models after editing and saving. | | **Build, test, and run button** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. | **Command bar** | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking History on the left of the bar. | **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. @@ -75,8 +74,8 @@ To stay informed on IDE updates, read [dbt Cloud IDE release notes](/tags/ide), There are three start-up states when using or launching the Cloud IDE: - **Creation start —** This is the state where you are starting the IDE for the first time. You can also view this as a *cold start* (see below), and you can expect this state to take longer because the git repository is being cloned. -- **Cold start —** This is the process of starting a new develop session, which will be available for you for three hours. The environment automatically turns off three hours after the last activity with the rpc server. This includes compile, preview, or any dbt invocation, however, it *does not* include editing and saving a file. --** Hot start —** This is the state of resuming an existing or active develop session within three hours of the last activity. +- **Cold start —** This is the process of starting a new develop session, which will be available for you for three hours. The environment automatically turns off three hours after the last activity. This includes compile, preview, or any dbt invocation, however, it *does not* include editing and saving a file. +- **Hot start —** This is the state of resuming an existing or active develop session within three hours of the last activity. ### Work retention @@ -85,7 +84,7 @@ The Cloud IDE needs explicit action to save your changes. There are three ways y - **Unsaved, local code —** The browser stores your code only in its local storage. In this state, you might need to commit any unsaved changes in order to switch branches or browsers. If you have saved and committed changes, you can access the "Change branch" option even if there are unsaved changes. But if you attempt to switch branches without saving changes, a warning message will appear, notifying you that you will lose any unsaved changes. -- **Saved but uncommitted code —** When you save a file, the data gets stored in durable, long-term storage. 
To access the Change branch option, you must "Commit and sync" or "Revert" changes - changing branches isn't available for saved-but-uncommitted code. +- **Saved but uncommitted code —** When you save a file, the data gets stored in durable, long-term storage, but isn't synced back to git. To switch branches using the **Change branch** option, you must "Commit and sync" or "Revert" changes. Changing branches isn't available for saved-but-uncommitted code. This is to ensure your uncommitted changes don't get lost. - **Committed code —** This is stored in the branch with your git provider and you can check out other (remote) branches. ## Access the Cloud IDE @@ -152,7 +151,7 @@ The dbt Cloud IDE makes it possible to [build and view](/docs/collaborate/build-
Can I be a contributor to dbt Cloud?
-
Anyone can contribute to the dbt project. And whether it's a dbt package, a plugin, dbt-core, or this documentation site, contributing to the open source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and give back to the community. See Contributing for details on what to expect when contributing to the dbt open source software (OSS).
+
Anyone can contribute to the dbt project. And whether it's a dbt package, a plugin, dbt-core, or this documentation site, contributing to the open-source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and give back to the community. See Contributing for details on what to expect when contributing to the dbt open source software (OSS).
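As a concrete illustration of the command bar and node selection syntax mentioned above, the commands you type in the IDE are the same ones the dbt CLI accepts. The model name and tag below are placeholders:

```shell
dbt run --select stg_customers        # run a single model
dbt build --select stg_customers+     # build the model plus everything downstream of it
dbt test --select tag:nightly         # test every resource tagged "nightly"
```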
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md index 63a4f9a0312..de643413a8a 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md @@ -28,7 +28,7 @@ The IDE streamlines your workflow, and features a popular user interface layout 4. **File Explorer —** The File Explorer shows the filetree of your repository. You can: - Click on any file in the filetree to open the file in the File Editor. - Click and drag files between directories to move files. - - Right click a file to access the sub-menu options like duplicate file, copy file name, copy as `ref`, rename, delete. + - Right-click a file to access the sub-menu options like duplicate file, copy file name, copy as `ref`, rename, delete. - **Note**: To perform these actions, the user must not be in `read-only` mode, which generally happens when the user is viewing the default branch. - Use file indicators, located to the right of your files or folder name, to see when changes or actions were made: * Unsaved (•) — The IDE detects unsaved changes to your file/folder @@ -72,23 +72,35 @@ The IDE features some delightful tools and layouts to make it easier for you to - **Git Diff View —** Clicking on a file in the **Changes** section of the **Version Control Menu** will open the changed file with Git Diff view. The editor will show the previous version on the left and the in-line changes made on the right. -- **Markdown Preview console tab —** The Markdown Preview console tab shows a preview of your .md file's markdown code in your repository, and updates it automatically as you edit your code. +- **Markdown Preview console tab —** The Markdown Preview console tab shows a preview of your .md file's markdown code in your repository and updates it automatically as you edit your code. - **CSV Preview console tab —** The CSV Preview console tab displays the data from your CSV file in a table, which updates automatically as you edit the file in your seed directory. ## Console section + The console section, located below the File editor, includes various console tabs and buttons to help you with tasks such as previewing, compiling, building, and viewing the . Refer to the following sub-bullets for more details on the console tabs and buttons. -1. **Preview button —** When you click on the Preview button, it runs the SQL in the active file editor regardless of whether you have saved it or not, and sends the results to the Results console tab. - * To prevent the IDE from returning too much data and causing browser problems, a limit of 500 is automatically added to queries executed via the Preview Button. However, you can change this by adding `limit your_number` at the end of your SQL statement. For example, `SELECT * FROM` table `limit 100` will return up to 100 rows. Remember that you must write the `limit your_number` explicitly and cannot derive it from a macro. - * The IDE also supports `SELECT TOP #`, which specifies the number of records to return. +1. **Preview button —** When you click on the Preview button, it runs the SQL in the active file editor regardless of whether you have saved it or not and sends the results to the **Results** console tab. You can preview a selected portion of saved or unsaved code by highlighting it and then clicking the **Preview** button. + +
+Row limits in IDE +The dbt Cloud IDE returns default row limits, however, you can also specify the number of records returned. Refer to the following sub-bullets for more info:

+
    +
  • 500-row limit: To prevent the IDE from returning too much data and causing browser problems, dbt automatically sets a 500-row limit when using the Preview Button. You can modify this by adding limit your_number at the end of your SQL statement. For example, SELECT * FROM table limit 100 will return up to 100 rows. Remember that you must write the limit your_number explicitly and cannot derive it from a macro.
  • Change row limit default: In dbt version 1.6 or higher, you can change the default limit of 500 rows shown in the Results tab when you run a query. To adjust the setting, click Change row display next to the displayed rows. Keep in mind that you can't set it higher than 10,000 rows. If you refresh the page or close your development session, the default limit will go back to 500 rows.
  • Specify records returned: The IDE also supports SELECT TOP #, which specifies the number of records to return.
+
+ +2. **Compile button —** The **Compile** button compiles the saved or unsaved SQL code and displays it in the **Compiled Code** tab. -2. **Compile button —** The Compile button compiles the SQL code from the active File Editor, irrespective of its save status, and outputs it to the Compiled Code tab. -3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. +Starting from dbt v1.6 or higher, when you save changes to a model, you can compile its code with the model's specific context. This context is similar to what you'd have when building the model and involves useful context variables like `{{ this }} `or `{{ is_incremental() }}`. + +3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource, and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. 3. **Format button —** The editor has a **Format** button that can reformat the contents of your files. For SQL files, it uses either `sqlfmt` or `sqlfluff`, and for Python files, it uses `black`. @@ -106,9 +118,10 @@ The console section, located below the File editor, includes various console tab ## Invocation history -The Invocation History Drawer stores information on dbt invocations in the IDE. When you invoke a command (like execute a dbt command such as `dbt run`), the associated logs are displayed in the Invocation History Drawer. -You can open the drawer multiple ways: +The Invocation History Drawer stores information on dbt invocations in the IDE. When you invoke a command, like executing a dbt command such as `dbt run`, the associated logs are displayed in the Invocation History Drawer. + +You can open the drawer in multiple ways: - Clicking the `^` icon next to the Command bar on the lower left of the page - Typing a dbt command and pressing enter - Or pressing Control-backtick (or Ctrl + `) @@ -117,15 +130,15 @@ You can open the drawer multiple ways: 1. **Invocation History list —** The left-hand panel of the Invocation History Drawer displays a list of previous invocations in the IDE, including the command, branch name, command status, and elapsed time. -2. **Invocation Summary —** The Invocation Summary, located above **System Logs**, displays information about a selected command from the Invocation History list , such as the command, its status (`Running` if it's still running), the git branch that was active during the command, and the time the command was invoked. +2. **Invocation Summary —** The Invocation Summary, located above **System Logs**, displays information about a selected command from the Invocation History list, such as the command, its status (`Running` if it's still running), the git branch that was active during the command, and the time the command was invoked. -3. **System Logs toggle —** The System Logs toggle, located under the Invocation Summary, allows the user to see the full stdout and debug logs for entirety of the invoked command. +3. 
**System Logs toggle —** The System Logs toggle, located under the Invocation Summary, allows the user to see the full stdout and debug logs for the entirety of the invoked command. -4. **Command Control button —** Use the Command Control button, located on the right-side, to control your invocation and cancel or rerun a selected run. +4. **Command Control button —** Use the Command Control button, located on the right side, to control your invocation and cancel or rerun a selected run. -5. **Node Summary tab —** Clicking on the Results Status Tabs will filter the Node Status List based on their corresponding status. The available statuses are Pass (successful invocation of a node), Warn (test executed with warning), Error (database error or test failure), Skip (nodes not run due to upstream error), and Queued (nodes that have not executed yet). +5. **Node Summary tab —** Clicking on the Results Status Tabs will filter the Node Status List based on their corresponding status. The available statuses are Pass (successful invocation of a node), Warn (test executed with a warning), Error (database error or test failure), Skip (nodes not run due to upstream error), and Queued (nodes that have not executed yet). 6. **Node result toggle —** After running a dbt command, information about each executed node can be found in a Node Result toggle, which includes a summary and debug logs. The Node Results List lists every node that was invoked during the command. @@ -135,12 +148,12 @@ You can open the drawer multiple ways: ## Modals and Menus Use menus and modals to interact with IDE and access useful options to help your development workflow. -- **Editor tab menu —** To interact with open editor tabs, right-click any tab to access the helpful options in the file tab menu. +- **Editor tab menu —** To interact with open editor tabs, right-click any tab to access the helpful options in the file tab menu. - **File Search —** You can easily search for and navigate between files using the File Navigation menu, which can be accessed by pressing Command-O or Control-O or clicking on the 🔍 icon in the File Explorer. -- **Global Command Palette—** The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, compile, and preview actions, among others. To open the menu, use Command-P or Control-P. +- **Global Command Palette—** The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, and compile, and preview actions, among others. To open the menu, use Command-P or Control-P. - **IDE Status modal —** The IDE Status modal shows the current error message and debug logs for the server. This also contains an option to restart the IDE. Open this by clicking on the IDE Status button. @@ -159,7 +172,7 @@ Use menus and modals to interact with IDE and access useful options to help your * Toggling between dark or light mode for a better viewing experience * Restarting the IDE - * Fully recloning your repository to refresh your git state and viewing status details + * Fully recloning your repository to refresh your git state and view status details * Viewing status details, including the IDE Status modal. 
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md b/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md index c486ac8b69c..8ffd83ef00e 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md @@ -63,11 +63,11 @@ With the dbt Cloud IDE, you can seamlessly use [SQLFluff](https://sqlfluff.com/) ### Customize linting -SQLFluff is a configurable SQL linter, which means you can configure your own linting rules instead of using the default linting settings in the IDE. +SQLFluff is a configurable SQL linter, which means you can configure your own linting rules instead of using the default linting settings in the IDE. You can exclude files and directories by using a standard `.sqlfluffignore` file. Learn more about the syntax in the [.sqlfluffignore syntax docs](https://docs.sqlfluff.com/en/stable/configuration.html#id2). To configure your own linting rules: -1. Create a new file in the root project directory (the parent or top-level directory for your files). +1. Create a new file in the root project directory (the parent or top-level directory for your files). Note: The root project directory is the directory where your `dbt_project.yml` file resides. 2. Name the file `.sqlfluff` (make sure you add the `.` before `sqlfluff`). 3. [Create](https://docs.sqlfluff.com/en/stable/configuration.html#new-project-configuration) and add your custom config code. 4. Save and commit your changes. @@ -76,7 +76,7 @@ To configure your own linting rules: :::tip Configure dbtonic linting rules -Use the following code example to incorporate well-written dbt code (or dbtonic) to your linting: +Refer to the [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) to add the dbt code (or dbtonic) rules we use for our own projects:
dbtonic config code example provided by dbt Labs @@ -122,6 +122,8 @@ capitalisation_policy = lower group_by_and_order_by_style = implicit ```
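If you'd rather start from scratch than copy the full dbtonic config, a minimal `.sqlfluff` can contain just a dialect and a rule or two. This is a sketch only: the dialect, the SQLFluff 2.x-style rule section name, and the shell heredoc used to create the file are assumptions to adapt to your project.

```shell
# Run from the project root (the directory containing dbt_project.yml)
cat > .sqlfluff <<'EOF'
[sqlfluff]
dialect = snowflake

[sqlfluff:rules:capitalisation.keywords]
capitalisation_policy = lower
EOF
```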
+ +For more info on styling best practices, refer to [How we style our SQL](/guides/best-practices/how-we-style/2-how-we-style-our-sql). ::: diff --git a/website/docs/docs/cloud/git/authenticate-azure.md b/website/docs/docs/cloud/git/authenticate-azure.md index 9e755519e67..03020ccca73 100644 --- a/website/docs/docs/cloud/git/authenticate-azure.md +++ b/website/docs/docs/cloud/git/authenticate-azure.md @@ -26,3 +26,4 @@ You will be directed back to dbt Cloud, and your profile should be linked. You a ## FAQs + diff --git a/website/docs/docs/cloud/git/connect-azure-devops.md b/website/docs/docs/cloud/git/connect-azure-devops.md index a84e593a1e2..bc5bb81dd24 100644 --- a/website/docs/docs/cloud/git/connect-azure-devops.md +++ b/website/docs/docs/cloud/git/connect-azure-devops.md @@ -23,3 +23,4 @@ To connect Azure DevOps in dbt Cloud: 2. dbt Cloud developers need to [personally authenticate with Azure DevOps](/docs/cloud/git/authenticate-azure) from dbt Cloud. +If you're a Business Critical customer using [IP restrictions](/docs/cloud/secure/ip-restrictions), ensure you've added the appropriate Azure DevOps CIDRs to your IP restriction rules, or else the Azure DevOps connection will fail. diff --git a/website/docs/docs/cloud/git/connect-github.md b/website/docs/docs/cloud/git/connect-github.md index d5ead96d940..771e4286ef6 100644 --- a/website/docs/docs/cloud/git/connect-github.md +++ b/website/docs/docs/cloud/git/connect-github.md @@ -56,7 +56,7 @@ If you are your GitHub organization owner, you can also configure the dbt Cloud ## Personally authenticate with GitHub -Once the dbt Cloud admin has [set up a connection](docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization GitHub account, you need to personally authenticate, which improves the security of dbt Cloud by enabling you to log in using OAuth through GitHub. +Once the dbt Cloud admin has [set up a connection](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization GitHub account, you need to personally authenticate, which improves the security of dbt Cloud by enabling you to log in using OAuth through GitHub. :::infoGitHub profile connection - dbt Cloud developers on the [Enterprise plan](https://www.getdbt.com/pricing/) must each connect their GitHub profiles to dbt Cloud. This is because the dbt Cloud IDE verifies every developer's read / write access for the dbt repo. @@ -78,5 +78,5 @@ The next time you log into dbt Cloud, you will be able to do so via OAuth throug ## FAQs - + diff --git a/website/docs/docs/cloud/git/connect-gitlab.md b/website/docs/docs/cloud/git/connect-gitlab.md index 1ec8fb08817..53fde5f4878 100644 --- a/website/docs/docs/cloud/git/connect-gitlab.md +++ b/website/docs/docs/cloud/git/connect-gitlab.md @@ -71,6 +71,8 @@ The application form in GitLab should look as follows when completed: Click **Save application** in GitLab, and GitLab will then generate an **Application ID** and **Secret**. These values will be available even if you close the app screen, so this is not the only chance you have to save them. +If you're a Business Critical customer using [IP restrictions](/docs/cloud/secure/ip-restrictions), ensure you've added the appropriate Gitlab CIDRs to your IP restriction rules, or else the Gitlab connection will fail. + ### Adding the GitLab OAuth application to dbt Cloud After you've created your GitLab application, you need to provide dbt Cloud information about the app. 
In dbt Cloud, account admins should navigate to **Account Settings**, click on the **Integrations** tab, and expand the GitLab section. @@ -122,3 +124,4 @@ If you imported a repository using the dbt Cloud native integration with GitLab, + diff --git a/website/docs/docs/cloud/git/import-a-project-by-git-url.md b/website/docs/docs/cloud/git/import-a-project-by-git-url.md index d84eb99dab8..ba53baa33ea 100644 --- a/website/docs/docs/cloud/git/import-a-project-by-git-url.md +++ b/website/docs/docs/cloud/git/import-a-project-by-git-url.md @@ -125,3 +125,7 @@ Don't see your git provider here? Please [contact dbt Support](mailto:support@ge ## Limited integration Some features of dbt Cloud require a tight integration with your git host, for example, updating GitHub pull requests with dbt Cloud run statuses. Importing your project by a URL prevents you from using these features. Once you give dbt Cloud access to your repository, you can continue to set up your project by adding a connection and creating and running your first dbt Cloud job. + +## FAQs + + diff --git a/website/docs/docs/cloud/manage-access/about-access.md b/website/docs/docs/cloud/manage-access/about-access.md index 9a95d0aeb68..f9f97bc555d 100644 --- a/website/docs/docs/cloud/manage-access/about-access.md +++ b/website/docs/docs/cloud/manage-access/about-access.md @@ -121,12 +121,6 @@ set on the _Internal Analytics_ project. ### Manual assignment - - -- New in version 1.1.23 (March, 2021) - - - dbt Cloud administrators can manually assign users to groups independently of IdP attributes. If a dbt Cloud group is configured _without_ any SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust diff --git a/website/docs/docs/cloud/manage-access/audit-log.md b/website/docs/docs/cloud/manage-access/audit-log.md index 818ec553e7b..98bf660b259 100644 --- a/website/docs/docs/cloud/manage-access/audit-log.md +++ b/website/docs/docs/cloud/manage-access/audit-log.md @@ -16,13 +16,9 @@ The dbt Cloud audit log stores all the events that occurred in your organization ## Accessing the audit log -To access audit log, click the gear icon in the top right, then click **Audit Log**. +To access the audit log, click the gear icon in the top right, then click **Audit Log**. -
- - - -
+ ## Understanding the audit log @@ -161,19 +157,17 @@ The audit log supports various events for different objects in dbt Cloud. You wi You can search the audit log to find a specific event or actor, which is limited to the ones listed in [Events in audit log](#events-in-audit-log). The audit log successfully lists historical events spanning the last 90 days. You can search for an actor or event using the search bar, and then narrow your results using the time window. -
- + -
## Exporting logs You can use the audit log to export all historical audit results for security, compliance, and analysis purposes: -- For events within 90 days — dbt Cloud will automatically display the 90 days selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. +- For events within 90 days — dbt Cloud will automatically display the 90-day selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. - For events beyond 90 days — Select **Export All**. The Account Admin will receive an email link to download a CSV file of all the events that occurred in your organization. - + diff --git a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md index baa92b5a98f..04dfbe093c3 100644 --- a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md +++ b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md @@ -8,20 +8,21 @@ sidebar: "Users and licenses" In dbt Cloud, _licenses_ are used to allocate users to your account. There are three different types of licenses in dbt Cloud: - **Developer** — Granted access to the Deployment and [Development](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) functionality in dbt Cloud. -- **Read-Only** — Intended to view the [artifacts](/docs/deploy/artifacts) created in a dbt Cloud account. -- **IT** — Can manage users, groups, and licenses, among other permissions. Available on Enterprise and Team plans only. +- **Read-Only** — Intended to view the [artifacts](/docs/deploy/artifacts) created in a dbt Cloud account. Read-Only users can receive job notifications but not configure them. +- **IT** — Can manage users, groups, and licenses, among other permissions. IT users can receive job notifications but not configure them. Available on Enterprise and Team plans only. The user's assigned license determines the specific capabilities they can access in dbt Cloud. | Functionality | Developer User | Read-Only Users | IT Users* | | ------------- | -------------- | --------------- | -------- | -| Use the Developer IDE | ✅ | ❌ | ❌ | +| Use the dbt Cloud IDE | ✅ | ❌ | ❌ | | Use Jobs | ✅ | ❌ | ❌ | | Manage Account | ✅ | ❌ | ✅ | | API Access | ✅ | ❌ | ❌ | | Use [Source Freshness](/docs/deploy/source-freshness) | ✅ | ✅ | ❌ | | Use [Docs](/docs/collaborate/build-and-view-your-docs) | ✅ | ✅ | ❌ | -*Available on Enterprise and Team plans only and doesn't count toward seat usage. +| Receive [Job notifications](/docs/deploy/job-notifications) | ✅ | ✅ | ✅ | +*Available on Enterprise and Team plans only and doesn't count toward seat usage. Please note, that IT seats are limited to 1 seat per Team or Enterprise account. ## Licenses diff --git a/website/docs/docs/cloud/manage-access/enterprise-permissions.md b/website/docs/docs/cloud/manage-access/enterprise-permissions.md index 5e80de449f0..5bf3623b105 100644 --- a/website/docs/docs/cloud/manage-access/enterprise-permissions.md +++ b/website/docs/docs/cloud/manage-access/enterprise-permissions.md @@ -2,8 +2,10 @@ title: "Enterprise permissions" id: "enterprise-permissions" description: "Permission sets for Enterprise plans." 
+hide_table_of_contents: true #For the sake of the tables on this page --- +import Permissions from '/snippets/_enterprise-permissions-table.md'; import SetUpPages from '/snippets/_available-enterprise-only.md'; @@ -13,206 +15,17 @@ help manage access controls within a dbt Cloud account. See the docs on [access control](/docs/cloud/manage-access/about-user-access) for more information on Role-Based access control (RBAC). -## Permission Sets +## Roles and permissions -The following permission sets are available for assignment in dbt Cloud Enterprise accounts. They -can be granted to dbt Cloud groups which are then in turn granted to users. A dbt Cloud group -can be associated with more than one permission set. +The following roles and permission sets are available for assignment in dbt Cloud Enterprise accounts. They can be granted to dbt Cloud groups which are then in turn granted to users. A dbt Cloud group can be associated with more than one role and permission set. Roles with more access take precedence. -### Account Admin + -- **Has permissions on:** Authorized projects, account-level settings -- **License restrictions:** must have a developer license - -Account Admins have unrestricted access to dbt Cloud accounts. Users with Account Admin permissions can: - -- Create, delete, and modify all projects in an account -- Create, delete, and modify Connections -- Create, delete, and modify Environments -- Create, delete, and modify Groups -- Create, delete, and modify Group Memberships -- Create, delete, and modify Jobs -- Create, delete, and modify outbound webhook subscriptions -- Create, delete, and modify Repositories -- Manage Notification Settings -- Manage account-level [artifacts](/docs/deploy/artifacts) -- Run and cancel jobs -- Use the IDE -- View and modify Account Settings -- Generate [service tokens](/docs/dbt-cloud-apis/service-tokens), such as for [API usage](/docs/dbt-cloud-apis/overview) - -### Security Admin - -- **Has permissions on:** Account-level settings -- **License restrictions:** must have a Developer or an IT license - -Security Admins have access to modify certain account-level settings. Users with Security Admin permissions can: - -- View and modify Account Settings such as: - - View, invite, and modify account users - - Create, delete, and modify Groups - - Create, delete, and modify License Mappings - - Create and modify SSO Configurations - - View and export Audit Logs - - Create, delete, and modify IP Restrictions - -### Billing Admin - -- **Has permissions on:** Account-level settings -- **License restrictions:** must have a Developer or an IT license - -Billing Admins have access to modify certain account-level settings related to billing. Users with Billing Admin permissions can: - -- View and modify **Account Settings** such as: - - View billing information - - Modify billing information (accounts on the Team plan) - - This includes modifying Developer Seat counts for the Account - -### Project Creator -- **Has permissions on:** Authorized projects, account-level settings -- **License restrictions:** must have a developer license - -Project Creators can access, create, or modify projects and other settings in dbt Cloud. However, they don't have permission to modify SSO settings or account integrations. 
- -Users with Project Creator permissions can: - -- View Account Settings -- View and modify project users -- Create, delete, and modify all projects in an account -- Create, delete, and modify Connections -- Create, delete, and modify Environments -- Create, delete, and modify Jobs -- Create, delete, and modify Repositories -- Run and cancel jobs -- Use the IDE -- View Groups -- View Notification Settings - -### Account Viewer - -- **Has permissions on:** Authorized projects, account-level settings -- **License restrictions:** must have a developer license - -Account Viewers have read-only access to dbt Cloud accounts. Users with Account Viewer permissions can: -- View all projects in an account -- View Account Settings -- View account-level artifacts -- View Connections -- View Environments -- View Groups -- View Group Memberships -- View Jobs -- View Notification Settings -- View Repositories - -### Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Admins have unrestricted access to _projects_ in dbt Cloud accounts which they are members of. -Admins can perform the following actions in projects they are assigned to: -- Create, delete, and modify Repositories -- Create, delete, and modify Connections -- Create, delete, and modify Environments -- Create, delete, and modify Group Memberships -- Create, delete, and modify Jobs -- Create, delete, and modify outbound webhook subscriptions -- Run and cancel jobs -- Use the IDE -- View project details - -### Git Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Git Admins can perform the following actions in projects they are assigned to: -- Create, delete, and modify Repositories -- View Connections -- View Environments -- View Jobs -- View project details - -### Database Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Database Admins can perform the following actions in projects they are assigned to: -- Create, delete, and modify Connections -- View Environments -- View Jobs -- View project details -- View Repositories - -### Team Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Team Admins can perform the following actions in projects they are assigned to: -- View Groups -- View Environments -- View Jobs -- View project details -- View Repositories - -### Job Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Job Admins can perform the following actions in projects they are assigned to: -- Create, delete, and modify Jobs -- Run and cancel jobs -- View connections -- View, edit, and create environments -- View historical runs - -### Job Viewer -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Job Viewers can perform the following actions in projects they are assigned to: -- View environments -- View historical runs -- View job definitions - -### Developer -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Developers can perform the following actions in projects they are assigned to: -- Configure personal developer credentials -- Create, delete, and modify Jobs -- Create, delete, and modify outbound webhook subscriptions -- Run and cancel jobs -- Use the IDE - -### Analyst -- **Has permissions on:** 
Authorized projects -- **License restrictions:** must have a developer license - -Analysts can perform the following actions in projects they are assigned to: -- Configure personal developer credentials -- Configure environmental variables -- View connections -- View environments -- View historical runs -- View job definitions -- Use the IDE - - -### Stakeholder -- **Has permissions on:** Authorized projects -- **License restrictions:** Intended for use with Read-Only licenses, but may be used with Developer licenses. - -Stakeholders can perform the following actions in projects they are assigned to: -- View generated documentation -- View generated source freshness reports -- View the Read-Only dashboard - -## Diagram of the Permission Sets +## Diagram of the permission sets -## How to Set Up RBAC Groups in dbt Cloud +## How to set up RBAC Groups in dbt Cloud Role-Based Access Control (RBAC) is helpful for automatically assigning permissions to dbt admins based on their SSO provider group associations. @@ -221,7 +34,7 @@ Role-Based Access Control (RBAC) is helpful for automatically assigning permissi 1. Select an existing group or create a new group to add RBAC. Name the group (this can be any name you like, but it's recommended to keep it consistent with the SSO groups). If you have configured SSO with SAML 2.0, you may have to use the GroupID instead of the name of the group. -2. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case sensitive and must match the source group formatting. +2. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case-sensitive and must match the source group formatting. 3. Configure the permissions for users within those groups by clicking **Add** in the **Access** section of the window. diff --git a/website/docs/docs/cloud/manage-access/licenses-and-groups.md b/website/docs/docs/cloud/manage-access/licenses-and-groups.md index 88d64f2d9a3..83b926c7445 100644 --- a/website/docs/docs/cloud/manage-access/licenses-and-groups.md +++ b/website/docs/docs/cloud/manage-access/licenses-and-groups.md @@ -117,12 +117,6 @@ set on the _Internal Analytics_ project. ### Manual assignment - - -- New in version 1.1.23 (March, 2021) - - - dbt Cloud administrators can manually assign users to groups independently of IdP attributes. If a dbt Cloud group is configured _without_ any SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md b/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md index fcc9a79e860..349c3d8ecd7 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md @@ -45,7 +45,7 @@ need to select the appropriate directory and then register a new application. | Application Type | Redirect URI | | ----- | ----- | -| Single-Tenant _(recommended)_ | `https://YOUR_AUTH0_URI/login/callback?connection=` | +| Single-Tenant _(recommended)_ | `https://YOUR_AUTH0_URI/login/callback` | | Multi-Tenant | `https://YOUR_AUTH0_URI/login/callback` | @@ -146,7 +146,7 @@ To complete setup, follow the steps below in the dbt Cloud application. 
| **Client ID** | Paste the **Application (client) ID** recorded in the steps above | | **Client Secret** | Paste the **Client Secret** (remember to use the Secret Value instead of the Secret ID) recorded in the steps above | | **Tenant ID** | Paste the **Directory (tenant ID)** recorded in the steps above | -| **Domain** | Enter the domain name for your Azure directory (eg. `fishtownanalytics.com`). Only users with accounts in this directory with this primary domain will be able to log into the dbt Cloud application. Optionally, you may specify a CSV of domains which are _all_ authorized to access your dbt Cloud account (eg. `fishtownanalytics.com, fishtowndata.com`) Ensure that the domain(s) match the values configured on user accounts in Azure | +| **Domain** | Enter the domain name for your Azure directory (such as `fishtownanalytics.com`). Only use the primary domain; this won't block access for other domains. | | **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/manage-access/sso-overview#auth0-multi-tenant-uris) for your region and plan. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md index a206d359270..19779baf615 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md @@ -49,7 +49,7 @@ Client Secret for use in dbt Cloud. | **Application type** | internal | required | | **Application name** | dbt Cloud | required | | **Application logo** | Download the logo here | optional | -| **Authorized domains** | `getdbt.com` (US) `dbt.com` (EMEA or AU) | If deploying into a VPC, use the domain for your deployment | +| **Authorized domains** | `getdbt.com` (US multi-tenant) `getdbt.com` and `dbt.com`(US Cell 1) `dbt.com` (EMEA or AU) | If deploying into a VPC, use the domain for your deployment | | **Scopes** | `email, profile, openid` | The default scopes are sufficient | diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md index 0d493bcf29f..5ec70443d1f 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-okta.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md @@ -16,8 +16,6 @@ dbt Cloud Enterprise supports single-sign on via Okta (using SAML). Currently su * Just-in-time provisioning This guide outlines the setup process for authenticating to dbt Cloud with Okta. -If you have any questions during the setup process, please contact support -(support@getdbt.com) for assistance. ## Configuration in Okta @@ -63,7 +61,7 @@ Click **Next** to continue. ### Configure SAML Settings -The SAML Settings page configures how Okta and dbt Cloud communicate. You will want to use an [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. If you aren't sure which values you should use, please contact support (support@getdbt.com). +The SAML Settings page configures how Okta and dbt Cloud communicate. You will want to use an [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. To complete this section, you will need a _login slug_. 
This slug controls the URL where users on your account can log into your application via Okta. Login @@ -95,9 +93,9 @@ Expected **User Attribute Statements**: | Name | Name format | Value | Description | | -------------- | ----------- | -------------------- | -------------------------- | -| `email` | Unspecified | `${user.email}` | _The user's email address_ | -| `first_name` | Unspecified | `${user.firstName}` | _The user's first name_ | -| `last_name` | Unspecified | `${user.lastName}` | _The user's last name_ | +| `email` | Unspecified | `user.email` | _The user's email address_ | +| `first_name` | Unspecified | `user.firstName` | _The user's first name_ | +| `last_name` | Unspecified | `user.lastName` | _The user's last name_ | Expected **Group Attribute Statements**: diff --git a/website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md b/website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md index d5a16e91792..db3efdbeb74 100644 --- a/website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md @@ -56,9 +56,10 @@ Additionally, you may configure the IdP attributes passed from your identity pro | name | name format | value | description | | ---- | ----------- | ----- | ----------- | -| email | Unspecified | ${user.email} | The user's email address | -| first_name | Unspecified | ${user.first_name} | The user's first name | -| last_name | Unspecified | ${user.last_name} | The user's last name | +| email | Unspecified | user.email | The user's email address | +| first_name | Unspecified | user.first_name | The user's first name | +| last_name | Unspecified | user.last_name | The user's last name | +| NameID (if applicable) | Unspecified | user.email | The user's email address | dbt Cloud's [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control) relies on group mappings from the IdP to assign dbt Cloud users to dbt Cloud groups. To @@ -74,7 +75,6 @@ provider to provide group membership information in user attribute called You may use a restricted group attribute statement to limit the groups set to dbt Cloud for each authenticated user. For example, if all of your dbt Cloud groups start with `DBT_CLOUD_...`, you may optionally apply a filter like `Starts With: DBT_CLOUD_`. -Please contact support if you have any questions. ::: ### Collect integration secrets @@ -154,9 +154,9 @@ dbt Cloud expects by using the Attribute Statements and Group Attribute Statemen | Name | Name format | Value | Description | | -------------- | ----------- | -------------------- | -------------------------- | - | `email` | Unspecified | `${user.email}` | _The user's email address_ | - | `first_name` | Unspecified | `${user.firstName}` | _The user's first name_ | - | `last_name` | Unspecified | `${user.lastName}` | _The user's last name_ | + | `email` | Unspecified | `user.email` | _The user's email address_ | + | `first_name` | Unspecified | `user.firstName` | _The user's first name_ | + | `last_name` | Unspecified | `user.lastName` | _The user's last name_ | 4. The following table illustrates expected **Group Attribute Statements**: @@ -263,7 +263,7 @@ Expected **Attributes**: | Google groups | App attributes | | -------------- | -------------- | -| Name of groups | `MemberOf` | +| Name of groups | `groups` | 10. Click **Finish** to continue. @@ -303,7 +303,7 @@ Follow these steps to set up single sign-on (SSO) with dbt Cloud: 5. 
Select **Integrate any other application you don't find in the gallery (Non-gallery)** as the application type. 6. Click **Create**. 7. You can find the new application by clicking **Enterprise applications** and selecting **All applications**. -8. Click the application you just created and follow the instructions for configuring it in [Configuring SAML endpoints in AD](#configuring-saml-endpoints-in-ad). +8. Click the application you just created. 9. Select **Single sign-on** under Manage in the left navigation. 10. Click **Set up single sign on** under Getting Started. 11. Click **SAML** in "Select a single sign-on method" section. @@ -380,6 +380,7 @@ We recommend using the following values: | name | name format | value | | ---- | ----------- | ----- | +| NameID | Unspecified | Email | | email | Unspecified | Email | | first_name | Unspecified | First Name | | last_name | Unspecified | Last Name | diff --git a/website/docs/docs/cloud/secure/about-privatelink.md b/website/docs/docs/cloud/secure/about-privatelink.md index 7bd18f306b6..29003f65a21 100644 --- a/website/docs/docs/cloud/secure/about-privatelink.md +++ b/website/docs/docs/cloud/secure/about-privatelink.md @@ -4,9 +4,6 @@ id: about-privatelink description: "Configuring PrivateLink for AWS" sidebar_label: "About PrivateLink" --- -:::info -This feature is currently in Private Preview, and these instructions are specific to dbt Cloud multi-tenant Enterprise tier environments hosted on AWS. -::: PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on AWS using [AWS PrivateLink](https://aws.amazon.com/privatelink/) technology. PrivateLink allows dbt Cloud customers to meet security and compliance controls as it allows connectivity between dbt Cloud and your data platform without traversing the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability. @@ -18,6 +15,7 @@ dbt Labs has a worldwide network of regional VPCs. These VPCs are specifically u dbt Cloud supports the following data platforms for use with the PrivateLink feature. Instructions for enabling PrivateLink for the various data platform providers are unique. The following guides will walk you through the necessary steps, including working with [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support) to complete the connection in the dbt private network and setting up the endpoint in dbt Cloud. -- [Redshift](/docs/cloud/secure/redshift-privatelink) - [Snowflake](/docs/cloud/secure/snowflake-privatelink) - [Databricks](/docs/cloud/secure/databricks-privatelink) +- [Redshift](/docs/cloud/secure/redshift-privatelink) +- [Postgres](/docs/cloud/secure/postgres-privatelink) diff --git a/website/docs/docs/cloud/secure/ip-restrictions.md b/website/docs/docs/cloud/secure/ip-restrictions.md index dacd0c885c4..237de991c02 100644 --- a/website/docs/docs/cloud/secure/ip-restrictions.md +++ b/website/docs/docs/cloud/secure/ip-restrictions.md @@ -19,7 +19,9 @@ To configure IP restrictions, go to **Account Settings** → **IP Restrictions** - Deny IPs flagged by the Security team - Allow only VPN traffic but make an exception for contractors’ IP addresses -IP restrictions will block all user requests done via the API (via personal user token) and the UI. Service tokens are exempt from IP restrictions and can still make requests to dbt Cloud API. 
+IP restrictions will block all service tokens, user requests done via the API (via personal user token), and the UI if they come from blocked IP addresses. + +For any version control system integrations (Github, Gitlab, ADO, etc.) inbound into dbt Cloud, ensure their IP addresses are added to the allowed list. ### Allowing IPs @@ -32,7 +34,7 @@ To add an IP to the allowlist, from the **IP Restrictions** page: 4. Select **Allow** 5. Add the ranges in the CIDR notation - For example, 1.1.1.1/8 - - You can add multiple ranges followed by commas + - You cannot add multiple ranges in the same rule. Instead, create a rule per CIDR range. 6. Click **Save** Note that simply adding the IP Ranges will not enforce IP restrictions. For more information, see the section “Enabling Restrictions.” diff --git a/website/docs/docs/cloud/secure/postgres-privatelink.md b/website/docs/docs/cloud/secure/postgres-privatelink.md new file mode 100644 index 00000000000..482aeb4040d --- /dev/null +++ b/website/docs/docs/cloud/secure/postgres-privatelink.md @@ -0,0 +1,76 @@ +--- +title: "Configure AWS PrivateLink for Postgres" +id: postgres-privatelink +description: "Configuring PrivateLink for Postgres" +sidebar_label: "PrivateLink for Postgres" +--- + +A Postgres database, hosted either in AWS or in a properly connected on-prem data center, can be accessed through a private network connection using AWS Interface-type PrivateLink. The type of Target Group connected to the Network Load Balancer (NLB) may vary based on the location and type of Postgres instance being connected, as explained in the following steps. + +## Configuring Postgres interface-type PrivateLink + +### 1. Provision AWS resources + +Creating an Interface VPC PrivateLink connection requires creating multiple AWS resources in the account containing, or connected to, the Postgres instance: + +- **Security Group (AWS hosted only)** — If you are connecting to an existing Postgres instance, this likely already exists, however, you may need to add or modify Security Group rules to accept traffic from the Network Load Balancer (NLB) created for this Endpoint Service. +- **Target Group** — The Target Group will be attached to the NLB to tell it where to route requests. There are various target types available for NLB Target Groups, so choose the one appropriate for your Postgres setup. + + - Target Type: + + - _[Amazon RDS for PostgreSQL](https://aws.amazon.com/rds/postgresql/)_ - **IP** + + - Find the IP address of your RDS instance using a command line tool such as `nslookup ` or `dig +short ` with your RDS DNS endpoint + + - _Note_: With RDS Multi-AZ failover capabilities the IP address of your RDS instance can change, at which point your Target Group would need to be updated. See [this AWS blog post](https://aws.amazon.com/blogs/database/access-amazon-rds-across-vpcs-using-aws-privatelink-and-network-load-balancer/) for more details and a possible solution. 
+ + - _On-prem Postgres server_ - **IP** + + - Use the IP address of the on-prem Postgres server linked to AWS through AWS Direct Connect or a Site-to-Site VPN connection + + - _Postgres on EC2_ - **Instance/ASG** (or **IP**) + + - If your Postgres instance is hosted on EC2 the _instance_ Target Group type (or ideally [using the instance type to connect to an auto-scaling group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/attach-load-balancer-asg.html)) can be used to attach the instance without needing a static IP address + + - The IP type can also be used, with the understanding that the IP of the EC2 instance can change if the instance is relaunched for any reason + + - Target Group protocol: **TCP** + +- **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5432` +- **VPC Endpoint Service** — Attach to the newly created NLB. + - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + +### 2. Grant dbt AWS account access to the VPC Endpoint Service + +On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. + + - Principal: `arn:aws:iam::346425330055:role/MTPL_Admin` + + + +### 3. Obtain VPC Endpoint Service Name + +Once the VPC Endpoint Service is provisioned, you can find the service name in the AWS console by navigating to **VPC** → **Endpoint Services** and selecting the appropriate endpoint service. You can copy the service name field value and include it in your communication to dbt Cloud support. + + + +### 4. Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Postgres Interface-type +- VPC Endpoint Service Name: +- Postgres server AWS Region (e.g., us-east-1, eu-west-2): +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` + +dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the endpoint is available. + +## Create Connection in dbt Cloud + +Once dbt Cloud support completes the configuration, you can start creating new connections using PrivateLink. + +1. Navigate to **settings** → **Create new project** → select **PostgreSQL** +2. You will see two radio buttons: **Public** and **Private.** Select **Private**. +3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). +4. Configure the remaining data platform details. +5. Test your connection and save it. 
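To make step 1 above more concrete, here is a rough sketch of locating an RDS instance's IP and registering it in a TCP target group with the AWS CLI. Every identifier shown (the endpoint, VPC ID, and IP) is a placeholder, and the NLB and VPC endpoint service still need to be created afterwards as described in the list above.

```shell
# Resolve the current IP behind the RDS endpoint (placeholder endpoint name)
dig +short mydb.abc123example.us-east-1.rds.amazonaws.com

# Create a TCP target group on port 5432 and capture its ARN (placeholder VPC ID)
TG_ARN=$(aws elbv2 create-target-group \
  --name postgres-privatelink-tg \
  --protocol TCP --port 5432 --target-type ip \
  --vpc-id vpc-0123456789abcdef0 \
  --query 'TargetGroups[0].TargetGroupArn' --output text)

# Register the resolved IP as a target (placeholder IP)
aws elbv2 register-targets --target-group-arn "$TG_ARN" --targets Id=10.0.1.25
```

Remember that with RDS Multi-AZ failover the underlying IP can change, so the registered target may need to be updated over time, as noted above.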
diff --git a/website/docs/docs/cloud/secure/redshift-privatelink.md b/website/docs/docs/cloud/secure/redshift-privatelink.md index b8c357825f8..3ed49e7bb34 100644 --- a/website/docs/docs/cloud/secure/redshift-privatelink.md +++ b/website/docs/docs/cloud/secure/redshift-privatelink.md @@ -11,6 +11,10 @@ AWS provides two different ways to create a PrivateLink VPC endpoint for a Redsh dbt Cloud supports both types of endpoints, but there are a number of [considerations](https://docs.aws.amazon.com/redshift/latest/mgmt/managing-cluster-cross-vpc.html#managing-cluster-cross-vpc-considerations) to take into account when deciding which endpoint type to use. Redshift-managed provides a far simpler setup with no additional cost, which might make it the preferred option for many, but may not be an option in all environments. Based on these criteria, you will need to determine which is the right type for your system. Follow the instructions from the section below that corresponds to your chosen endpoint type. +:::note Redshift Serverless +While Redshift Serverless does support Redshift-managed type VPC endpoints, this functionality is not currently available across AWS accounts. Due to this limitation, an Interface-type VPC endpoint service must be used for Redshift Serverless cluster PrivateLink connectivity from dbt Cloud. +::: + ## Configuring Redshift-managed PrivateLink 1. On the running Redshift cluster, select the **Properties** tab. @@ -51,11 +55,19 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS - Target Type: **IP** - Notes: - - Use IP addresses from the Redshift cluster’s **Network Interfaces**, _not_ IPs listed in the **Node IP addresses** section as those can change. - + - **Standard Redshift** + + - Use IP addresses from the Redshift cluster’s **Network Interfaces** whenever possible. While IPs listed in the **Node IP addresses** section will work, they are also more likely to change. + + + - There will likely be only one Network Interface (NI) to start, but if the cluster fails over to another availability zone (AZ), a new NI will also be created for that AZ. The NI IP from the original AZ will still work, but the new NI IP can also be added to the Target Group. If adding additional IPs, note that the NLB will also need to add the corresponding AZ. Once created, the NI(s) should stay the same (This is our observation from testing, but AWS does not officially document it). + + - **Redshift Serverless** + + - To find the IP addresses for Redshift Serverless instance locate and copy the endpoint (only the URL listed before the port) in the Workgroup configuration section of the AWS console for the instance. + - - There is likely only one Network Interface (NI) to start, but if the cluster fails over to another availability zone (AZ), a new NI will be created for that AZ as well. The NI IP from the original AZ should still work, but the new NI IP can also be added to the Target Group if desired. If adding additional IPs, note that the NLB will need to add the corresponding AZ as well. Once created, the NI(s) shouldn't change (NOTE: this is our observation from testing, but is not officially documented by AWS). + - From a command line run the command `nslookup ` using the endpoint found in the previous step and use the associated IP(s) for the Target Group. 
- Target Group protocol: **TCP** @@ -67,7 +79,7 @@ Creating an Interface VPC PrivateLink connection requires creating multiple AWS On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. - - Principal: `arn:aws:iam::346425330055:root` + - Principal: `arn:aws:iam::346425330055:role/MTPL_Admin` diff --git a/website/docs/docs/cloud/secure/snowflake-privatelink.md b/website/docs/docs/cloud/secure/snowflake-privatelink.md index 16138e7e86d..bbbdf04ddf0 100644 --- a/website/docs/docs/cloud/secure/snowflake-privatelink.md +++ b/website/docs/docs/cloud/secure/snowflake-privatelink.md @@ -7,6 +7,14 @@ sidebar_label: "PrivateLink for Snowflake" The following steps will walk you through the setup of a Snowflake AWS PrivateLink endpoint in the dbt Cloud multi-tenant environment. +:::note Snowflake SSO with PrivateLink +Users connecting to Snowflake using SSO over a PrivateLink connection from dbt Cloud will also require access to a PrivateLink endpoint from their local workstation. + +>Currently, for any given Snowflake account, SSO works with only one account URL at a time: either the public account URL or the URL associated with the private connectivity service. + +- [Snowflake SSO with Private Connectivity](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-overview#label-sso-private-connectivity) +::: + ## Configure PrivateLink 1. Open a Support case with Snowflake to allow access from the dbt Cloud AWS account @@ -25,8 +33,10 @@ The following steps will walk you through the setup of a Snowflake AWS PrivateLi Subject: New Multi-Tenant PrivateLink Request - Type: Snowflake - SYSTEM$GET_PRIVATELINK_CONFIG output: +- *Use privatelink-account-url or regionless-privatelink-account-url?: - dbt Cloud multi-tenant environment (US, EMEA, AU): ``` +_*By default dbt Cloud will be configured to use `privatelink-account-url` from the provided [SYSTEM$GET_PRIVATELINK_CONFIG](https://docs.snowflake.com/en/sql-reference/functions/system_get_privatelink_config.html) as the PrivateLink endpoint. Upon request, `regionless-privatelink-account-url` can be used instead._ dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the endpoint is available. diff --git a/website/docs/docs/collaborate/documentation.md b/website/docs/docs/collaborate/documentation.md index b613fd7a5ef..429b5187152 100644 --- a/website/docs/docs/collaborate/documentation.md +++ b/website/docs/docs/collaborate/documentation.md @@ -147,7 +147,6 @@ as well as the repo for this project \[here](https://github.com/dbt-labs/mrr-pla ### Custom project-level overviews -New in v0.18.0 You can set different overviews for each dbt project/package included in your documentation site by creating a docs block named `__[project_name]__`. For example, in order to define diff --git a/website/docs/docs/collaborate/explore-projects.md b/website/docs/docs/collaborate/explore-projects.md new file mode 100644 index 00000000000..a4c914259ef --- /dev/null +++ b/website/docs/docs/collaborate/explore-projects.md @@ -0,0 +1,142 @@ +--- +title: "Explore your dbt projects (beta)" +sidebar_label: "Explore dbt projects (beta)" +description: "Learn about dbt Explorer and how to interact with it to understand, improve, and leverage your data pipelines." 
+--- + +With dbt Explorer, you can view your project's [resources](/docs/build/projects) (such as models, tests, and metrics) and their lineage to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help your data consumers discover and leverage your dbt resources. + +To display the details about your [project state](/docs/dbt-cloud-apis/project-state), dbt Explorer utilizes the metadata provided through the [Discovery API](/docs/dbt-cloud-apis/discovery-api). The metadata that's available on your project depends on the [deployment environment](/docs/deploy/deploy-environments) you've designated as _production_ in your dbt Cloud project. dbt Explorer automatically retrieves the metadata updates after each job run in the production deployment environment so it will always have the latest state on your project. The metadata it displays depends on the [commands executed by the jobs](/docs/deploy/job-commands). For instance: + +- To update model details or results, you must run `dbt run` or `dbt build` on a given model within a job in the environment. +- To view catalog statistics and columns, you must run `dbt docs generate` within a job in the environment. +- To view test results, you must run `dbt test` or `dbt build` within a job in the environment. +- To view source freshness check results, you must run `dbt source freshness` within a job in the environment. + +The need to run these commands will diminish, and richer, more timely metadata will become available as the Discovery API and its underlying platform evolve. + +:::tip Join the beta + +dbt Explorer is a [beta feature](/docs/dbt-versions/product-lifecycles#dbt-cloud) and subject to change without notification. More updates to this feature coming soon. + +If you’re interested in joining the beta, please contact your account team. + +::: + +## Prerequisites + +- You have a [multi-tenant](/docs/cloud/about-cloud/tenancy#multi-tenant) or AWS single-tenant dbt Cloud account on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You have set up a [production deployment environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) for each project you want to explore. + - There has been at least one successful job run in the production deployment environment. +- You are on the dbt Explorer page. This requires the feature to be enabled for your account. + - To go to the page, select **Explore (Beta)** from the top navigation bar in dbt Cloud. + +## Explore the project’s lineage + +dbt Explorer provides a visualization of your project’s DAG that you can interact with. To start, select **Overview** in the left sidebar and click the **Explore Lineage** button on the main (center) section of the page. + +If you don't see the lineage graph immediately, click **Render Lineage**. It can take some time for the graph to render depending on the size of your project and your computer’s available memory. The graph of very large projects might not render so, instead, you can select a subset of nodes by using selectors. + +The nodes in the lineage graph represent the project’s resources and the edges represent the relationships between the nodes. Resources like tests and macros display in the lineage within their [resource details pages](#view-resource-details) but not within the overall project lineage graph. Nodes are color-coded and include iconography according to their resource type. 
+ +To interact with the lineage graph, you can: + +- Hover over any item in the graph to display the resource’s name and type. +- Zoom in and out on the graph by mouse-scrolling. +- Grab and move the graph. +- Click on a resource to highlight its relationship with other resources in your project. +- [Search and select specific resources](#search-resources) or a subset of the DAG using selectors and lineage (for example, `+[YOUR_RESOURCE_NAME]` displays all nodes upstream of a particular resource). +- [View resource details](#view-resource-details) by selecting a node in the graph (double-clicking). + + + + + +## Search for resources {#search-resources} +With the search bar (on the upper left of the page or in a lineage graph), you can search using keywords or selectors (also known as *selector methods*). The resources that match your search criteria will display as a table in the main section of the page. When you select a resource in the table, its [resource details page](#view-resource-details) will display. + +When using keyword search, dbt Explorer will search through your resources using metadata such as resource type, resource name, column name, source name, tags, schema, database, version, alias/identifier, and package name. + +When using selector search, you can utilize the dbt node selection syntax including set and graph operators (like `+`). To learn more about selectors, refer to [Syntax overview](/reference/node-selection/syntax), [Graph operators](/reference/node-selection/graph-operators), and [Set operators](/reference/node-selection/set-operators). + +Below are the selection methods currently available in dbt Explorer. For more information about each of them, refer to [Methods](/reference/node-selection/methods). + +- **fqn:** — Find resources by [file or fully qualified name](/reference/node-selection/methods#the-file-or-fqn-method). +- **source:** — Find resources by a specified [source](/reference/node-selection/methods#the-source-method). +- **resource_type:** — Find resources by their [type](/reference/node-selection/methods#the-resource_type-method). +- **package:** — Find resources by the [dbt package](/reference/node-selection/methods#the-package-method) that defines them. +- **tag:** — Find resources by a specified [tag](/reference/node-selection/methods#the-tag-method). + + + +- **group:** — Find models defined within a specified [group](/reference/node-selection/methods#the-group-method). +- **access:** — Find models based on their [access](/reference/node-selection/methods#the-access-method) property. + + + + + +## Use the catalog sidebar + +By default, the catalog sidebar lists all your project’s resources. Select any resource type in the list and all those resources in the project will display as a table in the main section of the page. For a description on the different resource types (like models, metrics, and so on), refer to [About dbt projects](https://docs.getdbt.com/docs/build/projects). + +To browse using a different view, you can choose one of these options from the **View by** dropdown: + +- **Resources** (default) — All resources in the project organized by type. +- **Packages** — All resources in the project organized by the project in which they are defined. +- **File Tree** — All resources in the project organized by the file in which they are defined. This mirrors the file tree in your dbt project repository. +- **Database** — All resources in the project organized by the database and schema in which they are built. 
This mirrors your data platform structure. + + + +## View resource details {#view-resource-details} +You can view the definition and latest run results of any resource in your project. To find a resource and view its details, you can interact with the lineage graph, use search, or browse the catalog. The details (metadata) available to you depends on the resource’s type, its definition, and the [commands](/docs/deploy/job-commands) run within jobs in the production environment. + + + + + +### Example of model details + +An example of the details you might get for a model: + +- **General** — The model’s lineage graph that you can interact with. +- **Code** — The source code and compiled code for the model. +- **Columns** — The available columns in the model. +- **Description** — A [description of the model](/docs/collaborate/documentation#adding-descriptions-to-your-project). +- **Recent** — Information on the last time the model ran, how long it ran for, whether the run was successful, the job ID, and the run ID. +- **Tests** — [Tests](/docs/build/tests) for the model. +- **Details** — Key properties like the model’s relation name (for example, how it’s represented and how you can query it in the data platform: `database.schema.identifier`); model governance attributes like access, group, and if contracted; and more. +- **Relationships** — The nodes the model **Depends On** and is **Referenced by.** + +### Example of exposure details + +An example of the details you might get for an exposure: + +- **Status** — The status on data freshness and data quality. +- **Lineage** — The exposure’s lineage graph. +- **Description** — A description of the exposure. +- **Details** — Details like exposure type, maturity, owner information, and more. +- **Relationships** — The nodes the exposure **Depends On**. + +### Example of test details + +An example of the details you might get for a test: + +- **General** — The test’s lineage graph that you can interact with. +- **Code** — The source code and compiled code for the test. +- **Description** — A description of the test. +- **Recent** — Information on the last time the test ran, how long it ran for, whether the test passed, the job ID, and the run ID. +- **Details** — Details like schema, severity, package, and more. +- **Relationships** — The nodes the test **Depends On**. + +### Example of source details + +An example of the details you might get for each source table within a source collection: + +- **General** — The source’s lineage graph that you can interact with. +- **Columns** — The available columns in the source. +- **Description** — A description of the source. +- **Source freshness** — Information on whether refreshing the data was successful, the last time the source was loaded, the timestamp of when a run generated data, and the run ID. +- **Details** — Details like database, schema, and more. +- **Relationships** — A table that lists all the sources used with their freshness status, the timestamp of when freshness was last checked, and the timestamp of when the source was last loaded. \ No newline at end of file diff --git a/website/docs/docs/collaborate/git/managed-repository.md b/website/docs/docs/collaborate/git/managed-repository.md index d7beb38c4f5..db8e9840ccd 100644 --- a/website/docs/docs/collaborate/git/managed-repository.md +++ b/website/docs/docs/collaborate/git/managed-repository.md @@ -15,6 +15,6 @@ To set up a project with a managed repository: 6. Click **Create**. -dbt Cloud will host and manage this repository for you. 
If in the future you choose to host this repository yourself, you can contact support to have the contents of your repo transferred to you. +dbt Cloud will host and manage this repository for you. If in the future you choose to host this repository elsewhere, you can export the information from dbt Cloud at any time. ** We do not recommend using a managed repository in a production environment. You will not be able to use git features like pull requests which are part of our recommended version control best practices. diff --git a/website/docs/docs/collaborate/git/pr-template.md b/website/docs/docs/collaborate/git/pr-template.md index 83d620b7af9..ddb4948dad9 100644 --- a/website/docs/docs/collaborate/git/pr-template.md +++ b/website/docs/docs/collaborate/git/pr-template.md @@ -72,7 +72,7 @@ https://gitlab.com///-/merge_requests/new?merge_request[source_branch ### BitBucket ``` -https://bitbucket.org///pull-requests/new?source={{source}} +https://bitbucket.org///pull-requests/new?source={{source}}&dest={{destination}} ``` ### AWS CodeCommit diff --git a/website/docs/docs/collaborate/govern/model-access.md b/website/docs/docs/collaborate/govern/model-access.md index 970f25ef87f..64b70416a2f 100644 --- a/website/docs/docs/collaborate/govern/model-access.md +++ b/website/docs/docs/collaborate/govern/model-access.md @@ -29,7 +29,7 @@ The two concepts will be closely related, as we develop multi-project collaborat ## Groups -Models can be grouped under a common designation with a shared owner. For example, you could group together all models owned by a particular team, related to modeling a specific data source (`github`), or +Models can be grouped under a common designation with a shared owner. For example, you could group together all models owned by a particular team, or related to modeling a specific data source (`github`). Why define model `groups`? There are two reasons: - It turns implicit relationships into an explicit grouping, with a defined owner. By thinking about the interface boundaries _between_ groups, you can have a cleaner (less entangled) DAG. In the future, those interface boundaries could be appropriate as the interfaces between separate projects. diff --git a/website/docs/docs/collaborate/govern/model-contracts.md b/website/docs/docs/collaborate/govern/model-contracts.md index 97667996194..442a20df1b6 100644 --- a/website/docs/docs/collaborate/govern/model-contracts.md +++ b/website/docs/docs/collaborate/govern/model-contracts.md @@ -86,6 +86,91 @@ When building a model with a defined contract, dbt will do two things differentl 1. dbt will run a "preflight" check to ensure that the model's query will return a set of columns with names and data types matching the ones you have defined. This check is agnostic to the order of columns specified in your model (SQL) or YAML spec. 2. dbt will include the column names, data types, and constraints in the DDL statements it submits to the data platform, which will be enforced while building or updating the model's table. +## Platform constraint support + +Select the adapter-specific tab for more information on [constraint](/reference/resource-properties/constraints) support across platforms. Constraints fall into three categories based on support and platform enforcement: + +- **Supported and enforced** — The model won't build if it violates the constraint. +- **Supported and not enforced** — The platform supports specifying the type of constraint, but a model can still build even if building the model violates the constraint. 
This constraint exists for metadata purposes only. This is common for modern cloud data warehouses and less common for legacy databases. +- **Not supported and not enforced** — You can't specify the type of constraint for the platform. + + + + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:-----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ✅ Enforced | +| foreign_key | ✅ Supported | ✅ Enforced | +| unique | ❌ Not supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:--------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ✅ Enforced | +| foreign_key | ✅ Supported | ✅ Enforced | +| unique | ✅ Supported | ✅ Enforced | +| check | ✅ Supported | ✅ Enforced | + + + + +Currently, `not_null` and `check` constraints are supported and enforced only after a model builds. Because of this platform limitation, dbt considers these constraints `supported` but `not enforced`, which means they're not part of the "model contract" since these constraints can't be enforced at build time. This table will change as the features evolve. + +| Constraint type | Support | Platform enforcement | +|:----------------|:------------|:---------------------| +| not_null | ✅ Supported | ❌ Not enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ✅ Supported | ❌ Not enforced | + + + + +Currently, `not_null` and `check` constraints are supported and enforced only after a model builds. Because of this platform limitation, dbt considers these constraints `supported` but `not enforced`, which means they're not part of the "model contract" since these constraints can't be enforced at build time. This table will change as the features evolve. + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ❌ Not enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ✅ Supported | ❌ Not enforced | + + + + + ## FAQs ### Which models should have contracts? @@ -98,7 +183,7 @@ Any model meeting the criteria described above _can_ define a contract. We recom A model's contract defines the **shape** of the returned dataset. If the model's logic or input data doesn't conform to that shape, the model does not build. -[Tests](docs/build/tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the test. 
Tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). +[Tests](/docs/build/tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the test. Tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). In some cases, you can replace a test with its equivalent constraint. This has the advantage of guaranteeing the validation at build time, and it probably requires less compute (cost) in your data platform. The prerequisites for replacing a test with a constraint are: - Making sure that your data platform can support and enforce the constraint that you need. Most platforms only enforce `not_null`. @@ -107,8 +192,21 @@ In some cases, you can replace a test with its equivalent constraint. This has t **Why aren't tests part of the contract?** In a parallel for software APIs, the structure of the API response is the contract. Quality and reliability ("uptime") are also very important attributes of an API's quality, but they are not part of the contract per se. When the contract changes in a backwards-incompatible way, it is a breaking change that requires a bump in major version. -### Can I define a "partial" contract? +### Do I need to define every column for a contract? Currently, dbt contracts apply to **all** columns defined in a model, and they require declaring explicit expectations about **all** of those columns. The explicit declaration of a contract is not an accident—it's very much the intent of this feature. -We are investigating the feasibility of supporting "inferred" or "partial" contracts in the future. This would enable you to define constraints and strict data typing for a subset of columns, while still detecting breaking changes on other columns by comparing against the same model in production. If you're interested, please upvote or comment on [dbt-core#7432](https://github.com/dbt-labs/dbt-core/issues/7432). +At the same time, for models with many columns, we understand that this can mean a _lot_ of yaml. We are investigating the feasibility of supporting "inferred" contracts. This would enable you to define constraints and strict data typing for a subset of columns, while still detecting breaking changes on other columns by comparing against the same model in production. This isn't the same as a "partial" contract, because all columns in the model are still checked at runtime, and matched up with what's defined _explicitly_ in your yaml contract or _implicitly_ with the comparison state. If you're interested in "inferred" contract, please upvote or comment on [dbt-core#7432](https://github.com/dbt-labs/dbt-core/issues/7432). + + +### How are breaking changes handled? + +When comparing to a previous project state, dbt will look for breaking changes that could impact downstream consumers. If breaking changes are detected, dbt will present a contract error. 
+ +Breaking changes include: +- Removing an existing column +- Changing the `data_type` of an existing column +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) + +More details are available in the [contract reference](/reference/resource-configs/contract#detecting-breaking-changes). + diff --git a/website/docs/docs/collaborate/govern/model-versions.md b/website/docs/docs/collaborate/govern/model-versions.md index 12599d0b65f..49ed65f9a36 100644 --- a/website/docs/docs/collaborate/govern/model-versions.md +++ b/website/docs/docs/collaborate/govern/model-versions.md @@ -3,20 +3,28 @@ title: "Model versions" id: model-versions sidebar_label: "Model versions" description: "Version models to help with lifecycle management" +keyword: governance, model version, model versioning, dbt model versioning --- + :::info New functionality This functionality is new in v1.5 — if you have thoughts, participate in [the discussion on GitHub](https://github.com/dbt-labs/dbt-core/discussions/6736)! ::: + + +import VersionsCallout from '/snippets/_version-callout.md'; + + + Versioning APIs is a hard problem in software engineering. The root of the challenge is that the producers and consumers of an API have competing incentives: - Producers of an API need the ability to modify its logic and structure. There is a real cost to maintaining legacy endpoints forever, but losing the trust of downstream users is far costlier. - Consumers of an API need to trust in its stability: their queries will keep working, and won't break without warning. Although migrating to a newer API version incurs an expense, an unplanned migration is far costlier. When sharing a final dbt model with other teams or systems, that model is operating like an API. When the producer of that model needs to make significant changes, how can they avoid breaking the queries of its users downstream? -Model versioning is a tool to tackle this problem, thoughtfully and head-on. The goal of is not to make the problem go away entirely, nor to pretend it's easier or simpler than it is. +Model versioning is a tool to tackle this problem, thoughtfully and head-on. The goal is not to make the problem go away entirely, nor to pretend it's easier or simpler than it is. ## Related documentation - [`versions`](/reference/resource-properties/versions) diff --git a/website/docs/docs/collaborate/govern/project-dependencies.md b/website/docs/docs/collaborate/govern/project-dependencies.md index 158c405e4a7..1dbc967e74e 100644 --- a/website/docs/docs/collaborate/govern/project-dependencies.md +++ b/website/docs/docs/collaborate/govern/project-dependencies.md @@ -7,6 +7,12 @@ description: "Reference public models across dbt projects" :::caution Closed Beta - dbt Cloud Enterprise "Project" dependencies and cross-project `ref` are features of dbt Cloud Enterprise, currently in Closed Beta. To access these features while they are in beta, please contact your account team at dbt Labs. + +**Prerequisites:** In order to add project dependencies and resolve cross-project `ref`, you must: +- Have the feature enabled (speak to your account team) +- Use dbt v1.6 for **both** the upstream ("producer") project and the downstream ("consumer") project. 
+- Have a deployment environment in the upstream ("producer") project [that is set to be your production environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) +- Have a successful run of the upstream ("producer") project ::: For a long time, dbt has supported code reuse and extension by installing other projects as [packages](/docs/build/packages). When you install another project as a package, you are pulling in its full source code, and adding it to your own. This enables you to call macros and run models defined in that other project. diff --git a/website/docs/docs/community-adapters.md b/website/docs/docs/community-adapters.md index 6569a78459b..87d1bd4981e 100644 --- a/website/docs/docs/community-adapters.md +++ b/website/docs/docs/community-adapters.md @@ -15,6 +15,6 @@ Community adapters are adapter plugins contributed and maintained by members of | [Dremio](/docs/core/connect-data-platform/dremio-setup) | [Layer](/docs/core/connect-data-platform/layer-setup) | [Teradata](/docs/core/connect-data-platform/teradata-setup) | | [Exasol Analytics](/docs/core/connect-data-platform/exasol-setup) | [Materialize](/docs/core/connect-data-platform/materialize-setup) | [TiDB](/docs/core/connect-data-platform/tidb-setup) | | [Firebolt](/docs/core/connect-data-platform/firebolt-setup) | [MindsDB](/docs/core/connect-data-platform/mindsdb-setup) | [Vertica](/docs/core/connect-data-platform/vertica-setup) | -| [AWS Glue](/docs/core/connect-data-platform/glue-setup) | [MySQL](/docs/core/connect-data-platform/mysql-setup)| | +| [AWS Glue](/docs/core/connect-data-platform/glue-setup) | [MySQL](/docs/core/connect-data-platform/mysql-setup)| [Upsolver](/docs/core/connect-data-platform/upsolver-setup) | | [Databend Cloud](/docs/core/connect-data-platform/databend-setup) | [fal - Python models](/docs/core/connect-data-platform/fal-setup) | | diff --git a/website/docs/docs/connect-adapters.md b/website/docs/docs/connect-adapters.md index 5632fb3793e..f45da732abb 100644 --- a/website/docs/docs/connect-adapters.md +++ b/website/docs/docs/connect-adapters.md @@ -5,32 +5,18 @@ id: "connect-adapters" Adapters are an essential component of dbt. At their most basic level, they are how dbt connects with the various supported data platforms. At a higher-level, adapters strive to give analytics engineers more transferrable skills as well as standardize how analytics projects are structured. Gone are the days where you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt — for more detail, read the [What are adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) guide. -This section provides more details on different ways you can connect dbt to an adapter, and explains what a maintainer is. +This section provides more details on different ways you can connect dbt to an adapter, and explains what a maintainer is. ### Set up in dbt Cloud -Explore the fastest and most reliable way to deploy dbt using dbt Cloud, a hosted architecture that runs dbt Core across your organization. dbt Cloud lets you seamlessly [connect](/docs/cloud/about-cloud-setup) with a variety of [verified](/docs/supported-data-platforms) data platform providers directly in the dbt Cloud UI. - -dbt Cloud supports data platforms that are verified and [maintained](#maintainers) by dbt Labs or partners. This level of support ensures that users can trust certain adapters for use in production. 
+Explore the fastest and most reliable way to deploy dbt using dbt Cloud, a hosted architecture that runs dbt Core across your organization. dbt Cloud lets you seamlessly [connect](/docs/cloud/about-cloud-setup) with a variety of [verified](/docs/supported-data-platforms) data platform providers directly in the dbt Cloud UI. ### Install using the CLI -Install dbt Core, which is an open-source tool, locally using the CLI. dbt communicates with a number of different data platforms by using a dedicated adapter plugin for each. When you install dbt Core, you'll also need to install the specific adapter for your database, [connect to dbt Core](/docs/core/about-core-setup), and set up a `profiles.yml` file. - -Data platforms supported in dbt Core may be verified or unverified, and are [maintained](#maintainers) by dbt Labs, partners, or community members. +Install dbt Core, which is an open-source tool, locally using the CLI. dbt communicates with a number of different data platforms by using a dedicated adapter plugin for each. When you install dbt Core, you'll also need to install the specific adapter for your database, [connect to dbt Core](/docs/core/about-core-setup), and set up a `profiles.yml` file. With a few exceptions [^1], you can install all [Verified adapters](/docs/supported-data-platforms) from PyPI using `pip install adapter-name`. For example to install Snowflake, use the command `pip install dbt-snowflake`. The installation will include `dbt-core` and any other required dependencies, which may include both other dependencies and even other adapter plugins. Read more about [installing dbt](/docs/core/installation). - -## Maintainers - -Who made and maintains an adapter is certainly relevant, but we recommend using an adapter's verification status to determine the quality and health of an adapter. So far there are three categories of maintainers: - -| Supported by | Maintained By | -| ------------ | ---------------- | -| dbt Labs | dbt Labs maintains a set of adapter plugins for some of the most common databases, warehouses, and platforms. As for why particular data platforms were chosen, see ["Why Verify an Adapter"](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter#why-verify-an-adapter) | -| Partner | These adapter plugins are built and maintained by the same people who build and maintain the complementary data technology. | -| Community | These adapter plugins are contributed and maintained by members of the community. 🌱 | [^1]: Here are the two different adapters. Use the PyPI package name when installing with `pip` | Adapter repo name | PyPI package name | diff --git a/website/docs/docs/core/connect-data-platform/bigquery-setup.md b/website/docs/docs/core/connect-data-platform/bigquery-setup.md index b0fc9fa7cf0..7a2a445be3f 100644 --- a/website/docs/docs/core/connect-data-platform/bigquery-setup.md +++ b/website/docs/docs/core/connect-data-platform/bigquery-setup.md @@ -11,7 +11,7 @@ meta: min_supported_version: 'n/a' slack_channel_name: '#db-bigquery' slack_channel_link: 'https://getdbt.slack.com/archives/C99SNSRTK' - platform_name: 'Big Query' + platform_name: 'BigQuery' config_page: '/reference/resource-configs/bigquery-configs' --- @@ -84,8 +84,6 @@ my-bigquery-db: **Default project** -New in dbt v0.19.0 - If you do not specify a `project`/`database` and are using the `oauth` method, dbt will use the default `project` associated with your user, as defined by `gcloud config set`. 
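For illustration, a minimal `oauth` target that relies on that default might look like the sketch below (the profile, dataset, and thread values are placeholders):

```yaml
my-bigquery-db:
  target: dev
  outputs:
    dev:
      type: bigquery
      method: oauth
      # `project` is intentionally omitted, so dbt falls back to the
      # default project configured with `gcloud config set`
      dataset: my_dataset
      threads: 4
```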
### OAuth Token-Based @@ -233,8 +231,6 @@ my-profile: ### Timeouts and Retries - - The `dbt-bigquery` plugin uses the BigQuery Python client library to submit queries. Each query requires two steps: 1. Job creation: Submit the query job to BigQuery, and receive its job ID. 2. Job execution: Wait for the query job to finish executing, and receive its result. @@ -251,11 +247,17 @@ In older versions of `dbt-bigquery`, this same config was called `timeout_second ::: -No timeout is set by default. (For historical reasons, some query types use a default of 300 seconds when the `job_execution_timeout_seconds` configuration is not set.) When `job_execution_timeout_seconds` is set, if any dbt query, including a model's SQL transformation, takes longer than 300 seconds to complete, BigQuery might cancel the query and issue the following error: +No timeout is set by default. (For historical reasons, some query types use a default of 300 seconds when the `job_execution_timeout_seconds` configuration is not set). When you do set the `job_execution_timeout_seconds`, if any dbt query takes more than 300 seconds to finish, the dbt-bigquery adapter will run into an exception: ``` Operation did not complete within the designated timeout. ``` + +:::caution Note + +The `job_execution_timeout_seconds` represents the number of seconds to wait for the [underlying HTTP transport](https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result). It _doesn't_ represent the maximum allowable time for a BigQuery job itself. So, if dbt-bigquery ran into an exception at 300 seconds, the actual BigQuery job could still be running for the time set in BigQuery's own timeout settings. + +::: You can change the timeout seconds for the job execution step by configuring `job_execution_timeout_seconds` in the BigQuery profile: @@ -315,57 +317,6 @@ my-profile: - - - - -BigQuery supports query timeouts. By default, the timeout is set to 300 seconds. If a dbt model takes longer than this timeout to complete, then BigQuery may cancel the query and issue the following error: - -``` - Operation did not complete within the designated timeout. -``` - -To change this timeout, use the `timeout_seconds` configuration: - - - -```yaml -my-profile: - target: dev - outputs: - dev: - type: bigquery - method: oauth - project: abc-123 - dataset: my_dataset - timeout_seconds: 600 # 10 minutes -``` - - - -The `retries` profile configuration designates the number of times dbt should retry queries that result in unhandled server errors. This configuration is only specified for BigQuery targets. Example: - - - -```yaml -# This example target will retry BigQuery queries 5 -# times with a delay. If the query does not succeed -# after the fifth attempt, then dbt will raise an error - -my-profile: - target: dev - outputs: - dev: - type: bigquery - method: oauth - project: abc-123 - dataset: my_dataset - retries: 5 -``` - - - - ### Dataset locations @@ -387,12 +338,6 @@ my-profile: ### Maximum Bytes Billed - - -- New in dbt v0.17.0 - - - When a `maximum_bytes_billed` value is configured for a BigQuery profile, queries executed by dbt will fail if they exceed the configured maximum bytes threshhold. This configuration should be supplied as an integer number @@ -439,7 +384,6 @@ my-profile: ``` ### Service Account Impersonation -New in v0.18.0 This feature allows users authenticating via local OAuth to access BigQuery resources based on the permissions of a service account. 
@@ -461,7 +405,6 @@ For a general overview of this process, see the official docs for [Creating Shor ### Execution project -New in v0.21.0 By default, dbt will use the specified `project`/`database` as both: 1. The location to materialize resources (models, seeds, snapshots, etc), unless they specify a custom `project`/`database` config diff --git a/website/docs/docs/core/connect-data-platform/databricks-setup.md b/website/docs/docs/core/connect-data-platform/databricks-setup.md index 0d24a3b04aa..caf52d09de3 100644 --- a/website/docs/docs/core/connect-data-platform/databricks-setup.md +++ b/website/docs/docs/core/connect-data-platform/databricks-setup.md @@ -31,8 +31,6 @@ meta:
  • Minimum data platform version: {frontMatter.meta.min_supported_version}
  • -## Installation and Distribution -

    Installing {frontMatter.meta.pypi_package}

    @@ -48,17 +46,27 @@ pip is the easiest way to install the adapter:

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    -`dbt-databricks` is the recommend adapter for Databricks - -`dbt-databricks` includes features not available in `dbt-spark`: +`dbt-databricks` is the recommended adapter for Databricks. It includes features not available in `dbt-spark`, such as: - Unity Catalog support - No need to install additional drivers or dependencies for use on the CLI - Use of Delta Lake for all models out of the box - SQL macros that are optimized to run with [Photon](https://docs.databricks.com/runtime/photon.html) -### Set up a Databricks Target +## Connecting to Databricks + +To connect to a data platform with dbt Core, create the appropriate _profile_ and _target_ YAML keys/values in the `profiles.yml` configuration file for your Databricks SQL Warehouse/cluster. This dbt YAML file lives in the `.dbt/` directory of your user/home directory. For more info, refer to [Connection profiles](/docs/core/connect-data-platform/connection-profiles) and [profiles.yml](/docs/core/connect-data-platform/profiles.yml). + +`dbt-databricks` can connect to Databricks SQL Warehouses and all-purpose clusters. Databricks SQL Warehouses is the recommended way to get started with Databricks. + +Refer to the [Databricks docs](https://docs.databricks.com/dev-tools/dbt.html#) for more info on how to obtain the credentials for configuring your profile. + +### Examples {#examples} + +You can use either token-based authentication or OAuth client-based authentication to connect to Databricks. Refer to the following examples for more info on how to configure your profile for each type of authentication. + + -dbt-databricks can connect to the Databricks SQL Warehouses and all-purpose clusters. Databricks SQL Warehouses is the recommended way to get started with Databricks. + @@ -69,19 +77,76 @@ your_profile_name: dev: type: databricks catalog: [optional catalog name if you are using Unity Catalog] - schema: [schema name] - host: [yourorg.databrickshost.com] - http_path: [/sql/your/http/path] - token: [dapiXXXXXXXXXXXXXXXXXXXXXXX] # Personal Access Token (PAT) - threads: [1 or more] # optional, default 1 + schema: [schema name] # Required + host: [yourorg.databrickshost.com] # Required + http_path: [/sql/your/http/path] # Required + token: [dapiXXXXXXXXXXXXXXXXXXXXXXX] # Required Personal Access Token (PAT) if using token-based authentication + threads: [1 or more] # Optional, default 1 ``` -See the [Databricks documentation](https://docs.databricks.com/dev-tools/dbt.html#) on how -to obtain the credentials for configuring your profile. + + + + + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: databricks + catalog: [optional catalog name if you are using Unity Catalog] + schema: [schema name] # Required + host: [yourorg.databrickshost.com] # Required + http_path: [/sql/your/http/path] # Required + auth_type: oauth # Required if using OAuth-based authentication + client_id: [OAuth-Client-ID] # The ID of your OAuth application. Required if using OAuth-based authentication + client_secret: [XXXXXXXXXXXXXXXXXXXXXXXXXXX] # OAuth client secret. # Required if using OAuth-based authentication + threads: [1 or more] # Optional, default 1 +``` + + + + + + +## Host parameters + +The following profile fields are always required. + +| Field | Description | Example | +| --------- | ------- | ----------- | +| `host` | The hostname of your cluster.

    Don't include the `http://` or `https://` prefix. | `yourorg.databrickshost.com` | +| `http_path` | The http path to your SQL Warehouse or all-purpose cluster. | `/sql/your/http/path` | +| `schema` | The name of a schema within your cluster's catalog.

    It's _not recommended_ to use schema names that have upper case or mixed case letters. | `my_schema` | + +## Authentication parameters + +The `dbt-databricks` adapter supports both [token-based authentication](/docs/core/connect-data-platform/databricks-setup?tokenoauth=token#examples) and [OAuth client-based authentication](/docs/core/connect-data-platform/databricks-setup?tokenoauth=oauth#examples). + +Refer to the following **required** parameters to configure your profile for each type of authentication: + +| Field | Authentication type | Description | Example | +| --------- | ------- | ----------- | ---- | +| `token` | Token-based | The Personal Access Token (PAT) to connect to Databricks. | `dapiXXXXXXXXX`
    `XXXXXXXXXXXXXX` | +| `client_id` | OAuth-based | The client ID for your Databricks OAuth application.
    | `` | +| `client_secret` | OAuth-based | The client secret for your Databricks OAuth application.
    | `XXXXXXXXXXXXX`
    `XXXXXXXXXXXXXX` | +| `auth_type` | OAuth-based | The type of authorization needed to connect to Databricks.
    | `oauth` | + +## Additional parameters +The following profile fields are optional to set up. They help you configure how your cluster's session and dbt work for your connection. +| Profile field | Description | Example | +| ------------- | ------------------- | --------------- | +| `threads` | The number of threads dbt should use (default is `1`) |`8` | +| `connect_retries` | The number of times dbt should retry the connection to Databricks (default is `1`) |`3` | +| `connect_timeout` | How many seconds before the connection to Databricks should timeout (default behavior is no timeouts) | `1000` | +| `session_properties` | This sets the Databricks session properties used in the connection. Execute `SET -v` to see available options |`ansi_mode: true` | ## Supported Functionality diff --git a/website/docs/docs/core/connect-data-platform/duckdb-setup.md b/website/docs/docs/core/connect-data-platform/duckdb-setup.md index 7896e4abeae..a3fee5a5164 100644 --- a/website/docs/docs/core/connect-data-platform/duckdb-setup.md +++ b/website/docs/docs/core/connect-data-platform/duckdb-setup.md @@ -4,7 +4,7 @@ description: "Read this guide to learn about the DuckDB warehouse setup in dbt." meta: maintained_by: Community authors: 'Josh Wills (https://github.com/jwills)' - github_repo: 'jwills/dbt-duckdb' + github_repo: 'duckdb/dbt-duckdb' pypi_package: 'dbt-duckdb' min_core_version: 'v1.0.1' cloud_support: Not Supported diff --git a/website/docs/docs/core/connect-data-platform/glue-setup.md b/website/docs/docs/core/connect-data-platform/glue-setup.md index e0fb9556853..e56e5bcd902 100644 --- a/website/docs/docs/core/connect-data-platform/glue-setup.md +++ b/website/docs/docs/core/connect-data-platform/glue-setup.md @@ -58,15 +58,14 @@ For further (and more likely up-to-date) info, see the [README](https://github.c ### Configuring your AWS profile for Glue Interactive Session There are two IAM principals used with interactive sessions. -- Client principal: The princpal (either user or role) calling the AWS APIs (Glue, Lake Formation, Interactive Sessions) -from the local client. This is the principal configured in the AWS CLI and likely the same. +- Client principal: The principal (either user or role) calling the AWS APIs (Glue, Lake Formation, Interactive Sessions) +from the local client. This is the principal configured in the AWS CLI and is likely the same. - Service role: The IAM role that AWS Glue uses to execute your session. This is the same as AWS Glue ETL. Read [this documentation](https://docs.aws.amazon.com/glue/latest/dg/glue-is-security.html) to configure these principals. - -You will find bellow a least privileged policy to enjoy all features of **`dbt-glue`** adapter. +You will find below a least privileged policy to enjoy all features of **`dbt-glue`** adapter. 
Please to update variables between **`<>`**, here are explanations of these arguments: @@ -74,7 +73,7 @@ Please to update variables between **`<>`**, here are explanations of these argu |---|---| |region|The region where your Glue database is stored | |AWS Account|The AWS account where you run your pipeline| -|dbt output database|The database updated by dbt (this is the database configured in the profile.yml of your dbt environment)| +|dbt output database|The database updated by dbt (this is the schema configured in the profile.yml of your dbt environment)| |dbt source database|All databases used as source| |dbt output bucket|The bucket name where the data will be generated by dbt (the location configured in the profile.yml of your dbt environment)| |dbt source bucket|The bucket name of source databases (if they are not managed by Lake Formation)| @@ -113,9 +112,19 @@ Please to update variables between **`<>`**, here are explanations of these argu "glue:BatchDeleteTableVersion", "glue:BatchDeleteTable", "glue:DeletePartition", + "glue:GetUserDefinedFunctions", "lakeformation:ListResources", "lakeformation:BatchGrantPermissions", - "lakeformation:ListPermissions" + "lakeformation:ListPermissions", + "lakeformation:GetDataAccess", + "lakeformation:GrantPermissions", + "lakeformation:RevokePermissions", + "lakeformation:BatchRevokePermissions", + "lakeformation:AddLFTagsToResource", + "lakeformation:RemoveLFTagsFromResource", + "lakeformation:GetResourceLFTags", + "lakeformation:ListLFTags", + "lakeformation:GetLFTag", ], "Resource": [ "arn:aws:glue:::catalog", @@ -189,7 +198,7 @@ Please to update variables between **`<>`**, here are explanations of these argu ### Configuration of the local environment -Because **`dbt`** and **`dbt-glue`** adapter are compatible with Python versions 3.8, and 3.9, check the version of Python: +Because **`dbt`** and **`dbt-glue`** adapters are compatible with Python versions 3.7, 3.8, and 3.9, check the version of Python: ```bash $ python3 --version @@ -212,12 +221,17 @@ $ unzip awscliv2.zip $ sudo ./aws/install ``` -Configure the aws-glue-session package +Install boto3 package ```bash $ sudo yum install gcc krb5-devel.x86_64 python3-devel.x86_64 -y $ pip3 install —upgrade boto3 -$ pip3 install —upgrade aws-glue-sessions +``` + +Install the package: + +```bash +$ pip3 install dbt-glue ``` ### Example config @@ -232,7 +246,6 @@ workers: 2 worker_type: G.1X idle_timeout: 10 schema: "dbt_demo" -database: "dbt_demo" session_provisioning_timeout_in_seconds: 120 location: "s3://dbt_demo_bucket/dbt_demo_data" ``` @@ -241,24 +254,788 @@ location: "s3://dbt_demo_bucket/dbt_demo_data" The table below describes all the options. -|Option |Description | Mandatory | -|---|---|---| -|project_name |The dbt project name. This must be the same as the one configured in the dbt project. |yes| -|type |The driver to use. |yes| -|query-comment |A string to inject as a comment in each query that dbt runs. |no| -|role_arn |The ARN of the interactive session role created as part of the CloudFormation template. |yes| -|region |The AWS Region where you run the data pipeline. |yes| -|workers |The number of workers of a defined workerType that are allocated when a job runs. |yes| -|worker_type |The type of predefined worker that is allocated when a job runs. Accepts a value of Standard, G.1X, or G.2X. |yes| -|schema |The schema used to organize data stored in Amazon S3. |yes| -|database |The database in Lake Formation. The database stores metadata tables in the Data Catalog. 
|yes| -|session_provisioning_timeout_in_seconds |The timeout in seconds for AWS Glue interactive session provisioning. |yes| -|location |The Amazon S3 location of your target data. |yes| -|idle_timeout |The AWS Glue session idle timeout in minutes. (The session stops after being idle for the specified amount of time.) |no| -|glue_version |The version of AWS Glue for this session to use. Currently, the only valid options are 2.0 and 3.0. The default value is 2.0. |no| -|security_configuration |The security configuration to use with this session. |no| -|connections |A comma-separated list of connections to use in the session. |no| +| Option | Description | Mandatory | +|-----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| +| project_name | The dbt project name. This must be the same as the one configured in the dbt project. | yes | +| type | The driver to use. | yes | +| query-comment | A string to inject as a comment in each query that dbt runs. | no | +| role_arn | The ARN of the glue interactive session IAM role. | yes | +| region | The AWS Region where you run the data pipeline. | yes | +| workers | The number of workers of a defined workerType that are allocated when a job runs. | yes | +| worker_type | The type of predefined worker that is allocated when a job runs. Accepts a value of Standard, G.1X, or G.2X. | yes | +| schema | The schema used to organize data stored in Amazon S3.Additionally, is the database in AWS Lake Formation that stores metadata tables in the Data Catalog. | yes | +| session_provisioning_timeout_in_seconds | The timeout in seconds for AWS Glue interactive session provisioning. | yes | +| location | The Amazon S3 location of your target data. | yes | +| query_timeout_in_minutes | The timeout in minutes for a single query. Default is 300 | no | +| idle_timeout | The AWS Glue session idle timeout in minutes. (The session stops after being idle for the specified amount of time) | no | +| glue_version | The version of AWS Glue for this session to use. Currently, the only valid options are 2.0 and 3.0. The default value is 3.0. | no | +| security_configuration | The security configuration to use with this session. | no | +| connections | A comma-separated list of connections to use in the session. | no | +| conf | Specific configuration used at the startup of the Glue Interactive Session (arg --conf) | no | +| extra_py_files | Extra python Libs that can be used by the interactive session. | no | +| delta_athena_prefix | A prefix used to create Athena-compatible tables for Delta tables (if not specified, then no Athena-compatible table will be created) | no | +| tags | The map of key-value pairs (tags) belonging to the session. Ex: `KeyName1=Value1,KeyName2=Value2` | no | +| seed_format | By default `parquet`, can be Spark format compatible like `csv` or `json` | no | +| seed_mode | By default `overwrite`, the seed data will be overwritten, you can set it to `append` if you just want to add new data in your dataset | no | +| default_arguments | The map of key-value pairs parameters belonging to the session. More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html). 
Ex: `--enable-continuous-cloudwatch-log=true,--enable-continuous-log-filter=true` | no | +| glue_session_id | re-use the glue-session to run multiple dbt run commands: set a glue session id you need to use | no | +| glue_session_reuse | Reuse the glue-session to run multiple dbt run commands: If set to true, the glue session will not be closed for re-use. If set to false, the session will be closed | no | +| datalake_formats | The ACID data lake format that you want to use if you are doing merge, can be `hudi`, `ìceberg` or `delta` |no| + +## Configs + +### Configuring tables + +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-spark plugin, in addition to the standard [model configs](/reference/model-configs). + +| Option | Description | Required? | Example | +|---------|----------------------------------------------------|-------------------------|--------------------------| +| file_format | The file format to use when creating tables (`parquet`, `csv`, `json`, `text`, `jdbc` or `orc`). | Optional | `parquet`| +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | +| custom_location | By default, the adapter will store your data in the following path: `location path`/`schema`/`table`. If you don't want to follow that default behaviour, you can use this parameter to set your own custom location on S3 | No | `s3://mycustombucket/mycustompath` | +| hudi_options | When using file_format `hudi`, gives the ability to overwrite any of the default configuration options. | Optional | `{'hoodie.schema.on.read.enable': 'true'}` | +## Incremental models + +dbt seeks to offer useful and intuitive modeling abstractions by means of its built-in configurations and materializations. + +For that reason, the dbt-glue plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values: + - **`append`** (default): Insert new records without updating or overwriting any existing data. + - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data. + - **`merge`** (Apache Hudi and Apache Iceberg only): Match records based on a `unique_key`; update old records, and insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + +Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. + +**Notes:** +The default strategy is **`insert_overwrite`** + +### The `append` strategy + +Following the `append` strategy, dbt will perform an `insert into` statement with all new data. The appeal of this strategy is that it is straightforward and functional across all platforms, file types, connection methods, and Apache Spark versions. 
However, this strategy _cannot_ update, overwrite, or delete existing data, so it is likely to insert duplicate records for many data sources. + +#### Source code +```sql +{{ config( + materialized='incremental', + incremental_strategy='append', +) }} + +-- All rows returned by this query will be appended to the existing table + +select * from {{ ref('events') }} +{% if is_incremental() %} + where event_ts > (select max(event_ts) from {{ this }}) +{% endif %} +``` +#### Run Code +```sql +create temporary view spark_incremental__dbt_tmp as + + select * from analytics.events + + where event_ts >= (select max(event_ts) from {{ this }}) + +; + +insert into table analytics.spark_incremental + select `date_day`, `users` from spark_incremental__dbt_tmp +``` + +### The `insert_overwrite` strategy + +This strategy is most effective when specified alongside a `partition_by` clause in your model config. dbt will run an [atomic `insert overwrite` statement](https://spark.apache.org/docs/latest/sql-ref-syntax-dml-insert-overwrite-table.html) that dynamically replaces all partitions included in your query. Be sure to re-select _all_ of the relevant data for a partition when using this incremental strategy. + +If no `partition_by` is specified, then the `insert_overwrite` strategy will atomically replace all contents of the table, overriding all existing data with only the new records. The column schema of the table remains the same, however. This can be desirable in some limited circumstances since it minimizes downtime while the table contents are overwritten. The operation is comparable to running `truncate` + `insert` on other databases. For atomic replacement of Delta-formatted tables, use the `table` materialization (which runs `create or replace`) instead. + +#### Source Code +```sql +{{ config( + materialized='incremental', + partition_by=['date_day'], + file_format='parquet' +) }} + +/* + Every partition returned by this query will be overwritten + when this model runs +*/ + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + date_day, + count(*) as users + +from events +group by 1 +``` + +#### Run Code + +```sql +create temporary view spark_incremental__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + date_day, + count(*) as users + + from events + group by 1 + +; + +insert overwrite table analytics.spark_incremental + partition (date_day) + select `date_day`, `users` from spark_incremental__dbt_tmp +``` + +Specifying `insert_overwrite` as the incremental strategy is optional since it's the default strategy used when none is specified. + +### The `merge` strategy + +**Compatibility:** +- Hudi : OK +- Delta Lake : OK +- Iceberg : OK +- Lake Formation Governed Tables : On going + +NB: + +- For Glue 3: you have to set up a [Glue connectors](https://docs.aws.amazon.com/glue/latest/ug/connectors-chapter.html). + +- For Glue 4: use the `datalake_formats` option in your profile.yml + +When using a connector be sure that your IAM role has these policies: +``` +{ + "Sid": "access_to_connections", + "Action": [ + "glue:GetConnection", + "glue:GetConnections" + ], + "Resource": [ + "arn:aws:glue:::catalog", + "arn:aws:glue:::connection/*" + ], + "Effect": "Allow" +} +``` +and that the managed policy `AmazonEC2ContainerRegistryReadOnly` is attached. 
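If the `AmazonEC2ContainerRegistryReadOnly` policy is not attached yet, one way to attach it is through the AWS CLI. This is a minimal sketch, assuming the `GlueInteractiveSessionRole` role name used in the profile examples on this page; substitute your own interactive session role:

```bash
# Attach the AWS managed policy required for Glue connectors to the
# interactive session role (role name taken from this page's examples).
aws iam attach-role-policy \
  --role-name GlueInteractiveSessionRole \
  --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly
```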
+Be sure that you follow the getting started instructions [here](https://docs.aws.amazon.com/glue/latest/ug/setting-up.html#getting-started-min-privs-connectors). + + +This [blog post](https://aws.amazon.com/blogs/big-data/part-1-integrate-apache-hudi-delta-lake-apache-iceberg-datasets-at-scale-aws-glue-studio-notebook/) also explains how to set up and works with Glue Connectors + +#### Hudi + +**Usage notes:** The `merge` with Hudi incremental strategy requires: +- To add `file_format: hudi` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: hudi` + - Alternatively, to add a connection in your profile: `connections: name_of_your_hudi_connector` +- To add Kryo serializer in your Interactive Session Config (in your profile): `conf: spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false` + +dbt will run an [atomic `merge` statement](https://hudi.apache.org/docs/writing_data#spark-datasource-writer) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy). + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + conf: spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false + datalake_formats: hudi +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key='user_id', + file_format='hudi', + hudi_options={ + 'hoodie.datasource.write.precombine.field': 'eventtime', + } +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + +#### Delta + +You can also use Delta Lake to be able to use merge feature on tables. + +**Usage notes:** The `merge` with Delta incremental strategy requires: +- To add `file_format: delta` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: delta` + - Alternatively, to add a connection in your profile: `connections: name_of_your_delta_connector` +- To add the following config in your Interactive Session Config (in your profile): `conf: "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` + +**Athena:** Athena is not compatible by default with delta tables, but you can configure the adapter to create Athena tables on top of your delta table. 
To do so, you need to configure the two following options in your profile: +- For Delta Lake 2.1.0 supported natively in Glue 4.0: `extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-2.1.0.jar"` +- For Delta Lake 1.0.0 supported natively in Glue 3.0: `extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-1.0.0.jar"` +- `delta_athena_prefix: "the_prefix_of_your_choice"` +- If your table is partitioned, then the addition of new partition is not automatic, you need to perform an `MSCK REPAIR TABLE your_delta_table` after each new partition adding + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + datalake_formats: delta + conf: "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" + extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-2.1.0.jar" + delta_athena_prefix: "delta" +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key='user_id', + partition_by=['dt'], + file_format='delta' +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen, + current_date() as dt + +from events +group by 1 +``` + +#### Iceberg + +**Usage notes:** The `merge` with Iceberg incremental strategy requires: +- To attach the AmazonEC2ContainerRegistryReadOnly Manged policy to your execution role : +- To add the following policy to your execution role to enable commit locking in a dynamodb table (more info [here](https://iceberg.apache.org/docs/latest/aws/#dynamodb-lock-manager)). Note that the DynamoDB table specified in the resource field of this policy should be the one that is mentioned in your dbt profiles (`--conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable`). By default, this table is named `myGlueLockTable` and is created automatically (with On-Demand Pricing) when running a dbt-glue model with Incremental Materialization and Iceberg file format. If you want to name the table differently or to create your own table without letting Glue do it on your behalf, please provide the `iceberg_glue_commit_lock_table` parameter with your table name (eg. `MyDynamoDbTable`) in your dbt profile. +```yaml +iceberg_glue_commit_lock_table: "MyDynamoDbTable" +``` +- the latest connector for iceberg in AWS marketplace uses Ver 0.14.0 for Glue 3.0, and Ver 1.2.1 for Glue 4.0 where Kryo serialization fails when writing iceberg, use "org.apache.spark.serializer.JavaSerializer" for spark.serializer instead, more info [here](https://github.com/apache/iceberg/pull/546) + +Make sure you update your conf with `--conf spark.sql.catalog.glue_catalog.lock.table=` and, you change the below iam permission with your correct table name. 
+``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CommitLockTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:BatchGetItem", + "dynamodb:BatchWriteItem", + "dynamodb:ConditionCheckItem", + "dynamodb:PutItem", + "dynamodb:DescribeTable", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan", + "dynamodb:Query", + "dynamodb:UpdateItem" + ], + "Resource": "arn:aws:dynamodb:::table/myGlueLockTable" + } + ] +} +``` +- To add `file_format: Iceberg` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: iceberg` + - Alternatively, to add connections in your profile: `connections: name_of_your_iceberg_connector` ( + - For Athena version 3: + - The adapter is compatible with the Iceberg Connector from AWS Marketplace with Glue 3.0 as Fulfillment option and 0.14.0 (Oct 11, 2022) as Software version) + - the latest connector for iceberg in AWS marketplace uses Ver 0.14.0 for Glue 3.0, and Ver 1.2.1 for Glue 4.0 where Kryo serialization fails when writing iceberg, use "org.apache.spark.serializer.JavaSerializer" for spark.serializer instead, more info [here](https://github.com/apache/iceberg/pull/546) + - For Athena version 2: The adapter is compatible with the Iceberg Connector from AWS Marketplace with Glue 3.0 as Fulfillment option and 0.12.0-2 (Feb 14, 2022) as Software version) +- To add the following config in your Interactive Session Config (in your profile): +```--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer + --conf spark.sql.warehouse=s3:// + --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog + --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog + --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO + --conf spark.sql.catalog.glue_catalog.lock-impl=org.apache.iceberg.aws.dynamodb.DynamoDbLockManager + --conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +``` + - For Glue 3.0, set `spark.sql.catalog.glue_catalog.lock-impl` to `org.apache.iceberg.aws.glue.DynamoLockManager` instead + +dbt will run an [atomic `merge` statement](https://iceberg.apache.org/docs/latest/spark-writes/) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. You need to provide a `unique_key` to perform merge operation otherwise it will fail. This key is to provide in a Python list format and can contains multiple column name to create a composite unique_key. + +##### Notes +- When using a custom_location in Iceberg, avoid to use final trailing slash. Adding a final trailing slash lead to an un-proper handling of the location, and issues when reading the data from query engines like Trino. The issue should be fixed for Iceberg version > 0.13. Related Github issue can be find [here](https://github.com/apache/iceberg/issues/4582). +- Iceberg also supports `insert_overwrite` and `append` strategies. +- The `warehouse` conf must be provided, but it's overwritten by the adapter `location` in your profile or `custom_location` in model configuration. +- By default, this materialization has `iceberg_expire_snapshots` set to 'True', if you need to have historical auditable changes, set: `iceberg_expire_snapshots='False'`. 
+- Currently, due to some dbt internal, the iceberg catalog used internally when running glue interactive sessions with dbt-glue has a hardcoded name `glue_catalog`. This name is an alias pointing to the AWS Glue Catalog but is specific to each session. If you want to interact with your data in another session without using dbt-glue (from a Glue Studio notebook, for example), you can configure another alias (ie. another name for the Iceberg Catalog). To illustrate this concept, you can set in your configuration file : +``` +--conf spark.sql.catalog.RandomCatalogName=org.apache.iceberg.spark.SparkCatalog +``` +And then run in an AWS Glue Studio Notebook a session with the following config: +``` +--conf spark.sql.catalog.AnotherRandomCatalogName=org.apache.iceberg.spark.SparkCatalog +``` +In both cases, the underlying catalog would be the AWS Glue Catalog, unique in your AWS Account and Region, and you would be able to work with the exact same data. Also make sure that if you change the name of the Glue Catalog Alias, you change it in all the other `--conf` where it's used: +``` + --conf spark.sql.catalog.RandomCatalogName=org.apache.iceberg.spark.SparkCatalog + --conf spark.sql.catalog.RandomCatalogName.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog + ... + --conf spark.sql.catalog.RandomCatalogName.lock-impl=org.apache.iceberg.aws.glue.DynamoLockManager +``` +- A full reference to `table_properties` can be found [here](https://iceberg.apache.org/docs/latest/configuration/). +- Iceberg Tables are natively supported by Athena. Therefore, you can query tables created and operated with dbt-glue adapter from Athena. +- Incremental Materialization with Iceberg file format supports dbt snapshot. You are able to run a dbt snapshot command that queries an Iceberg Table and create a dbt fashioned snapshot of it. 
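Because Iceberg tables are natively supported by Athena (as noted above), a model built with this materialization can be checked directly from Athena. A minimal sketch, with hypothetical schema and model names:

```sql
-- Hypothetical names: "dbt_test_project" is the dbt schema (Glue database)
-- and "my_iceberg_model" is an Iceberg model built by dbt-glue.
select user_id, last_seen
from dbt_test_project.my_iceberg_model
limit 10;
```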
+ +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + datalake_formats: iceberg + conf: --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.warehouse=s3://aws-dbt-glue-datalake-1234567890-eu-west-1/dbt_test_project --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.lock-impl=org.apache.iceberg.aws.dynamodb.DynamoDbLockManager --conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key=['user_id'], + file_format='iceberg', + iceberg_expire_snapshots='False', + partition_by=['status'] + table_properties={'write.target-file-size-bytes': '268435456'} +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` +#### Iceberg Snapshot source code example +```sql + +{% snapshot demosnapshot %} + +{{ + config( + strategy='timestamp', + target_schema='jaffle_db', + updated_at='dt', + file_format='iceberg' +) }} + +select * from {{ ref('customers') }} + +{% endsnapshot %} + +``` + +## Monitoring your Glue Interactive Session + +Monitoring is an important part of maintaining the reliability, availability, +and performance of AWS Glue and your other AWS solutions. AWS provides monitoring +tools that you can use to watch AWS Glue, identify the required number of workers +required for your Glue Interactive Session, report when something is wrong and +take action automatically when appropriate. AWS Glue provides Spark UI, +and CloudWatch logs and metrics for monitoring your AWS Glue jobs. +More information on: [Monitoring AWS Glue Spark jobs](https://docs.aws.amazon.com/glue/latest/dg/monitor-spark.html) + +**Usage notes:** Monitoring requires: +- To add the following IAM policy to your IAM role: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudwatchMetrics", + "Effect": "Allow", + "Action": "cloudwatch:PutMetricData", + "Resource": "*", + "Condition": { + "StringEquals": { + "cloudwatch:namespace": "Glue" + } + } + }, + { + "Sid": "CloudwatchLogs", + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "logs:CreateLogStream", + "logs:CreateLogGroup", + "logs:PutLogEvents" + ], + "Resource": [ + "arn:aws:logs:*:*:/aws-glue/*", + "arn:aws:s3:::bucket-to-write-sparkui-logs/*" + ] + } + ] +} +``` + +- To add monitoring parameters in your Interactive Session Config (in your profile). 
+More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + default_arguments: "--enable-metrics=true, --enable-continuous-cloudwatch-log=true, --enable-continuous-log-filter=true, --enable-spark-ui=true, --spark-event-logs-path=s3://bucket-to-write-sparkui-logs/dbt/" +``` + +If you want to use the Spark UI, you can launch the Spark history server using a +AWS CloudFormation template that hosts the server on an EC2 instance, +or launch locally using Docker. More information on [Launching the Spark history server](https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-history.html#monitor-spark-ui-history-local) + +## Enabling AWS Glue Auto Scaling +Auto Scaling is available since AWS Glue version 3.0 or later. More information +on the following AWS blog post: ["Introducing AWS Glue Auto Scaling: Automatically resize serverless computing resources for lower cost with optimized Apache Spark"](https://aws.amazon.com/blogs/big-data/introducing-aws-glue-auto-scaling-automatically-resize-serverless-computing-resources-for-lower-cost-with-optimized-apache-spark/) + +With Auto Scaling enabled, you will get the following benefits: + +* AWS Glue automatically adds and removes workers from the cluster depending on the parallelism at each stage or microbatch of the job run. + +* It removes the need for you to experiment and decide on the number of workers to assign for your AWS Glue Interactive sessions. + +* Once you choose the maximum number of workers, AWS Glue will choose the right size resources for the workload. +* You can see how the size of the cluster changes during the Glue Interactive sessions run by looking at CloudWatch metrics. +More information on [Monitoring your Glue Interactive Session](#Monitoring-your-Glue-Interactive-Session). + +**Usage notes:** AWS Glue Auto Scaling requires: +- To set your AWS Glue version 3.0 or later. +- To set the maximum number of workers (if Auto Scaling is enabled, the `workers` +parameter sets the maximum number of workers) +- To set the `--enable-auto-scaling=true` parameter on your Glue Interactive Session Config (in your profile). +More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "3.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + default_arguments: "--enable-auto-scaling=true" +``` + +## Access Glue catalog in another AWS account +In many cases, you may need to run you dbt jobs to read from another AWS account. 
+ +Review the following link https://repost.aws/knowledge-center/glue-tables-cross-accounts to set up access policies in source and target accounts + +Add the following `"spark.hadoop.hive.metastore.glue.catalogid="` to your conf in the DBT profile, as such, you can have multiple outputs for each of the accounts that you have access to. + +Note: The access cross-accounts need to be within the same AWS Region +#### Profile config example +```yaml +test_project: + target: dev + outputsAccountB: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "3.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + conf: "--conf hive.metastore.client.factory.class=com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory + --conf spark.hadoop.hive.metastore.glue.catalogid=" +``` + +## Persisting model descriptions + +Relation-level docs persistence is supported since dbt v0.17.0. For more +information on configuring docs persistence, see [the docs](/reference/resource-configs/persist_docs). + +When the `persist_docs` option is configured appropriately, you'll be able to +see model descriptions in the `Comment` field of `describe [table] extended` +or `show table extended in [database] like '*'`. + +## Always `schema`, never `database` + +Apache Spark uses the terms "schema" and "database" interchangeably. dbt understands +`database` to exist at a higher level than `schema`. As such, you should _never_ +use or set `database` as a node config or in the target profile when running dbt-glue. + +If you want to control the schema/database in which dbt will materialize models, +use the `schema` config and `generate_schema_name` macro _only_. +For more information, check the dbt documentation about [custom schemas](https://docs.getdbt.com/docs/build/custom-schemas). + +## AWS Lakeformation integration +The adapter supports AWS Lake Formation tags management enabling you to associate existing tags defined out of dbt-glue to database objects built by dbt-glue (database, table, view, snapshot, incremental models, seeds). + +- You can enable or disable lf-tags management via config, at model and dbt-project level (disabled by default) +- If enabled, lf-tags will be updated on every dbt run. There are table level lf-tags configs and column-level lf-tags configs. +- You can specify that you want to drop existing database, table column Lake Formation tags by setting the drop_existing config field to True (False by default, meaning existing tags are kept) +- Please note that if the tag you want to associate with the table does not exist, the dbt-glue execution will throw an error + +The adapter also supports AWS Lakeformation data cell filtering. +- You can enable or disable data-cell filtering via config, at model and dbt-project level (disabled by default) +- If enabled, data_cell_filters will be updated on every dbt run. +- You can specify that you want to drop existing table data-cell filters by setting the drop_existing config field to True (False by default, meaning existing filters are kept) +- You can leverage excluded_columns_names **OR** columns config fields to perform Column level security as well. **Please note that you can use one or the other but not both**. 
+- By default, if you don't specify any column or excluded_columns, dbt-glue does not perform Column level filtering and let the principal access all the columns. + +The below configuration let the specified principal (lf-data-scientist IAM user) access rows that have a customer_lifetime_value > 15 and all the columns specified ('customer_id', 'first_order', 'most_recent_order', 'number_of_orders') + +```sql +lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'column_names': ['customer_id', 'first_order', 'most_recent_order', 'number_of_orders'] + } + }, + } + } +``` +The below configuration let the specified principal (lf-data-scientist IAM user) access rows that have a customer_lifetime_value > 15 and all the columns *except* the one specified ('first_name') + +```sql +lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'excluded_column_names': ['first_name'] + } + }, + } + } +``` + +See below some examples of how you can integrate LF Tags management and data cell filtering to your configurations : + +#### At model level +This way of defining your Lakeformation rules is appropriate if you want to handle the tagging and filtering policy at object level. Remember that it overrides any configuration defined at dbt-project level. + +```sql +{{ config( + materialized='incremental', + unique_key="customer_id", + incremental_strategy='append', + lf_tags_config={ + 'enabled': true, + 'drop_existing' : False, + 'tags_database': + { + 'name_of_my_db_tag': 'value_of_my_db_tag' + }, + 'tags_table': + { + 'name_of_my_table_tag': 'value_of_my_table_tag' + }, + 'tags_columns': { + 'name_of_my_lf_tag': { + 'value_of_my_tag': ['customer_id', 'customer_lifetime_value', 'dt'] + }}}, + lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'excluded_column_names': ['first_name'] + } + }, + } + } +) }} + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order, + customer_orders.most_recent_order, + customer_orders.number_of_orders, + customer_payments.total_amount as customer_lifetime_value, + current_date() as dt + + from customers + + left join customer_orders using (customer_id) + + left join customer_payments using (customer_id) + +``` + +#### At dbt-project level +This way you can specify tags and data filtering policy for a particular path in your dbt project (eg. models, seeds, models/model_group1, etc.) +This is especially useful for seeds, for which you can't define configuration in the file directly. + +```yml +seeds: + +lf_tags_config: + enabled: true + tags_table: + name_of_my_table_tag: 'value_of_my_table_tag' + tags_database: + name_of_my_database_tag: 'value_of_my_database_tag' +models: + +lf_tags_config: + enabled: true + drop_existing: True + tags_database: + name_of_my_database_tag: 'value_of_my_database_tag' + tags_table: + name_of_my_table_tag: 'value_of_my_table_tag' +``` + +## Tests + +To perform a functional test: +1. 
Install dev requirements: +```bash +$ pip3 install -r dev-requirements.txt +``` + +2. Install dev locally +```bash +$ python3 setup.py build && python3 setup.py install_lib +``` + +3. Export variables +```bash +$ export DBT_S3_LOCATION=s3://mybucket/myprefix +$ export DBT_ROLE_ARN=arn:aws:iam::1234567890:role/GlueInteractiveSessionRole +``` + +4. Run the test +```bash +$ python3 -m pytest tests/functional +``` + +For more information, check the dbt documentation about [testing a new adapter](https://docs.getdbt.com/docs/contributing/testing-a-new-adapter). ## Caveats @@ -269,6 +1046,7 @@ Most dbt Core functionality is supported, but some features are only available w Apache Hudi-only features: 1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](/reference/resource-configs/glue-configs#the-merge-strategy)) + Some dbt features, available on the core adapters, are not yet supported on Glue: 1. [Persisting](/reference/resource-configs/persist_docs) column-level descriptions as database comments 2. [Snapshots](/docs/build/snapshots) diff --git a/website/docs/docs/core/connect-data-platform/oracle-setup.md b/website/docs/docs/core/connect-data-platform/oracle-setup.md index f601709654b..b1195fbd0a0 100644 --- a/website/docs/docs/core/connect-data-platform/oracle-setup.md +++ b/website/docs/docs/core/connect-data-platform/oracle-setup.md @@ -455,27 +455,6 @@ dbt_test: - - - -```yaml -dbt_test: - target: "{{ env_var('DBT_TARGET', 'dev') }}" - outputs: - dev: - type: oracle - user: "{{ env_var('DBT_ORACLE_USER') }}" - pass: "{{ env_var('DBT_ORACLE_PASSWORD') }}" - protocol: "tcps" - host: "{{ env_var('DBT_ORACLE_HOST') }}" - port: 1522 - service: "{{ env_var('DBT_ORACLE_SERVICE') }}" - database: "{{ env_var('DBT_ORACLE_DATABASE') }}" - schema: "{{ env_var('DBT_ORACLE_SCHEMA') }}" - threads: 4 -``` - - diff --git a/website/docs/docs/core/connect-data-platform/postgres-setup.md b/website/docs/docs/core/connect-data-platform/postgres-setup.md index 5d7467c786d..f56d3f22576 100644 --- a/website/docs/docs/core/connect-data-platform/postgres-setup.md +++ b/website/docs/docs/core/connect-data-platform/postgres-setup.md @@ -88,33 +88,23 @@ The `search_path` config controls the Postgres "search path" that dbt configures #### role - Added in v0.16.0 - The `role` config controls the Postgres role that dbt assumes when opening new connections to the database. #### sslmode - Added in v0.16.0 - The `sslmode` config controls how dbt connectes to Postgres databases using SSL. See [the Postgres docs](https://www.postgresql.org/docs/9.1/libpq-ssl.html) on `sslmode` for usage information. When unset, dbt will connect to databases using the Postgres default, `prefer`, as the `sslmode`. #### sslcert - Added in v0.21.0 - The `sslcert` config controls the location of the certificate file used to connect to Postgres when using client SSL connections. To use a certificate file that is not in the default location, set that file path using this value. Without this config set, dbt uses the Postgres default locations. See [Client Certificates](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT) in the Postgres SSL docs for the default paths. #### sslkey - Added in v0.21.0 - The `sslkey` config controls the location of the private key for connecting to Postgres using client SSL connections. If this config is omitted, dbt uses the default key location for Postgres. 
See [Client Certificates](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT) in the Postgres SSL docs for the default locations. #### sslrootcert - Added in v0.21.0 - When connecting to a Postgres server using a client SSL connection, dbt verifies that the server provides an SSL certificate signed by a trusted root certificate. These root certificates are in the `~/.postgresql/root.crt` file by default. To customize the location of this file, set the `sslrootcert` config value to a new file path. ### `keepalives_idle` diff --git a/website/docs/docs/core/connect-data-platform/redshift-setup.md b/website/docs/docs/core/connect-data-platform/redshift-setup.md index a86bc7df849..175d5f6a715 100644 --- a/website/docs/docs/core/connect-data-platform/redshift-setup.md +++ b/website/docs/docs/core/connect-data-platform/redshift-setup.md @@ -46,10 +46,58 @@ pip is the easiest way to install the adapter:

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}.

    +## Configurations -## Authentication Methods +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `type` | redshift | The type of data warehouse you are connecting to| +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `port` | 5439 | | +| `dbname` | my_db | Database name| +| `schema` | my_schema | Schema name| +| `connect_timeout` | `None` or 30 | Number of seconds before connection times out| +| `sslmode` | prefer | optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. For more information on `sslmode`, see Redshift note below| +| `role` | None | Optional, user identifier of the current session| +| `autocreate` | false | Optional, default false. Creates user if they do not exist | +| `db_groups` | ['ANALYSTS'] | Optional. A list of existing database group names that the DbUser joins for the current session | +| `ra3_node` | true | Optional, default False. Enables cross-database sources| +| `autocommit` | true | Optional, default True. Enables autocommit after each statement| +| `retries` | 1 | Number of retries | -### Password-based authentication + +## Authentication Parameters + +The authentication methods that dbt Core supports are: + +- `database` — Password-based authentication (default, will be used if `method` is not provided) +- `IAM` — IAM + + +Click on one of these authentication methods for further details on how to configure your connection profile. Each tab also includes an example `profiles.yml` configuration file for you to review. + + + + + +The following table contains the parameters for the database (password-based) connection method. + + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` | database| Leave this parameter unconfigured, or set this to database | +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `user` | username | Account username to log into your cluster | +| `password` | password1 | Password for authentication | + +
    + +#### Example profiles.yml for database authentication @@ -62,26 +110,29 @@ company-name: host: hostname.region.redshift.amazonaws.com user: username password: password1 - port: 5439 dbname: analytics schema: analytics + port: 5439 + + # Optional Redshift configs: + sslmode: prefer + role: None + ra3_node: true + autocommit: true threads: 4 - connect_timeout: None # optional, number of seconds before connection times out - # search_path: public # optional, not recommended - sslmode: prefer # optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. - role: # optional - ra3_node: true # enables cross-database sources - autocommit: true # enables autocommit after each statement - region: # optional + connect_timeout: None + ``` -### IAM Authentication +
    -To set up a Redshift profile using IAM Authentication, set the `method` -parameter to `iam` as shown below. Note that a password is not required when -using IAM Authentication. For more information on this type of authentication, + + +The following table lists the authentication parameters to use IAM authentication. + +To set up a Redshift profile using IAM Authentication, set the `method` parameter to `iam` as shown below. Note that a password is not required when using IAM Authentication. For more information on this type of authentication, consult the [Redshift Documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/generating-user-credentials.html) and [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.get_cluster_credentials) @@ -92,10 +143,25 @@ Authentication, then your aws credentials are likely misconfigured. Try running `aws configure` to set up AWS access keys, and pick a default region. If you have any questions, please refer to the official AWS documentation on [Configuration and credential file settings](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` |IAM| use IAM to authenticate | +| `iam_profile` | analyst | dbt will use the specified profile from your ~/.aws/config file | +| `cluster_id` | CLUSTER_ID| Required for IAM | +| `user` | username | Account user to log into your cluster | +| `region` | us-east-1 | Required for IAM authentication | + + +
    + + +#### Example profiles.yml for IAM + ```yaml -my-redshift-db: + my-redshift-db: target: dev outputs: dev: @@ -104,26 +170,32 @@ my-redshift-db: cluster_id: CLUSTER_ID host: hostname.region.redshift.amazonaws.com user: alice - iam_profile: data_engineer # optional - autocreate: true # optional - db_groups: ['ANALYSTS'] # optional - - # Other Redshift configs: - port: 5439 + iam_profile: analyst dbname: analytics schema: analytics + port: 5439 + + # Optional Redshift configs: threads: 4 - connect_timeout: None # optional, number of seconds before connection times out - [retries](#retries): 1 # default 1 retry on error/timeout when opening connections - role: # optional - sslmode: prefer # optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. - ra3_node: true # enables cross-database sources - autocommit: true # optional, enables autocommit after each statement - region: # optional + connect_timeout: None + [retries](#retries): 1 + role: None + sslmode: prefer + ra3_node: true + autocommit: true + region: us-east-1 + autocreate: true + db_groups: ['ANALYSTS'] + ``` +
    + +
    + + ### Specifying an IAM Profile When the `iam_profile` configuration is set, dbt will use the specified profile from your `~/.aws/config` file instead of using the profile name `default` diff --git a/website/docs/docs/core/connect-data-platform/snowflake-setup.md b/website/docs/docs/core/connect-data-platform/snowflake-setup.md index 6bc9c980922..98bcf447fed 100644 --- a/website/docs/docs/core/connect-data-platform/snowflake-setup.md +++ b/website/docs/docs/core/connect-data-platform/snowflake-setup.md @@ -124,7 +124,7 @@ Along with adding the `authenticator` parameter, be sure to run `alter account s To use key pair authentication, omit a `password` and instead provide a `private_key_path` and, optionally, a `private_key_passphrase` in your target. **Note:** Versions of dbt before 0.16.0 required that private keys were encrypted and a `private_key_passphrase` was provided. This behavior was changed in dbt v0.16.0. -Starting from [dbt v1.5.0](/docs/dbt-versions/core), you have the option to use a `private_key` string instead of a `private_key_path`. The `private_key` string should be in Base64-encoded DER format, representing the key bytes. Refer to [Snowflake documentation](https://docs.snowflake.com/developer-guide/python-connector/python-connector-example#using-key-pair-authentication-key-pair-rotation) for more info on how they generate the key. +Starting from [dbt v1.5.0](/docs/dbt-versions/core), you have the option to use a `private_key` string instead of a `private_key_path`. The `private_key` string should be in either Base64-encoded DER format, representing the key bytes, or a plain-text PEM format. Refer to [Snowflake documentation](https://docs.snowflake.com/developer-guide/python-connector/python-connector-example#using-key-pair-authentication-key-pair-rotation) for more info on how they generate the key. @@ -163,9 +163,13 @@ my-snowflake-db: ### SSO Authentication -To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. `authenticator` can be one of 'externalbrowser' or a valid Okta URL. +To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. +`authenticator` can be one of 'externalbrowser' or a valid Okta URL. -**Note**: By default, every connection that dbt opens will require you to re-authenticate in a browser. The Snowflake connector package supports caching your session token, but it [currently only supports Windows and Mac OS](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-use.html#optional-using-connection-caching-to-minimize-the-number-of-prompts-for-authentication). See [the Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for how to enable this feature in your account. 
+Refer to the following tabs for more info and examples: + + + @@ -175,15 +179,15 @@ my-snowflake-db: outputs: dev: type: snowflake - account: [account id] - user: [username] - role: [user role] + account: [account id] # Snowflake + user: [username] # Snowflake username + role: [user role] # Snowflake user role # SSO config authenticator: externalbrowser - database: [database name] - warehouse: [warehouse name] + database: [database name] # Snowflake database name + warehouse: [warehouse name] # Snowflake warehouse name schema: [dbt schema] threads: [between 1 and 8] client_session_keep_alive: False @@ -199,6 +203,50 @@ my-snowflake-db: + + + + + + +```yaml +my-snowflake-db: + target: dev + outputs: + dev: + type: snowflake + account: [account id] # Snowflake + user: [username] # Snowflake username + role: [user role] # Snowflake user role + + # SSO config -- The three following fields are REQUIRED + authenticator: [Okta account URL] + username: [Okta username] + password: [Okta password] + + database: [database name] # Snowflake database name + warehouse: [warehouse name] # Snowflake warehouse name + schema: [dbt schema] + threads: [between 1 and 8] + client_session_keep_alive: False + query_tag: [anything] + + # optional + connect_retries: 0 # default 0 + connect_timeout: 10 # default: 10 + retry_on_database_errors: False # default: false + retry_all: False # default: false + reuse_connections: False # default: false +``` + + + + + + +**Note**: By default, every connection that dbt opens will require you to re-authenticate in a browser. The Snowflake connector package supports caching your session token, but it [currently only supports Windows and Mac OS](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-use.html#optional-using-connection-caching-to-minimize-the-number-of-prompts-for-authentication). + +Refer to the [Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for info on how to enable this feature in your account. ## Configurations @@ -224,7 +272,7 @@ The "base" configs for Snowflake targets are shown below. Note that you should a | reuse_connections | No | A boolean flag indicating whether to reuse idle connections to help reduce total connections opened. Default is `False`. | ### account -For AWS accounts in the US West default region, you can use `abc123` (without any other segments). For some AWS accounts you will have to append the region and/or cloud platform. For example, `abc123.eu-west-1` or `abc123.eu-west-2.aws`. For GCP and Azure-based accounts, you have to append the region and cloud platform, such as `gcp` or `azure`, respectively. For example, `abc123.us-central1.gcp`. For details, see Snowflake's documentation: "[Specifying Region Information in Your Account Hostname](https://docs.snowflake.com/en/user-guide/intro-regions.html#specifying-region-information-in-your-account-hostname)" and "[Account Identifier Formats by Cloud Platform and Region](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#account-identifier-formats-by-cloud-platform-and-region)". +For AWS accounts in the US West default region, you can use `abc123` (without any other segments). For some AWS accounts you will have to append the region and/or cloud platform. For example, `abc123.eu-west-1` or `abc123.eu-west-2.aws`. For GCP and Azure-based accounts, you have to append the region and cloud platform, such as `gcp` or `azure`, respectively. For example, `abc123.us-central1.gcp`. 
For details, see Snowflake's documentation: "[Specifying Region Information in Your Account Hostname](https://docs.snowflake.com/en/user-guide/intro-regions.html#specifying-region-information-in-your-account-hostname)". Please also note that the Snowflake account name should only be the without the prefixed . Relevant documentation: "[Account Identifier Formats by Cloud Platform and Region](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#account-identifier-formats-by-cloud-platform-and-region)". ### client_session_keep_alive diff --git a/website/docs/docs/core/connect-data-platform/spark-setup.md b/website/docs/docs/core/connect-data-platform/spark-setup.md index 2e3b5a66de8..895f0559953 100644 --- a/website/docs/docs/core/connect-data-platform/spark-setup.md +++ b/website/docs/docs/core/connect-data-platform/spark-setup.md @@ -57,15 +57,11 @@ $ pip install "dbt-spark[ODBC]" $ pip install "dbt-spark[PyHive]" ``` - - ```zsh # session connections $ pip install "dbt-spark[session]" ``` - -

    Configuring {frontMatter.meta.pypi_package}

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    @@ -80,7 +76,6 @@ dbt-spark can connect to Spark clusters by three different methods: - [`thrift`](#thrift) connects directly to the lead node of a cluster, either locally hosted / on premise or in the cloud (e.g. Amazon EMR). - [`http`](#http) is a more generic method for connecting to a managed service that provides an HTTP endpoint. Currently, this includes connections to a Databricks interactive cluster. - - [`session`](#session) connects to a pySpark session, running locally or on a remote machine. @@ -88,12 +83,9 @@ dbt-spark can connect to Spark clusters by three different methods: The `session` connection method is intended for advanced users and experimental dbt development. This connection method is not supported by dbt Cloud. ::: - ### ODBC -New in v0.18.1 - Use the `odbc` connection method if you are connecting to a Databricks SQL endpoint or interactive cluster via ODBC driver. (Download the latest version of the official driver [here](https://databricks.com/spark/odbc-driver-download).) @@ -119,9 +111,7 @@ your_profile_name: port: [port] # default 443 user: [user] server_side_parameters: - # cluster configuration parameters, otherwise applied via `SET` statements - # for example: - # "spark.databricks.delta.schema.autoMerge.enabled": True + "spark.driver.memory": "4g" ``` @@ -148,6 +138,8 @@ your_profile_name: auth: [e.g. KERBEROS] kerberos_service_name: [e.g. hive] use_ssl: [true|false] # value of hive.server2.use.SSL, default false + server_side_parameters: + "spark.driver.memory": "4g" ```
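As a hedged aside on the `server_side_parameters` block added to the connection examples in these hunks: the field takes a mapping, so several Spark properties can be listed together. A minimal sketch (the second property, `spark.sql.session.timeZone`, is only an illustration and is not part of this change):

```yaml
    # Spark configuration properties passed to the cluster or session;
    # how they are applied depends on the connection method.
    server_side_parameters:
      "spark.driver.memory": "4g"
      "spark.sql.session.timeZone": "UTC"  # illustrative extra property
```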
    @@ -176,6 +168,8 @@ your_profile_name: user: [user] connect_timeout: 60 # default 10 connect_retries: 5 # default 0 + server_side_parameters: + "spark.driver.memory": "4g" ``` @@ -184,8 +178,6 @@ Databricks interactive clusters can take several minutes to start up. You may include the optional profile configs `connect_timeout` and `connect_retries`, and dbt will periodically retry the connection. - - ### Session Use the `session` method if you want to run `dbt` against a pySpark session. @@ -201,14 +193,12 @@ your_profile_name: method: session schema: [database/schema name] host: NA # not used, but required by `dbt-core` + server_side_parameters: + "spark.driver.memory": "4g" ``` - - - - ## Optional configurations ### Retries @@ -227,6 +217,12 @@ connect_retries: 3 + + + +### Server side configuration + +Spark can be customized using [Application Properties](https://spark.apache.org/docs/latest/configuration.html). Using these properties the execution can be customized, for example, to allocate more memory to the driver process. Also, the Spark SQL runtime can be set through these properties. For example, this allows the user to [set a Spark catalogs](https://spark.apache.org/docs/latest/configuration.html#spark-sql). ## Caveats diff --git a/website/docs/docs/core/connect-data-platform/upsolver-setup.md b/website/docs/docs/core/connect-data-platform/upsolver-setup.md new file mode 100644 index 00000000000..68cfa3045cd --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/upsolver-setup.md @@ -0,0 +1,84 @@ +--- +title: "Upsolver setup" +description: "Read this guide to learn how to configure Upsolver with dbt." +id: "upsolver-setup" +meta: + maintained_by: Upsolver Team + authors: Upsolver Team + github_repo: 'Upsolver/dbt-upsolver' + pypi_package: 'dbt-upsolver' + min_core_version: 'v1.5.0' + cloud_support: Not Supported + min_supported_version: 'n/a' + slack_channel_name: 'Upsolver Comunity' + slack_channel_link: 'https://join.slack.com/t/upsolvercommunity/shared_invite/zt-1zo1dbyys-hj28WfaZvMh4Z4Id3OkkhA' + platform_name: 'Upsolver' + config_page: '/reference/resource-configs/upsolver-configs' +--- + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
• Maintained by: {frontMatter.meta.maintained_by}
• Authors: {frontMatter.meta.authors}
• GitHub repo: {frontMatter.meta.github_repo}
• PyPI package: {frontMatter.meta.pypi_package}
• Slack channel: {frontMatter.meta.slack_channel_name}
• Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
• dbt Cloud support: {frontMatter.meta.cloud_support}
• Minimum data platform version: {frontMatter.meta.min_supported_version}
    +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.
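With the `pypi_package` value declared in this page's front matter (`dbt-upsolver`), the install command above resolves to:

```shell
pip install dbt-upsolver
```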

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + +## Authentication Methods + +### User / Token authentication + +Upsolver can be configured using basic user/token authentication as shown below. + + + +```yaml +my-upsolver-db: + target: dev + outputs: + dev: + type: upsolver + api_url: https://mt-api-prod.upsolver.com + + user: [username] + token: [token] + + database: [database name] + schema: [schema name] + threads: [1 or more] + + ``` + + + +## Configurations + +The configs for Upsolver targets are shown below. + +### All configurations + +| Config | Required? | Description | +| ------ | --------- | ----------- | +| token | Yes | The token to connect Upsolver [Upsolver's documentation](https://docs.upsolver.com/sqlake/api-integration) | +| user | Yes | The user to log in as | +| database | Yes | The database that dbt should create models in | +| schema | Yes | The schema to build models into by default | +| api_url | Yes | The API url to connect. Common value ```https://mt-api-prod.upsolver.com``` | diff --git a/website/docs/docs/core/homebrew-install.md b/website/docs/docs/core/homebrew-install.md index ab80cc1148f..2e2676c4a95 100644 --- a/website/docs/docs/core/homebrew-install.md +++ b/website/docs/docs/core/homebrew-install.md @@ -3,6 +3,13 @@ title: "Install with Homebrew" description: "You can use Homebrew to install dbt Core and adapter plugins from the command line." --- +:::caution + +Starting with v1.6, dbt Labs will no longer maintain Homebrew formulae as a supported installation method for dbt-core and adapters. For more on our rationale, consult this discussion: +- [Installing dbt Core: saying goodbye to brew and hello to "bundles"](https://github.com/dbt-labs/dbt-core/discussions/8277) + +::: + dbt Labs maintains Homebrew formulae for the four oldest and most popular adapter plugins: Postgres, Redshift, Snowflake, and BigQuery. We recommend you use Homebrew if you meet these conditions: diff --git a/website/docs/docs/core/pip-install.md b/website/docs/docs/core/pip-install.md index 26a15d8ad37..a35ad5f0d77 100644 --- a/website/docs/docs/core/pip-install.md +++ b/website/docs/docs/core/pip-install.md @@ -5,14 +5,37 @@ description: "You can use pip to install dbt Core and adapter plugins from the c You need to use `pip` to install dbt Core on Windows or Linux operating systems. You can use `pip` or [Homebrew](/docs/core/homebrew-install) for installing dbt Core on a MacOS. -You can install dbt Core and plugins using `pip` because they are Python modules distributed on [PyPI](https://pypi.org/project/dbt/). We recommend using virtual environments when installing with `pip`. - +You can install dbt Core and plugins using `pip` because they are Python modules distributed on [PyPI](https://pypi.org/project/dbt/). - +### Using virtual environments +We recommend using virtual environments (venv) to namespace pip modules. + +1. Create a new venv: + +```shell +python3 -m venv dbt-env # create the environment +``` + +2. Activate that same virtual environment each time you create a shell window or session: + +```shell +source dbt-env/bin/activate # activate the environment for Mac and Linux OR +dbt-env\Scripts\activate # activate the environment for Windows +``` + +#### Create an alias +To activate your dbt environment with every new shell window or session, you can create an alias for the source command in your $HOME/.bashrc, $HOME/.zshrc, or whichever config file your shell draws from. + +For example, add the following to your rc file, replacing with the path to your virtual environment configuration. 
+ +```shell +alias env_dbt='source /bin/activate' +``` +### Installing the adapter Once you know [which adapter](/docs/supported-data-platforms) you're using, you can install it as `dbt-`. For example, if using Postgres: ```shell diff --git a/website/docs/docs/dbt-cloud-apis/apis-overview.md b/website/docs/docs/dbt-cloud-apis/apis-overview.md index 9f7c22a7580..b7d722747d8 100644 --- a/website/docs/docs/dbt-cloud-apis/apis-overview.md +++ b/website/docs/docs/dbt-cloud-apis/apis-overview.md @@ -8,10 +8,11 @@ id: "overview" Accounts on the _Team_ and _Enterprise_ plans can query the dbt Cloud APIs. -dbt Cloud provides two APIs: +dbt Cloud provides the following APIs: - The [dbt Cloud Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) can be used to administrate a dbt Cloud account. - The [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api) can be used to fetch metadata related to the state and health of your dbt project. +- The [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) provides multiple API options which allow you to query your metrics defined in the dbt Semantic Layer. If you want to learn more about webhooks, refer to [Webhooks for your jobs](/docs/deploy/webhooks). diff --git a/website/docs/docs/dbt-cloud-apis/discovery-api.md b/website/docs/docs/dbt-cloud-apis/discovery-api.md index 16c9bc16ec4..e4441aa55a2 100644 --- a/website/docs/docs/dbt-cloud-apis/discovery-api.md +++ b/website/docs/docs/dbt-cloud-apis/discovery-api.md @@ -2,9 +2,9 @@ title: "About the Discovery API" --- -Every time dbt Cloud runs a project, it generates and stores information about the project. The metadata includes details about your project’s models, sources, and other nodes along with their execution results. With the dbt Cloud Discovery API, you can query this comprehensive information to gain a better understanding of your DAG and the data it produces. +Every time dbt Cloud runs a project, it generates and stores information about the project. The metadata includes details about your project’s models, sources, and other nodes along with their execution results. With the dbt Cloud Discovery API, you can query this comprehensive information to gain a better understanding of your DAG and the data it produces. -By leveraging the metadata in dbt Cloud, you can create systems for data monitoring and alerting, lineage exploration, and automated reporting. This can help you improve data discovery, data quality, and pipeline operations within your organization. +By leveraging the metadata in dbt Cloud, you can create systems for data monitoring and alerting, lineage exploration, and automated reporting. This can help you improve data discovery, data quality, and pipeline operations within your organization. You can access the Discovery API through [ad hoc queries](/docs/dbt-cloud-apis/discovery-querying), custom applications, a wide range of [partner ecosystem integrations](https://www.getdbt.com/product/integrations/) (like BI/analytics, catalog and governance, and quality and observability), and by using dbt Cloud features like [model timing](/docs/deploy/run-visibility#model-timing) and [dashboard status tiles](/docs/deploy/dashboard-status-tiles). @@ -17,13 +17,13 @@ You can query the dbt Cloud metadata: - At the job level for results on a specific dbt Cloud job run for a given resource type, like `models` or `test`. :::tip Public Preview -The Discovery API is currently available in Public Preview for dbt Cloud accounts on a Team or Enterprise plan. 
It’s available to all multi-tenant and to only select single-tenant accounts (please ask your account team to confirm). Preview features are stable and can be considered for production deployments, but there might still be some planned additions and modifications to product behavior before moving to General Availability. For details, refer to [dbt Product lifecycles](/docs/dbt-versions/product-lifecycles). +The Discovery API is currently available in Public Preview for dbt Cloud accounts on a Team or Enterprise plan. It’s available to all multi-tenant and to only select single-tenant accounts (please ask your account team to confirm). Preview features are stable and can be considered for production deployments, but there might still be some planned additions and modifications to product behavior before moving to General Availability. For details, refer to [dbt Product lifecycles](/docs/dbt-versions/product-lifecycles). ::: ## What you can use the Discovery API for -Click the tabs below to learn more about the API's use cases, the analysis you can do, and the results you can achieve by integrating with it. +Click the tabs below to learn more about the API's use cases, the analysis you can do, and the results you can achieve by integrating with it. To use the API directly or integrate your tool with it, refer to [Uses case and examples](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) for detailed information. @@ -33,7 +33,7 @@ To use the API directly or integrate your tool with it, refer to [Uses case and Use the API to look at historical information like model build time to determine the health of your dbt projects. Finding inefficiencies in orchestration configurations can help decrease infrastructure costs and improve timeliness. To learn more about how to do this, refer to [Performance](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#performance). -You can use, for example, the [model timing](/docs/deploy/run-visibility#model-timing) tab to help identify and optimize bottlenecks in model builds: +You can use, for example, the [model timing](/docs/deploy/run-visibility#model-timing) tab to help identify and optimize bottlenecks in model builds: @@ -53,7 +53,7 @@ Use the API to find and understand dbt assets in integrated tools using informat Data producers must manage and organize data for stakeholders, while data consumers need to quickly and confidently analyze data on a large scale to make informed decisions that improve business outcomes and reduce organizational overhead. The API is useful for discovery data experiences in catalogs, analytics, apps, and machine learning (ML) tools. It can help you understand the origin and meaning of datasets for your analysis. - + @@ -75,7 +75,7 @@ Use the API to review dataset changes and uses by examining exposures, lineage, ## Types of project state -There are two types of [project state](/docs/dbt-cloud-apis/project-state) at the environment level that you can query the results of: +There are two types of [project state](/docs/dbt-cloud-apis/project-state) at the environment level that you can query the results of: - **Definition** — The logical state of a dbt project’s [resources](/docs/build/projects) that update when the project is changed. 
- **Applied** — The output of successful dbt DAG execution that creates or describes the state of the database (for example: `dbt run`, `dbt test`, source freshness, and so on) @@ -86,5 +86,4 @@ These states allow you to easily examine the difference between a model’s defi - [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) - [Query the Discovery API](/docs/dbt-cloud-apis/discovery-querying) -- [Schema](/docs/dbt-cloud-apis/discovery-schema-model) - +- [Schema](/docs/dbt-cloud-apis/discovery-schema-job) diff --git a/website/docs/docs/dbt-cloud-apis/discovery-querying.md b/website/docs/docs/dbt-cloud-apis/discovery-querying.md index 77fed109c68..ba1365e632b 100644 --- a/website/docs/docs/dbt-cloud-apis/discovery-querying.md +++ b/website/docs/docs/dbt-cloud-apis/discovery-querying.md @@ -1,14 +1,14 @@ --- title: "Query the Discovery API" id: "discovery-querying" -sidebar_label: "Query the Discovery API" +sidebar_label: "Query the Discovery API" --- -The Discovery API supports ad-hoc queries and integrations.. If you are new to the API, read the [Discovery API overview](/docs/dbt-cloud-apis/discovery-api) for an introduction. +The Discovery API supports ad-hoc queries and integrations. If you are new to the API, refer to [About the Discovery API](/docs/dbt-cloud-apis/discovery-api) for an introduction. -Use the Discovery API to evaluate data pipeline health and project state across runs or at a moment in time. dbt Labs provide a [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql) for this API, enabling you to run queries and browse the schema. +Use the Discovery API to evaluate data pipeline health and project state across runs or at a moment in time. dbt Labs provide a [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql) for this API, enabling you to run queries and browse the schema. -Since GraphQL describes the data in the API, the schema displayed in the GraphQL explorer accurately represents the graph and fields available to query. +Since GraphQL describes the data in the API, the schema displayed in the GraphQL explorer accurately represents the graph and fields available to query. @@ -16,17 +16,17 @@ Since GraphQL describes the data in the API, the schema displayed in the GraphQL Currently, authorization of requests takes place [using a service token](/docs/dbt-cloud-apis/service-tokens). dbt Cloud admin users can generate a Metadata Only service token that is authorized to execute a specific query against the Discovery API. -Once you've created a token, you can use it in the Authorization header of requests to the dbt Cloud Discovery API. Be sure to include the Token prefix in the Authorization header, or the request will fail with a `401 Unauthorized` error. Note that `Bearer` can be used instead of `Token` in the Authorization header. Both syntaxes are equivalent. +Once you've created a token, you can use it in the Authorization header of requests to the dbt Cloud Discovery API. Be sure to include the Token prefix in the Authorization header, or the request will fail with a `401 Unauthorized` error. Note that `Bearer` can be used instead of `Token` in the Authorization header. Both syntaxes are equivalent. -## Access the Discovery API +## Access the Discovery API 1. Create a [service account token](/docs/dbt-cloud-apis/service-tokens) to authorize requests. 
dbt Cloud Admin users can generate a _Metadata Only_ service token, which can be used to execute a specific query against the Discovery API to authorize requests. -2. Find your API URL using the endpoint `https://metadata.{YOUR_ACCESS_URL}/graphql`. +2. Find your API URL using the endpoint `https://metadata.{YOUR_ACCESS_URL}/graphql`. * Replace `{YOUR_ACCESS_URL}` with the appropriate [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. For example, if your multi-tenant region is North America, your endpoint is `https://metadata.cloud.getdbt.com/graphql`. If your multi-tenant region is EMEA, your endpoint is `https://metadata.emea.dbt.com/graphql`. -3. For specific query points, refer to the [schema documentation](/docs/dbt-cloud-apis/discovery-schema-model). +3. For specific query points, refer to the [schema documentation](/docs/dbt-cloud-apis/discovery-schema-job). ## Run queries using HTTP requests @@ -36,7 +36,7 @@ You can run queries by sending a `POST` request to the `https://metadata.YOUR_AC * `YOUR_TOKEN` in the Authorization header with your actual API token. Be sure to include the Token prefix. * `QUERY_BODY` with a GraphQL query, for example `{ "query": "" }` * `VARIABLES` with a dictionary of your GraphQL query variables, such as a job ID or a filter. -* `ENDPOINT` with the endpoint you're querying, such as environment. +* `ENDPOINT` with the endpoint you're querying, such as environment. ```shell curl 'https://metadata.YOUR_ACCESS_URL/graphql' \ @@ -48,10 +48,13 @@ You can run queries by sending a `POST` request to the `https://metadata.YOUR_AC Python example: -```py -response = requests.post('YOUR_ACCESS_URL', -headers={"authorization": "Bearer "+YOUR_TOKEN, "content-type": "application/json"}, -json={"query": QUERY_BODY, "variables": VARIABLES}) +```python +response = requests.post( + 'YOUR_ACCESS_URL', + headers={"authorization": "Bearer "+YOUR_TOKEN, "content-type": "application/json"}, + json={"query": QUERY_BODY, "variables": VARIABLES} +) + metadata = response.json()['data'][ENDPOINT] ``` @@ -63,75 +66,82 @@ There are several illustrative example queries on this page. For more examples, ## Reasonable use Discovery (GraphQL) API usage is subject to request rate and response size limits to maintain the performance and stability of the metadata platform and prevent abuse. -- The current request rate limit is 200 requests for a given IP address within a minute. If you exceed this limit, you will receive an HTTP 429 response status. -- Environment-level endpoints will be subject to response size limits in the future. The depth of the graph should not exceed three levels. A user can paginate up to 500 items per query. -- Job-level endpoints are subject to query complexity limits. Nested nodes (like parents), code (like rawCode), and catalog columns are considered as most complex. Overly complex queries should be broken up into separate queries with only necessary fields included. dbt Labs recommends using the environment endpoint instead for most use cases to get the latest descriptive and result metadata for a dbt Cloud project. + +Job-level endpoints are subject to query complexity limits. Nested nodes (like parents), code (like rawCode), and catalog columns are considered as most complex. Overly complex queries should be broken up into separate queries with only necessary fields included. dbt Labs recommends using the environment endpoint instead for most use cases to get the latest descriptive and result metadata for a dbt Cloud project. 
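If your integration sends many requests in a short window, a small client-side helper can keep it within these limits. The following is a minimal sketch, not part of the API itself: it assumes the North America multi-tenant endpoint, a Metadata Only service token you supply, and that rate-limited requests are rejected with an HTTP 429 status, retrying with a simple exponential backoff. A helper along these lines is reused in a later sketch on this page.

```python
import time
import requests

API_URL = "https://metadata.cloud.getdbt.com/graphql"  # North America multi-tenant endpoint


def run_query(query, variables, token, max_retries=5):
    """POST a Discovery API query, backing off briefly if the rate limit is hit."""
    headers = {"authorization": "Bearer " + token, "content-type": "application/json"}
    for attempt in range(max_retries):
        response = requests.post(
            API_URL, headers=headers, json={"query": query, "variables": variables}
        )
        if response.status_code == 429:  # request rejected for exceeding the rate limit
            time.sleep(2 ** attempt)     # simple exponential backoff before retrying
            continue
        response.raise_for_status()
        return response.json()["data"]
    raise RuntimeError("Discovery API rate limit retries exhausted")
```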
## Retention limits You can use the Discovery API to query data from the previous three months. For example, if today was April 1st, you could query data back to January 1st. ## Run queries with the GraphQL explorer -You can run ad-hoc queries directly in the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and use the document explorer on the left-hand side, where you can see all possible nodes and fields. +You can run ad-hoc queries directly in the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and use the document explorer on the left-hand side to see all possible nodes and fields. + +Refer to the [Apollo explorer documentation](https://www.apollographql.com/docs/graphos/explorer/explorer) for setup and authorization info. -Refer to the [Apollo explorer documentation](https://www.apollographql.com/docs/graphos/explorer/explorer) for setup and authorization info. +1. Access the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and select fields you want to query. -1. Access the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and select fields you'd like query. +2. Select **Variables** at the bottom of the explorer and replace any `null` fields with your unique values. -2. Go to **Variables** at the bottom of the explorer and replace any `null` fields with your unique values. +3. [Authenticate](https://www.apollographql.com/docs/graphos/explorer/connecting-authenticating#authentication) using Bearer auth with `YOUR_TOKEN`. Select **Headers** at the bottom of the explorer and select **+New header**. -3. [Authenticate](https://www.apollographql.com/docs/graphos/explorer/connecting-authenticating#authentication) via Bearer auth with `YOUR_TOKEN`. Go to **Headers** at the bottom of the explorer and select **+New header**. +4. Select **Authorization** in the **header key** dropdown list and enter your Bearer auth token in the **value** field. Remember to include the Token prefix. Your header key should be in this format: `{"Authorization": "Bearer }`. + + + -4. Select **Authorization** in the **header key** drop-down list and enter your Bearer auth token in the **value** field. Remember to include the Token prefix. Your header key should look like this `{"Authorization": "Bearer }`.
    -5. Run your query by pressing the blue query button in the top-right of the Operation editor (to the right of the query). You should see a successful query response on the right side of the explorer. +1. Run your query by clicking the blue query button in the top right of the **Operation** editor (to the right of the query). You should see a successful query response on the right side of the explorer. + + + ### Fragments -Use the [`..on`](https://www.apollographql.com/docs/react/data/fragments/) notation to query across lineage and retrieve results from specific node types. +Use the [`... on`](https://www.apollographql.com/docs/react/data/fragments/) notation to query across lineage and retrieve results from specific node types. ```graphql - -environment(id: $environmentId) { - applied { - models(first: $first,filter:{uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - name - ancestors(types:[Model, Source, Seed, Snapshot]) { - ... on ModelAppliedStateNode { - name - resourceType - materializedType - executionInfo { - executeCompletedAt +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" }) { + edges { + node { + name + ancestors(types: [Model, Source, Seed, Snapshot]) { + ... on ModelAppliedStateNestedNode { + name + resourceType + materializedType + executionInfo { + executeCompletedAt + } } - } - ... on SourceAppliedStateNode { - sourceName - name - resourceType - freshness { - maxLoadedAt + ... on SourceAppliedStateNestedNode { + sourceName + name + resourceType + freshness { + maxLoadedAt + } } - } - ... on SnapshotAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt + ... on SnapshotAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } } - } - ... on SeedAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt + ... on SeedAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } } } } @@ -140,56 +150,59 @@ environment(id: $environmentId) { } } } - ``` ### Pagination -Querying large datasets can impact performance on multiple functions in the API pipeline. Pagination eases the burden by returning smaller data sets one page at a time. This is useful for returning a particular portion of the dataset or the entire dataset piece-by-piece to enhance performance. dbt Cloud utilizes cursor-based pagination, which makes it easy to return pages of constantly changing data. +Querying large datasets can impact performance on multiple functions in the API pipeline. Pagination eases the burden by returning smaller data sets one page at a time. This is useful for returning a particular portion of the dataset or the entire dataset piece-by-piece to enhance performance. dbt Cloud utilizes cursor-based pagination, which makes it easy to return pages of constantly changing data. -Use the `PageInfo` object to return information about the page. The following fields are available: +Use the `PageInfo` object to return information about the page. The available fields are: -- `startCursor` string type - corresponds to the first `node` in the `edge`. -- `endCursor` string type - corresponds to the last `node` in the `edge`. -- `hasNextPage` boolean type - whether there are more `nodes` after the returned results. -- `hasPreviousPage` boolean type - whether `nodes` exist before the returned results. 
+- `startCursor` string type — Corresponds to the first `node` in the `edge`. +- `endCursor` string type — Corresponds to the last `node` in the `edge`. +- `hasNextPage` boolean type — Whether or not there are more `nodes` after the returned results. There are connection variables available when making the query: -- `first` integer type - will return the first 'n' `nodes` for each page, up to 500. -- `after` string type sets the cursor to retrieve `nodes` after. It's best practice to set the `after` variable with the object ID defined in the `endcursor` of the previous page. +- `first` integer type — Returns the first n `nodes` for each page, up to 500. +- `after` string type — Sets the cursor to retrieve `nodes` after. It's best practice to set the `after` variable with the object ID defined in the `endCursor` of the previous page. + +Below is an example that returns the `first` 500 models `after` the specified Object ID in the variables. The `PageInfo` object returns where the object ID where the cursor starts, where it ends, and whether there is a next page. -The following example shows that we're returning the `first` 500 models `after` the specified Object ID in the variables. The `PageInfo` object will return where the object ID where the cursor starts, where it ends, and whether there is a next page. + + - + -Here is a code example of the `PageInfo` object: +Below is a code example of the `PageInfo` object: ```graphql pageInfo { - startCursor - endCursor - hasNextPage - } - totalCount # Total number of pages - + startCursor + endCursor + hasNextPage +} +totalCount # Total number of records across all pages ``` ### Filters -Filtering helps to narrow down the results of an API query. Want to query and return only models and tests that are failing? Or find models that are taking too long to run? You can fetch execution details such as [`executionTime`](/docs/dbt-cloud-apis/discovery-schema-models#fields), [`runElapsedTime`](/docs/dbt-cloud-apis/discovery-schema-models#fields), or [`status`](/docs/dbt-cloud-apis/discovery-schema-models#fields). This helps data teams monitor the performance of their models, identify bottlenecks, and optimize the overall data pipeline. +Filtering helps to narrow down the results of an API query. If you want to query and return only models and tests that are failing or find models that are taking too long to run, you can fetch execution details such as [`executionTime`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields), [`runElapsedTime`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields), or [`status`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields). This helps data teams monitor the performance of their models, identify bottlenecks, and optimize the overall data pipeline. -In the following example, we can see that we're filtering results to models that have succeeded on their `lastRunStatus`: +Below is an example that filters for results of models that have succeeded on their `lastRunStatus`: - + -Here is a code example that filters for models that have an error on their last run and tests that have failed: +Below is an example that filters for models that have an error on their last run and tests that have failed: -```graphql + + -environment(id: $environmentId) { +```graphql +query ModelsAndTests($environmentId: BigInt!, $first: Int!) 
{ + environment(id: $environmentId) { applied { - models(first: $first, filter: {lastRunStatus:error}) { + models(first: $first, filter: { lastRunStatus: error }) { edges { node { name @@ -199,7 +212,7 @@ environment(id: $environmentId) { } } } - tests(first: $first, filter: {status:"fail"}) { + tests(first: $first, filter: { status: "fail" }) { edges { node { name @@ -208,12 +221,13 @@ environment(id: $environmentId) { } } } - } + } + } + } } - ``` ## Related content - [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) -- [Schema](/docs/dbt-cloud-apis/discovery-schema-model) +- [Schema](/docs/dbt-cloud-apis/discovery-schema-job) diff --git a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md index 030688d9aeb..8efb1ec0d37 100644 --- a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md +++ b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md @@ -3,9 +3,9 @@ title: "Use cases and examples for the Discovery API" sidebar_label: "Uses and examples" --- -With the Discovery API, you can query the metadata in dbt Cloud to learn more about your dbt deployments and the data it generates to analyze them and make improvements. +With the Discovery API, you can query the metadata in dbt Cloud to learn more about your dbt deployments and the data it generates to analyze them and make improvements. -You can use the API in a variety of ways to get answers to your business questions. Below describes some of the uses of the API and is meant to give you an idea of the questions this API can help you answer. +You can use the API in a variety of ways to get answers to your business questions. Below describes some of the uses of the API and is meant to give you an idea of the questions this API can help you answer. | Use Case | Outcome | Example Questions | | --- | --- | --- | @@ -17,13 +17,13 @@ You can use the API in a variety of ways to get answers to your business questio ## Performance -You can use the Discovery API to identify inefficiencies in pipeline execution to reduce infrastructure costs and improve timeliness. Below are example questions and queries you can run. +You can use the Discovery API to identify inefficiencies in pipeline execution to reduce infrastructure costs and improve timeliness. Below are example questions and queries you can run. For performance use cases, people typically query the historical or latest applied state across any part of the DAG (for example, models) using the `environment`, `modelByEnvironment`, or job-level endpoints. ### How long did each model take to run? -It’s helpful to understand how long it takes to build models (tables) and tests to execute during a dbt run. Longer model build times result in higher infrastructure costs and fresh data arriving later to stakeholders. Analyses like these can be in observability tools or ad-hoc queries, like in a notebook. +It’s helpful to understand how long it takes to build models (tables) and tests to execute during a dbt run. Longer model build times result in higher infrastructure costs and fresh data arriving later to stakeholders. Analyses like these can be in observability tools or ad-hoc queries, like in a notebook. @@ -35,33 +35,42 @@ Data teams can monitor the performance of their models, identify bottlenecks, an 1. Use latest state environment-level API to get a list of all executed models and their execution time. 
Then, sort the models by `executionTime` in descending order. ```graphql -query Query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first) { - edges { - node { - name - uniqueId - materializedType - executionInfo { - lastSuccessRunId - executionTime - executeStartedAt - } - } - } +query AppliedModels($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + name + uniqueId + materializedType + executionInfo { + lastSuccessRunId + executionTime + executeStartedAt } + } } + } } + } } ``` -2. Get the most recent 20 run results for the longest running model. Review the results of the model across runs, or you can go to the job/run or commit itself to investigate further. +2. Get the most recent 20 run results for the longest running model. Review the results of the model across runs or you can go to the job/run or commit itself to investigate further. ```graphql -query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int!) { - modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId, lastRunCount: $lastRunCount) { +query ModelHistoricalRuns( + $environmentId: BigInt! + $uniqueId: String + $lastRunCount: Int +) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns( + uniqueId: $uniqueId + lastRunCount: $lastRunCount + ) { name runId runElapsedTime @@ -70,12 +79,15 @@ query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int!) { executeStartedAt executeCompletedAt status + } } + } } ``` 3. Use the query results to plot a graph of the longest running model’s historical run time and execution time trends. + ```python # Import libraries import os @@ -88,11 +100,11 @@ auth_token = *[SERVICE_TOKEN_HERE]* # Query the API def query_discovery_api(auth_token, gql_query, variables): - response = requests.post('https://metadata.cloud.getdbt.com/graphql', + response = requests.post('https://metadata.cloud.getdbt.com/graphql', headers={"authorization": "Bearer "+auth_token, "content-type": "application/json"}, json={"query": gql_query, "variables": variables}) data = response.json()['data'] - + return data # Get the latest run metadata for all models @@ -120,7 +132,7 @@ variables_query_two = { } # Get the historical run metadata for the longest running model -model_historical_metadata = query_discovery_api(auth_token, query_two, variables_query_two)['modelByEnvironment'] +model_historical_metadata = query_discovery_api(auth_token, query_two, variables_query_two)['environment']['applied']['modelHistoricalRuns'] # Convert to dataframe model_df = pd.DataFrame(model_historical_metadata) @@ -143,7 +155,8 @@ plt.plot(model_df['executeStartedAt'], model_df['executionTime']) plt.title(model_df['name'].iloc[0]+" Execution Time") plt.show() ``` -Plotting examples: + +Plotting examples: @@ -152,70 +165,91 @@ Plotting examples:
    -### What’s the latest state of each model? +### What’s the latest state of each model? The Discovery API provides information about the applied state of models and how they arrived in that state. You can retrieve the status information from the most recent run and most recent successful run (execution) from the `environment` endpoint and dive into historical runs using job-based and `modelByEnvironment` endpoints.
    Example query -The API returns full identifier information (`database.schema.alias`) and the `executionInfo` for both the most recent run and most recent successful run from the database: - - - ```graphql - query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first) { - edges { - node { - uniqueId - compiledCode - database - schema - alias - materializedType - executionInfo { - executeCompletedAt - lastJobDefinitionId - lastRunGeneratedAt - lastRunId - lastRunStatus - lastRunError - lastSuccessJobDefinitionId - runGeneratedAt - lastSuccessRunId - } - } - } - } - } - } - } - ``` +The API returns full identifier information (`database.schema.alias`) and the `executionInfo` for both the most recent run and most recent successful run from the database: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + compiledCode + database + schema + alias + materializedType + executionInfo { + executeCompletedAt + lastJobDefinitionId + lastRunGeneratedAt + lastRunId + lastRunStatus + lastRunError + lastSuccessJobDefinitionId + runGeneratedAt + lastSuccessRunId + } + } + } + } + } + } +} +```
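After the response comes back, a few lines of post-processing can surface models whose most recent run errored. This is only an illustrative sketch; it assumes the query response above has already been parsed into a `data` dictionary (for example, `response.json()['data']`):

```python
# "data" is the parsed JSON body of the query above, for example response.json()["data"]
for edge in data["environment"]["applied"]["models"]["edges"]:
    node = edge["node"]
    info = node["executionInfo"]
    if info["lastRunStatus"] == "error":
        # Print the fully qualified relation and the error from its latest run
        print(f'{node["database"]}.{node["schema"]}.{node["alias"]}: {info["lastRunError"]}')
```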
    ### What happened with my job run? -You can query the metadata at the job level to review results for specific runs. This is helpful for historical analysis of deployment performance or optimizing particular jobs. +You can query the metadata at the job level to review results for specific runs. This is helpful for historical analysis of deployment performance or optimizing particular jobs. + +import DiscoveryApiJobDeprecationNotice from '/snippets/_discovery_api_job_deprecation_notice.md'; + +
    Example query +Deprecated example: ```graphql -query($jobId: Int!, $runId: Int!){ - models(jobId: $jobId, runId: $runId) { - name - status - tests { - name - status - } - } +query ($jobId: Int!, $runId: Int!) { + models(jobId: $jobId, runId: $runId) { + name + status + tests { + name + status + } + } +} +``` + +New example: + +```graphql +query ($jobId: BigInt!, $runId: BigInt!) { + job(id: $jobId, runId: $runId) { + models { + name + status + tests { + name + status + } + } + } } ``` - +
    ### What’s changed since the last run? @@ -228,41 +262,47 @@ With the API, you can compare the `rawCode` between the definition and applied s ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first, filter: {uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - rawCode - ancestors(types: [Source]){ - ...on SourceAppliedStateNode { - freshness { - maxLoadedAt - } - } - } - executionInfo { - runGeneratedAt - executeCompletedAt - } - materializedType - } - } - } - } - definition { - models(first: $first, filter: {uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - rawCode - runGeneratedAt - materializedType - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + rawCode + ancestors(types: [Source]) { + ... on SourceAppliedStateNestedNode { + freshness { + maxLoadedAt + } + } + } + executionInfo { + runGeneratedAt + executeCompletedAt + } + materializedType + } + } + } + } + definition { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + rawCode + runGeneratedAt + materializedType + } + } + } + } + } } ``` @@ -270,45 +310,46 @@ query($environmentId: Int!, $first: Int!){ ## Quality -You can use the Discovery API to monitor data source freshness and test results to diagnose and resolve issues and drive trust in data. When used with [webhooks](/docs/deploy/webhooks), can also help with detecting, investigating, and alerting issues. Below lists example questions the API can help you answer. Below are example questions and queries you can run. +You can use the Discovery API to monitor data source freshness and test results to diagnose and resolve issues and drive trust in data. When used with [webhooks](/docs/deploy/webhooks), can also help with detecting, investigating, and alerting issues. Below lists example questions the API can help you answer. Below are example questions and queries you can run. -For quality use cases, people typically query the historical or latest applied state, often in the upstream part of the DAG (for example, sources), using the `environment` or `modelByEnvironment` endpoints. +For quality use cases, people typically query the historical or latest applied state, often in the upstream part of the DAG (for example, sources), using the `environment` or `environment { applied { modelHistoricalRuns } }` endpoints. ### Which models and tests failed to run? + By filtering on the latest status, you can get lists of models that failed to build and tests that failed during their most recent execution. This is helpful when diagnosing issues with the deployment that result in delayed or incorrect data.
    Example query with code -1. Get the latest run results across all jobs in the environment and return only the models and tests that errored/failed. +1. Get the latest run results across all jobs in the environment and return only the models and tests that errored/failed. ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first, filter: {lastRunStatus:error}) { - edges { - node { - name - executionInfo { - lastRunId - } - } - } - } - tests(first: $first, filter: {status:"fail"}) { - edges { - node { - name - executionInfo { - lastRunId - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { lastRunStatus: error }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + tests(first: $first, filter: { status: "fail" }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + } + } } ``` @@ -316,14 +357,18 @@ query($environmentId: Int!, $first: Int!){ ```graphql -query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int) { - modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId, lastRunCount: $lastRunCount) { - name - executeStartedAt - status - tests { - name - status +query ($environmentId: BigInt!, $uniqueId: String!, $lastRunCount: Int) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId, lastRunCount: $lastRunCount) { + name + executeStartedAt + status + tests { + name + status + } + } } } } @@ -337,63 +382,67 @@ query($environmentId: Int!, $uniqueId: String!, $lastRunCount: Int) { ### When was the data my model uses last refreshed? -You can get the metadata on the latest execution for a particular model or across all models in your project. For instance, investigate when each model or snapshot that's feeding into a given model was last executed or the source or seed was last loaded to gauge the _freshness_ of the data. +You can get the metadata on the latest execution for a particular model or across all models in your project. For instance, investigate when each model or snapshot that's feeding into a given model was last executed or the source or seed was last loaded to gauge the _freshness_ of the data.
    Example query with code ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first,filter:{uniqueIds:"MODEL.PROJECT.MODEL_NAME"}) { - edges { - node { - name - ancestors(types:[Model, Source, Seed, Snapshot]) { - ... on ModelAppliedStateNode { - name - resourceType - materializedType - executionInfo { - executeCompletedAt - } - } - ... on SourceAppliedStateNode { - sourceName - name - resourceType - freshness { - maxLoadedAt - } - } - ... on SnapshotAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt - } - } - ... on SeedAppliedStateNode { - name - resourceType - executionInfo { - executeCompletedAt - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + name + ancestors(types: [Model, Source, Seed, Snapshot]) { + ... on ModelAppliedStateNestedNode { + name + resourceType + materializedType + executionInfo { + executeCompletedAt + } + } + ... on SourceAppliedStateNestedNode { + sourceName + name + resourceType + freshness { + maxLoadedAt + } + } + ... on SnapshotAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + ... on SeedAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + } + } + } + } + } + } } ``` + ```python # Extract graph nodes from response -def extract_nodes(data): +def extract_nodes(data): models = [] sources = [] groups = [] @@ -422,9 +471,9 @@ def create_freshness_graph(models_df, sources_df): if model["executionInfo"]["executeCompletedAt"] is not None: model_freshness = current_time - pd.Timestamp(model["executionInfo"]["executeCompletedAt"]) for ancestor in model["ancestors"]: - if ancestor["resourceType"] == "SourceAppliedStateNode": + if ancestor["resourceType"] == "SourceAppliedStateNestedNode": ancestor_freshness = current_time - pd.Timestamp(ancestor["freshness"]['maxLoadedAt']) - elif ancestor["resourceType"] == "ModelAppliedStateNode": + elif ancestor["resourceType"] == "ModelAppliedStateNestedNode": ancestor_freshness = current_time - pd.Timestamp(ancestor["executionInfo"]["executeCompletedAt"]) if ancestor_freshness > max_freshness: @@ -437,11 +486,11 @@ def create_freshness_graph(models_df, sources_df): for _, model in models_df.iterrows(): for parent in model["parents"]: G.add_edge(parent["uniqueId"], model["uniqueId"]) - + return G ``` -Graph example: +Graph example: @@ -450,7 +499,7 @@ Graph example: ### Are my data sources fresh? -Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshness) allows you to ensure that sources loaded and used in your dbt project are compliant with expectations. The API provides the latest metadata about source loading and information about the freshness check criteria. +Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshness) allows you to ensure that sources loaded and used in your dbt project are compliant with expectations. The API provides the latest metadata about source loading and information about the freshness check criteria. 
@@ -458,47 +507,49 @@ Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshne Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - sources(first: $first, filters:{freshnessChecked:true, database:"production"}) { - edges { - node { - sourceName - name - identifier - loader - freshness { - freshnessJobDefinitionId - freshnessRunId - freshnessRunGeneratedAt - freshnessStatus - freshnessChecked - maxLoadedAt - maxLoadedAtTimeAgoInS - snapshottedAt - criteria { - errorAfter { - count - period - } - warnAfter { - count - period - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + sources( + first: $first + filter: { freshnessChecked: true, database: "production" } + ) { + edges { + node { + sourceName + name + identifier + loader + freshness { + freshnessJobDefinitionId + freshnessRunId + freshnessRunGeneratedAt + freshnessStatus + freshnessChecked + maxLoadedAt + maxLoadedAtTimeAgoInS + snapshottedAt + criteria { + errorAfter { + count + period + } + warnAfter { + count + period + } + } + } + } + } + } + } + } } ```
    - ### What’s the test coverage and status? [Tests](https://docs.getdbt.com/docs/build/tests) are an important way to ensure that your stakeholders are reviewing high-quality data. You can execute tests during a dbt Cloud run. The Discovery API provides complete test results for a given environment or job, which it represents as the `children` of a given node that’s been tested (for example, a `model`). @@ -506,32 +557,32 @@ query($environmentId: Int!, $first: Int!){
    Example query -For the following example, the `parents` are the nodes (code) that's being tested and `executionInfo` describes the latest test results: +For the following example, the `parents` are the nodes (code) that's being tested and `executionInfo` describes the latest test results: ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - tests(first: $first) { - edges { - node { - name - columnName - parents { - name - resourceType - } - executionInfo { - lastRunStatus - lastRunError - executeCompletedAt - executionTime - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + tests(first: $first) { + edges { + node { + name + columnName + parents { + name + resourceType + } + executionInfo { + lastRunStatus + lastRunError + executeCompletedAt + executionTime + } + } + } + } + } + } } ``` @@ -541,44 +592,41 @@ query($environmentId: Int!, $first: Int!){ ### How is this model contracted and versioned? -To enforce the shape of a model's definition, you can define contracts on models and their columns. You can also specify model versions to keep track of discrete stages in its evolution and use the appropriate one. +To enforce the shape of a model's definition, you can define contracts on models and their columns. You can also specify model versions to keep track of discrete stages in its evolution and use the appropriate one. + +
    Example query ```graphql -query{ - environment(id:123) { - definition { - models(first:100, filter:{access:public}) { - edges { - nodes { - name - latest_version - contract_enforced - constraints{ - name - type - expression - columns - } - catalog { - columns { - name - type - constraints { - name - type - expression - } - } - } - } - } - } - } - } +query { + environment(id: 123) { + applied { + models(first: 100, filter: { access: public }) { + edges { + node { + name + latestVersion + contractEnforced + constraints { + name + type + expression + columns + } + catalog { + columns { + name + type + } + } + } + } + } + } + } } ``` @@ -594,42 +642,50 @@ For discovery use cases, people typically query the latest applied or definition ### What does this dataset and its columns mean? -Query the Discovery API to map a table/view in the data platform to the model in the dbt project; then, retrieve metadata about its meaning, including descriptive metadata from its YAML file and catalog information from its YAML file and the schema. - +Query the Discovery API to map a table/view in the data platform to the model in the dbt project; then, retrieve metadata about its meaning, including descriptive metadata from its YAML file and catalog information from its YAML file and the schema.
    Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - models(first: $first, filter: {database:"analytics", schema:"prod", identifier:"customers"}) { - edges { - node { - name - description - tags - meta - catalog { - columns { - name - description - type - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { + database: "analytics" + schema: "prod" + identifier: "customers" + } + ) { + edges { + node { + name + description + tags + meta + catalog { + columns { + name + description + type + } + } + } + } + } + } + } } ```
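To publish this metadata in a catalog or internal wiki, you can flatten the response into a simple data dictionary. Below is a minimal sketch, assuming the parsed response from the query above is in `data`; it renders one Markdown table per model and skips models without catalog information:

```python
# "data" is the parsed JSON body of the query above
for edge in data["environment"]["applied"]["models"]["edges"]:
    node = edge["node"]
    if not node["catalog"]:
        continue  # no catalog info available for this model
    print(f'## {node["name"]}\n\n{node["description"] or ""}\n')
    print("| Column | Type | Description |")
    print("| --- | --- | --- |")
    for column in node["catalog"]["columns"]:
        print(f'| {column["name"]} | {column["type"]} | {column["description"] or ""} |')
    print()
```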
    + + -### Which metrics are available? +### Which metrics are available? -Metric definitions are coming soon to the Discovery API with dbt v1.6. You’ll be able to query metrics using the dbt Semantic Layer, use them for documentation purposes (like for a data catalog), and calculate aggregations (like in a BI tool that doesn’t query the SL). +You can define and query metrics using the [dbt Semantic Layer](/docs/build/about-metricflow), use them for documentation purposes (like for a data catalog), and calculate aggregations (like in a BI tool that doesn’t query the SL). To learn more, refer to [Get started with MetricFlow](/docs/build/sl-getting-started).
    Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - definition { - metrics(first: $first) { - edges { - node { - name - description - type - formula - filter - tags - parents { - name - resourceType - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + metrics(first: $first) { + edges { + node { + name + description + type + formula + filter + tags + parents { + name + resourceType + } + } + } + } + } + } } ``` @@ -912,7 +952,7 @@ query($environmentId: Int!, $first: Int!){ -## Governance +## Governance You can use the Discovery API to audit data development and facilitate collaboration within and between teams. @@ -923,95 +963,98 @@ For governance use cases, people tend to query the latest definition state, ofte You can define and surface the groups each model is associated with. Groups contain information like owner. This can help you identify which team owns certain models and who to contact about them.
    -Example query +Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - model(first: $first, filter:{uniqueIds:["MODEL.PROJECT.NAME"]}) { - edges { - node { - name - description - resourceType - access - group - } - } - } - } - definition { - groups(first: $first) { - edges { - node { - name - resourceType - models { - name - } - owner_name - owner_email - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { uniqueIds: ["MODEL.PROJECT.NAME"] }) { + edges { + node { + name + description + resourceType + access + group + } + } + } + } + definition { + groups(first: $first) { + edges { + node { + name + resourceType + models { + name + } + ownerName + ownerEmail + } + } + } + } + } } ```
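From the `definition.groups` portion of the response, you can build a lookup from model name to owner contact details. A minimal sketch, assuming the parsed response from the query above is in `data` and using a hypothetical `customers` model for the lookup:

```python
# "data" is the parsed JSON body of the query above
owners = {}
for edge in data["environment"]["definition"]["groups"]["edges"]:
    group = edge["node"]
    for model in group["models"]:
        owners[model["name"]] = {
            "group": group["name"],
            "owner_name": group["ownerName"],
            "owner_email": group["ownerEmail"],
        }

# Who owns the (hypothetical) customers model, and who should be contacted about it?
print(owners.get("customers"))
```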
    ### Who can use this model? -You can enable users the ability to specify the level of access for a given model. In the future, public models will function like APIs to unify project lineage and enable reuse of models using cross-project refs. +You can enable people the ability to specify the level of access for a given model. In the future, public models will function like APIs to unify project lineage and enable reuse of models using cross-project refs.
    -Example query +Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - definition { - models(first: $first) { - edges { - node { - name - access - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + models(first: $first) { + edges { + node { + name + access + } + } + } + } + } } +``` --- -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - definition { - models(first: $first, filters:{access:public}) { - edges { - node { - name - } - } - } - } - } + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + models(first: $first, filter: { access: public }) { + edges { + node { + name + } + } + } + } + } } ```
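As a quick governance check, you can also summarize how many models sit at each access level from the first query's response. A minimal sketch, assuming the parsed response is in `data`:

```python
from collections import Counter

# "data" is the parsed JSON body of the first query above
access_levels = Counter(
    edge["node"]["access"]
    for edge in data["environment"]["definition"]["models"]["edges"]
)
print(access_levels)  # for example: Counter({'protected': 40, 'public': 12, 'private': 3})
```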
    -## Development +## Development You can use the Discovery API to understand dataset changes and usage and gauge impacts to inform project definition. Below are example questions and queries you can run. For development use cases, people typically query the historical or latest definition or applied state across any part of the DAG using the `environment` endpoint. ### How is this model or metric used in downstream tools? -[Exposures](/docs/build/exposures) provide a method to define how a model or metric is actually used in dashboards and other analytics tools and use cases. You can query an exposure’s definition to see how project nodes are used and query its upstream lineage results to understand the state of the data used in it, which powers use cases like a freshness and quality status tile. +[Exposures](/docs/build/exposures) provide a method to define how a model or metric is actually used in dashboards and other analytics tools and use cases. You can query an exposure’s definition to see how project nodes are used and query its upstream lineage results to understand the state of the data used in it, which powers use cases like a freshness and quality status tile. @@ -1019,47 +1062,41 @@ For development use cases, people typically query the historical or latest defin
    Example query -This example reviews an exposure and the models used in it, including when they were last executed and their test results: +Below is an example that reviews an exposure and the models used in it including when they were last executed. ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - exposures(first: $first) { - edges { - node { - name - description - owner_name - url - parents { - name - resourceType - ... on ModelAppliedStateNode { - executionInfo { - executeCompletedAt - lastRunStatus - } - tests { - executionInfo { - executeCompletedAt - lastRunStatus - } - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + exposures(first: $first) { + edges { + node { + name + description + ownerName + url + parents { + name + resourceType + ... on ModelAppliedStateNestedNode { + executionInfo { + executeCompletedAt + lastRunStatus + } + } + } + } + } + } + } + } } ```
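To drive something like a status tile, you can roll the parent models' latest run statuses up to a single value per exposure. This sketch is illustrative only and assumes the parsed response from the query above is in `data`:

```python
# "data" is the parsed JSON body of the exposures query above
for edge in data["environment"]["applied"]["exposures"]["edges"]:
    exposure = edge["node"]
    parent_statuses = [
        parent["executionInfo"]["lastRunStatus"]
        for parent in exposure["parents"]
        if "executionInfo" in parent  # only model parents return executionInfo in this query
    ]
    tile = "error" if "error" in parent_statuses else "ok"
    print(f'{exposure["name"]} ({exposure["url"]}): {tile}')
```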
    -### How has this model changed over time? -The Discovery API provides historical information about any resource in your project. For instance, you can view how a model has evolved over time (across recent runs) given changes to its shape and contents. +### How has this model changed over time? +The Discovery API provides historical information about any resource in your project. For instance, you can view how a model has evolved over time (across recent runs) given changes to its shape and contents.
    Example query @@ -1067,54 +1104,69 @@ The Discovery API provides historical information about any resource in your pro Review the differences in `compiledCode` or `columns` between runs or plot the “Approximate Size” and “Row Count” `stats` over time: ```graphql -query(environmentId: Int!, uniqueId: String!, lastRunCount: Int!, withCatalog: Boolean!){ - modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId, lastRunCount: $lastRunCount, withCatalog: $withCatalog) { - name - compiledCode - columns { - name - } - stats { - label - value - } - } +query ( + $environmentId: BigInt! + $uniqueId: String! + $lastRunCount: Int! + $withCatalog: Boolean! +) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns( + uniqueId: $uniqueId + lastRunCount: $lastRunCount + withCatalog: $withCatalog + ) { + name + compiledCode + columns { + name + } + stats { + label + value + } + } + } + } } ```
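To plot those stats, flatten the historical runs into a table first. Below is a minimal sketch using pandas; it assumes the parsed response from the query above is in `data`, and in practice you would likely add a timestamp field such as `runGeneratedAt` to the query to use as the x-axis:

```python
import pandas as pd

# "data" is the parsed JSON body of the query above
runs = data["environment"]["applied"]["modelHistoricalRuns"]

rows = []
for run in runs:
    stats = {stat["label"]: stat["value"] for stat in run["stats"]}
    rows.append({"name": run["name"], **stats})

stats_df = pd.DataFrame(rows)
# Columns such as "Approximate Size" and "Row Count" can then be plotted across the run history
print(stats_df.head())
```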
    ### Which nodes depend on this data source? + dbt lineage begins with data sources. For a given source, you can look at which nodes are its children then iterate downstream to get the full list of dependencies. +Currently, querying beyond 1 generation (defined as a direct parent-to-child) is not supported. To see the grandchildren of a node, you need to make two queries: one to get the node and its children, and another to get the children nodes and their children.
    Example query ```graphql -query($environmentId: Int!, $first: Int!){ - environment(id: $environmentId) { - applied { - sources(first: $first, filter:{uniqueIds:["SOURCE_NAME.TABLE_NAME"]}) { - edges { - node { - loader - children { - uniqueId - resourceType - ... on ModelAppliedStateNode { - database - schema - alias - children { - uniqueId - } - } - } - } - } - } - } - } +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + sources( + first: $first + filter: { uniqueIds: ["SOURCE_NAME.TABLE_NAME"] } + ) { + edges { + node { + loader + children { + uniqueId + resourceType + ... on ModelAppliedStateNestedNode { + database + schema + alias + } + } + } + } + } + } + } } ```
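Because each response only returns one generation, walking further downstream means feeding the children from one response into the next query. Below is a minimal sketch of that second hop; it assumes a `run_query(query, variables, token)` helper like the earlier rate-limit sketch on this page, an `ENVIRONMENT_ID` and `TOKEN` that you supply, and that the parsed source response is in `source_data`. Check the GraphQL explorer for the exact filter argument types in your schema.

```python
import json

# Collect the direct children of the source from the first query's response
children_ids = [
    child["uniqueId"]
    for edge in source_data["environment"]["applied"]["sources"]["edges"]
    for child in edge["node"]["children"]
]

# Second hop: fetch those child models and their own children (the source's grandchildren).
grandchildren_query = """
query ($environmentId: BigInt!, $first: Int!) {
  environment(id: $environmentId) {
    applied {
      models(first: $first, filter: { uniqueIds: %s }) {
        edges { node { uniqueId children { uniqueId resourceType } } }
      }
    }
  }
}
""" % json.dumps(children_ids)

grandchildren = run_query(
    grandchildren_query,
    {"environmentId": ENVIRONMENT_ID, "first": 500},
    TOKEN,
)
```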
    diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx new file mode 100644 index 00000000000..d1463f9e9b7 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx @@ -0,0 +1,50 @@ +--- +title: "Model Historical Runs object schema" +sidebar_label: "Model historical runs" +id: "discovery-schema-environment-applied-modelHistoricalRuns" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The model historical runs object allows you to query information about a model's run history. + +The [Example query](#example-query) illustrates a few fields you can query with the `modelHistoricalRuns` object. Refer to [Fields](#fields) to view the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `modelHistoricalRuns`, you can use the following arguments: + + + +### Example query + +You can use the `environmentId` and the model's `uniqueId` to return the model and its execution time for the last 20 times it was run, regardless of which job ran it. + +```graphql +query { + environment(id: 834) { + applied { + modelHistoricalRuns( + uniqueId: "model.marketing.customers" + lastRunCount: 20 + ) { + runId # Get historical results for a particular model + runGeneratedAt + executionTime # View build time across runs + status + tests { + name + status + executeCompletedAt + } # View test results across runs + } + } + } +} +``` + +### Fields + +When querying for `modelHistoricalRuns`, you can use the following fields: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx index 41fd5555c3f..a82bba6576d 100644 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx @@ -4,28 +4,34 @@ sidebar_label: "Environment" id: "discovery-schema-environment" --- -import { ArgsTable, SchemaTable } from "./schema"; +import { QueryArgsTable, SchemaTable } from "./schema"; -This environment object allows you to query information about a particular model based on `environmentId`. +The environment object allows you to query information about a particular model based on `environmentId`. -The [example query](#example-query) illustrates a few fields you can query in this `environment` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. +The [Example queries](#example-queries) illustrate a few fields you can query with this `environment` object. Refer to [Fields](#fields) to view the entire schema, which provides all possible fields you can query. ### Arguments When querying for `environment`, you can use the following arguments. - + +:::caution -### Example Query +dbt Labs is making changes to the Discovery API. These changes will take effect on August 15, 2023. -You can use your production environment's `id`: +The data type `Int` for `id` is being deprecated and will be replaced with `BigInt`. When the time comes, you will need to update your API call accordingly to avoid errors. 
+::: + +### Example queries + +You can use your production environment's `id`: ```graphql query Example { - environment(id: 834){ # Get the latest state of the production environment + environment(id: 834){ # Get the latest state of the production environment applied { # The state of an executed node as it exists as an object in the database models(first: 100){ # Pagination to ensure manageable response for large projects edges { node { @@ -34,8 +40,8 @@ query Example { executionInfo {executeCompletedAt, executionTime}, # Metadata from when the model was built tests {name, executionInfo{lastRunStatus, lastRunError}}, # Latest test results catalog {columns {name, description, type}, stats {label, value}}, # Catalog info - ancestors(types:[Source]) {name, ...on SourceAppliedStateNode {freshness{maxLoadedAt, freshnessStatus}}}, # Source freshness } - children {name, resourceType}}} # Immediate dependencies in lineage + ancestors(types:[Source]) {name, ...on SourceAppliedStateNode {freshness{maxLoadedAt, freshnessStatus}}}, # Source freshness } + children {name, resourceType}}} # Immediate dependencies in lineage totalCount } # Number of models in the project } definition { # The logical state of a given project node given its most recent manifest generated @@ -48,12 +54,50 @@ query Example { } ``` +With the deprecation of the data type `Int` for `id`, below is an example of replacing it with `BigInt`: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} + +``` + +With the deprecation of `modelByEnvironment`, below is an example of replacing it with `environment`: + +```graphql +query ($environmentId: BigInt!, $uniqueId: String) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } + } + } +} +``` + ### Fields When querying an `environment`, you can use the following fields. -When querying the `applied` field of `environment`, you can use the following fields. +When querying the `applied` field of `environment`, you can use the following fields. When querying the `definition` field of `environment`, you can use the following fields. diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposure.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-exposure.mdx deleted file mode 100644 index d74f12223c5..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposure.mdx +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: "Exposure object schema" -sidebar_label: "Exposure" -id: "discovery-schema-exposure" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The exposure object allows you to query information about a particular exposure. You can learn more about exposures [here](/docs/build/exposures). - -### Arguments - -When querying for an `exposure`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this exposure object. - -### Example Queries -#### Exposure information - -The example query below queries information about an exposure, including the owner's name and email, the url, and information about parent sources and parent models. 
- -```graphql -{ - exposure(jobId: 123, name: "my_awesome_exposure") { - runId - projectId - name - uniqueId - resourceType - ownerName - url - ownerEmail - parentsSources { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } - parentsModels { - uniqueId - } - } -} -``` - -### Fields -When querying for an `exposure`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposures.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-exposures.mdx deleted file mode 100644 index 5e3dcdd45a9..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-exposures.mdx +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: "Exposures object schema" -sidebar_label: "Exposures" -id: "discovery-schema-exposures" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The exposures object allows you to query information about all exposures in a given job. You can learn more about exposures [here](/docs/build/exposures). - -### Arguments - -When querying for `exposures`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this exposures object. - -### Example Queries -#### Exposures information - -The example query below queries information about all exposures in a given job, including, for each exposure, the owner's name and email, the url, and information about parent sources and parent models. - -```graphql -{ - exposures(jobId: 123) { - runId - projectId - name - uniqueId - resourceType - ownerName - url - ownerEmail - parentsSources { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } - parentsModels { - uniqueId - } - } -} -``` - -### Fields -When querying for `exposures`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx new file mode 100644 index 00000000000..58855659d05 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx @@ -0,0 +1,64 @@ +--- +title: "Exposure object schema" +sidebar_label: "Exposure" +id: "discovery-schema-job-exposure" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The exposure object allows you to query information about a particular exposure. To learn more, refer to [Add Exposures to your DAG](/docs/build/exposures). + +### Arguments + +When querying for an `exposure`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the exposure object. + +### Example query + +The example below queries information about an exposure including the owner's name and email, the URL, and information about parent sources and parent models. 
+ +```graphql +{ + job(id: 123) { + exposure(name: "my_awesome_exposure") { + runId + projectId + name + uniqueId + resourceType + ownerName + url + ownerEmail + parentsSources { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + parentsModels { + uniqueId + } + } + } +} +``` + +### Fields +When querying for an `exposure`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx new file mode 100644 index 00000000000..b4fe027e324 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx @@ -0,0 +1,65 @@ +--- +title: "Exposures object schema" +sidebar_label: "Exposures" +id: "discovery-schema-job-exposures" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The exposures object allows you to query information about all exposures in a given job. To learn more, refer to [Add Exposures to your DAG](/docs/build/exposures). + + +### Arguments + +When querying for `exposures`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the exposures object. + +### Example query + +The example below queries information about all exposures in a given job including the owner's name and email, the URL, and information about parent sources and parent models for each exposure. + +```graphql +{ + job(id: 123) { + exposures(jobId: 123) { + runId + projectId + name + uniqueId + resourceType + ownerName + url + ownerEmail + parentsSources { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + parentsModels { + uniqueId + } + } + } +} +``` + +### Fields +When querying for `exposures`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx new file mode 100644 index 00000000000..3a8a52a19cb --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx @@ -0,0 +1,58 @@ +--- +title: "Metric object schema" +sidebar_label: "Metric" +id: "discovery-schema-job-metric" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The metric object allows you to query information about [metrics](/docs/build/metrics). + +### Arguments + +When querying for a `metric`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the metric object. + +### Example query + +The example query below outputs information about a metric. You can also add any field from the model endpoint (the example simply selects name). This includes schema, database, uniqueId, columns, and more. For details, refer to [Model object schema](/docs/dbt-cloud-apis/discovery-schema-job-model). 
+ + +```graphql +{ + job(id: 123) { + metric(uniqueId: "metric.jaffle_shop.new_customers") { + uniqueId + name + packageName + tags + label + runId + description + type + sql + timestamp + timeGrains + dimensions + meta + resourceType + filters { + field + operator + value + } + model { + name + } + } + } +} +``` + +### Fields +When querying for a `metric`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx new file mode 100644 index 00000000000..174dd5b676a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx @@ -0,0 +1,60 @@ +--- +title: "Metrics object schema" +sidebar_label: "Metrics" +id: "discovery-schema-job-metrics" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The metrics object allows you to query information about [metrics](/docs/build/metrics). + + +### Arguments + +When querying for `metrics`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the metrics object. + +### Example query + +The example query returns information about all metrics for the given job. + +```graphql +{ + job(id: 123) { + metrics { + uniqueId + name + packageName + tags + label + runId + description + type + sql + timestamp + timeGrains + dimensions + meta + resourceType + filters { + field + operator + value + } + model { + name + } + } + } +} +``` + +### Fields +The metrics object can access the _same fields_ as the [metric node](/docs/dbt-cloud-apis/discovery-schema-job-metric). The difference is that the metrics object can output a list so instead of querying for fields for one specific metric, you can query for those parameters for all metrics in a run. + +When querying for `metrics`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx new file mode 100644 index 00000000000..abd1ca1b1d6 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx @@ -0,0 +1,91 @@ +--- +title: "Model object schema" +sidebar_label: "Model" +id: "discovery-schema-job-model" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The model object allows you to query information about a particular model in a given job. + +### Arguments + +When querying for a `model`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the model object. + +### Example query for finding parent models and sources + +The example query below uses the `parentsModels` and `parentsSources` fields to fetch information about a model’s parent models and parent sources. The jobID and uniqueID fields are placeholders that you will need to replace with your own values. + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + parentsModels { + runId + uniqueId + executionTime + } + parentsSources { + runId + uniqueId + state + } + } + } +} + +``` + +### Example query for model timing + +The example query below could be useful if you want to understand information around execution timing on a given model (start, end, completion). 
+ +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + runId + projectId + name + uniqueId + resourceType + executeStartedAt + executeCompletedAt + executionTime + } + } +} +``` + +### Example query for column-level information + +You can use the following example query to understand more about the columns of a given model. This query will only work if the job has generated documentation; that is, it will work with the command `dbt docs generate`. + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + columns { + name + index + type + comment + description + tags + meta + } + } + } +} +``` + + +### Fields + +When querying for a `model`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx new file mode 100644 index 00000000000..ee512f3cd97 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx @@ -0,0 +1,59 @@ +--- +title: "Models object schema" +sidebar_label: "Models" +id: "discovery-schema-job-models" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + + +The models object allows you to query information about all models in a given job. + +### Arguments + +When querying for `models`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the models object. + +### Example queries +The database, schema, and identifier arguments are all optional. This means that with this endpoint you can: + +- Find a specific model by providing `..` +- Find all of the models in a database and/or schema by providing `` and/or `` + +#### Find models by their database, schema, and identifier +The example query below finds a model by its unique database, schema, and identifier. + +```graphql +{ + job(id: 123) { + models(database:"analytics", schema: "analytics", identifier:"dim_customers") { + uniqueId + } + } +} +``` + +#### Find models by their schema +The example query below finds all models in this schema and their respective execution times. + +```graphql +{ + job(id: 123) { + models(schema: "analytics") { + uniqueId + executionTime + } + } +} +``` + + +### Fields +The models object can access the _same fields_ as the [Model node](/docs/dbt-cloud-apis/discovery-schema-job-model). The difference is that the models object can output a list so instead of querying for fields for one specific model, you can query for those parameters for all models within a jobID, database, and so on. + +When querying for `models`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx new file mode 100644 index 00000000000..924e3e87e91 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx @@ -0,0 +1,42 @@ +--- +title: "Seed object schema" +sidebar_label: "Seed" +id: "discovery-schema-job-seed" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The seed object allows you to query information about a particular seed in a given job. + +### Arguments + +When querying for a `seed`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the seed object. + +### Example query + +The example query below pulls relevant information about a given seed. For instance, you can view the load time. 
+ +```graphql +{ + job(id: 123) { + seed(uniqueId: "seed.jaffle_shop.raw_customers") { + database + schema + uniqueId + name + status + error + } + } +} +``` + +### Fields + +When querying for a `seed`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx new file mode 100644 index 00000000000..6ed45216e5f --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx @@ -0,0 +1,40 @@ +--- +title: "Seeds object schema" +sidebar_label: "Seeds" +id: "discovery-schema-job-seeds" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The seeds object allows you to query information about all seeds in a given job. + +### Arguments + +When querying for `seeds`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the seeds object. + +### Example query + +The example query below pulls relevant information about all seeds in a given job. For instance, you can view load times. + +```graphql +{ + job(id: 123) { + seeds { + uniqueId + name + executionTime + status + } + } +} +``` + +### Fields + +When querying for `seeds`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx new file mode 100644 index 00000000000..a57163e0554 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx @@ -0,0 +1,49 @@ +--- +title: "Snapshots object schema" +sidebar_label: "Snapshots" +id: "discovery-schema-job-snapshots" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The snapshots object allows you to query information about all snapshots in a given job. + +### Arguments + +When querying for `snapshots`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the snapshots object. + +### Example query + +The database, schema, and identifier arguments are optional. This means that with this endpoint you can: + +- Find a specific snapshot by providing `..` +- Find all of the snapshots in a database and/or schema by providing `` and/or `` + +#### Find snapshots information for a job + +The example query returns information about all snapshots in this job. + +```graphql +{ + job(id: 123) { + snapshots { + uniqueId + name + executionTime + environmentId + executeStartedAt + executeCompletedAt + } + } +} +``` + +### Fields + +When querying for `snapshots`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx new file mode 100644 index 00000000000..972e929f4cd --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx @@ -0,0 +1,52 @@ +--- +title: "Source object schema" +sidebar_label: "Source" +id: "discovery-schema-job-source" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The source object allows you to query information about a particular source in a given job. + +### Arguments + +When querying for a `source`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the source object. + +### Example query + +The query below pulls relevant information about a given source. 
For instance, you can view the load time and the state (pass, fail, error) of that source. + +```graphql +{ + job(id: 123) { + source(uniqueId: "source.jaffle_shop.snowplow.event") { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + } +} +``` + +### Fields + +When querying for a `source`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx new file mode 100644 index 00000000000..97f717d269a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx @@ -0,0 +1,65 @@ +--- +title: "Sources object schema" +sidebar_label: "Sources" +id: "discovery-schema-job-sources" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The sources object allows you to query information about all sources in a given job. + +### Arguments + +When querying for `sources`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the sources object. + +### Example queries + +The database, schema, and identifier arguments are optional. This means that with this endpoint you can: + +- Find a specific source by providing `..` +- Find all of the sources in a database and/or schema by providing `` and/or `` + +#### Finding sources by their database, schema, and identifier + +The example query below finds a source by its unique database, schema, and identifier. + +```graphql +{ + job(id: 123) { + sources( + database: "analytics" + schema: "analytics" + identifier: "dim_customers" + ) { + uniqueId + } + } +} +``` + +#### Finding sources by their schema + +The example query below finds all sources in this schema and their respective states (pass, error, fail). + +```graphql +{ + job(id: 123) { + sources(schema: "analytics") { + uniqueId + state + } + } +} +``` + +### Fields + +The sources object can access the _same fields_ as the [source node](/docs/dbt-cloud-apis/discovery-schema-job-source). The difference is that the sources object can output a list so instead of querying for fields for one specific source, you can query for those parameters for all sources within a jobID, database, and so on. + +When querying for `sources`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx new file mode 100644 index 00000000000..c52aa49ab93 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx @@ -0,0 +1,43 @@ +--- +title: "Test object schema" +sidebar_label: "Test" +id: "discovery-schema-job-test" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The test object allows you to query information about a particular test. + +### Arguments + +When querying for a `test`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the test object. + +### Example query + +The example query below outputs information about a test including the state of the test result. In order of severity, the result can be one of these: "error", "fail", "warn", or "pass". 
+ +```graphql +{ + job(id: 123) { + test(uniqueId: "test.internal_analytics.not_null_metrics_id") { + runId + accountId + projectId + uniqueId + name + columnName + state + } + } +} +``` + +### Fields + +When querying for a `test`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx new file mode 100644 index 00000000000..efcef674c55 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx @@ -0,0 +1,43 @@ +--- +title: "Tests object schema" +sidebar_label: "Tests" +id: "discovery-schema-job-tests" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The tests object allows you to query information about all tests in a given job. + +### Arguments + +When querying for `tests`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the tests object. + +### Example query + +The example query below finds all tests in this job and includes information about those tests. + +```graphql +{ + job(id: 123) { + tests { + runId + accountId + projectId + uniqueId + name + columnName + state + } + } +} +``` + +### Fields + +When querying for `tests`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx new file mode 100644 index 00000000000..bb30786e19d --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx @@ -0,0 +1,62 @@ +--- +title: "Job object schema" +sidebar_label: "Job" +id: "discovery-schema-job" +--- + +import { QueryArgsTable, SchemaTable } from "./schema"; + +The job object allows you to query information about a particular model based on `jobId` and, optionally, a `runId`. + +If you don't provide a `runId`, the API returns information on the latest runId of a job. + +The [example query](#example-query) illustrates a few fields you can query in this `job` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `job`, you can use the following arguments. + + + + +### Example Query + +You can use your production job's `id`. + +```graphql +query JobQueryExample { + # Provide runId for looking at specific run, otherwise it defaults to latest run + job(id: 940) { + # Get all models from this job's latest run + models(schema: "analytics") { + uniqueId + executionTime + } + + # Or query a single node + source(uniqueId: "source.jaffle_shop.snowplow.event") { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + } +} +``` + +### Fields +When querying an `job`, you can use the following fields. + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-metric.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-metric.mdx deleted file mode 100644 index 2280c6f7802..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-metric.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Metric object schema" -sidebar_label: "Metric" -id: "discovery-schema-metric" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The metric object allows you to query information about [metrics](/docs/build/metrics). 
- -### Arguments - -When querying for a `metric`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this metric object. - -### Example Queries -#### Metric information - -The example query below outputs information about a metric. Note that you can also add any field from the Model endpoint -- here we are simply selecting name. This includes schema, database, uniqueId, columns and more -- find documentation [here](/docs/dbt-cloud-apis/discovery-schema-model). - - -```graphql -{ - metric(jobId: 123, uniqueId: "metric.jaffle_shop.new_customers") { - uniqueId - name - packageName - tags - label - runId - description - type - sql - timestamp - timeGrains - dimensions - meta - resourceType - filters { - field - operator - value - } - model { - name - } - } -} - -``` - -### Fields -When querying for a `metric`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-metrics.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-metrics.mdx deleted file mode 100644 index 5242eb717dc..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-metrics.mdx +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: "Metrics object schema" -sidebar_label: "Metrics" -id: "discovery-schema-metrics" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The metrics object allows you to query information about [metrics](/docs/build/metrics). - -### Arguments - -When querying for `metrics`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this metrics object. - -### Example Queries -#### Metrics information - -The example query returns information about all metrics in this job. - -```graphql -{ - metrics(jobId: 123) { - uniqueId - name - packageName - tags - label - runId - description - type - sql - timestamp - timeGrains - dimensions - meta - resourceType - filters { - field - operator - value - } - model { - name - } - } -} - -``` - -### Fields -metrics has access to the *same fields* as the [metric node](/docs/dbt-cloud-apis/discovery-schema-metric). The difference is that metrics can output a list, so instead of querying for fields for one specific metric, you can query for those parameters for all metrics in a run. - -When querying for `metrics`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-model.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-model.mdx deleted file mode 100644 index 3fb43edaded..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-model.mdx +++ /dev/null @@ -1,84 +0,0 @@ ---- -title: "Model object schema" -sidebar_label: "Model" -id: "discovery-schema-model" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The model object allows you to query information about a particular model in a given job. - -### Arguments - -When querying for a `model`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this model object. 
- -### Example Queries -#### Finding parent models and sources - -The example query below uses the `parentsModels` and `parentsSources` fields to fetch information about a model’s parent models and parent sources. Note that we put a placeholder jobID and uniqueID, which you will have to replace. - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - parentsModels { - runId - uniqueId - executionTime - } - parentsSources { - runId - uniqueId - state - } - } -} -``` - -#### Model Timing - -The example query below could be useful if we wanted to understand information around execution timing on a given model (start, end, completion). - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - runId - projectId - name - uniqueId - resourceType - executeStartedAt - executeCompletedAt - executionTime - } -} -``` - -#### Column-level information - -You can use the following example query to understand more about the columns of a given model. Note that this will only work if the job has generated documentation. For example it will work with the command `dbt docs generate`. - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - columns{ - name - index - type - comment - description - tags - meta - } - } -} -``` - - -### Fields -When querying for a `model`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-modelByEnv.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-modelByEnv.mdx deleted file mode 100644 index 078d2512256..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-modelByEnv.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Model by environment object schema" -sidebar_label: "Model by environment" -id: "discovery-schema-modelByEnv" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - - -This model by environment object allows you to query information about a particular model based on `environmentId`. - -The [example query](#example-query) illustrates a few fields you can query in this `modelByEnvironment` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. - -### Arguments - -When querying for `modelByEnvironment`, you can use the following arguments. - - - - -### Example Query - -You can use the `environment_id` and `model_unique_id` to return the model and its execution time for the last 20 times it was run, regardless of which job ran it. - - -```graphql -query{ - modelByEnvironment(environmentId: 834, uniqueId: "model.marketing.customers", lastRunCount: 20) { - runId, # Get historical results for a particular model - runGeneratedAt, - executionTime, # View build time across runs - status, - tests { name, status, executeCompletedAt } # View test results across runs - } -} -``` - -### Fields -When querying for `modelByEnvironment`, you can use the following fields. - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-models.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-models.mdx deleted file mode 100644 index a3215eee039..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-models.mdx +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Models object schema" -sidebar_label: "Models" -id: "discovery-schema-models" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - -The models object allows you to query information about all models in a given job. - -### Arguments - -When querying for `models`, the following arguments are available. 
Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this models object. - -### Example Queries -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific model by providing `..` -- Find all of the models in a database and/or schema by providing `` and/or `` - -#### Finding models by their database, schema, and identifier -The example query below finds a model by its unique database, schema, and identifier. - -```graphql -{ - models(jobId: 123, database:"analytics", schema: "analytics", identifier:"dim_customers") { - uniqueId - } -} -``` - -#### Finding models by their schema -The example query below finds all models in this schema, and their respective execution times. - -```graphql -{ - models(jobId: 123, schema: "analytics") { - uniqueId - executionTime - } -} -``` - - -### Fields -Models has access to the *same fields* as the [Model node](/docs/dbt-cloud-apis/discovery-schema-model). The difference is that Models can output a list, so instead of querying for fields for one specific model, you can query for those parameters for all models within a jobID, database, etc. - -When querying for `models`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-seed.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-seed.mdx deleted file mode 100644 index 1047545a8be..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-seed.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Seed object schema" -sidebar_label: "Seed" -id: "discovery-schema-seed" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The seed object allows you to query information about a particular seed in a given job. - -### Arguments - -When querying for a `seed`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this seed object. - -### Example Queries - -#### Seed information - -The query below pulls relevant information about a given seed. For example, we could see the load time. - -```graphql -{ - seed(jobId: 123, uniqueId: "seed.jaffle_shop.raw_customers") { - database - schema - uniqueId - name - status - error - } -} -``` - -### Fields - -When querying for a `seed`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-seeds.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-seeds.mdx deleted file mode 100644 index 2cee2b8aa3f..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-seeds.mdx +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: "Seeds object schema" -sidebar_label: "Seeds" -id: "discovery-schema-seeds" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The seeds object allows you to query information about a all seeds in a given job. - -### Arguments - -When querying for `seeds`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this seeds object. - -### Example Queries -#### Seeds information - -The query below pulls relevant information about all seeds in a given job. For example, we could see the load times. 
- -```graphql -{ - seeds(jobId: 123) { - uniqueId - name - executionTime - status - } -} -``` - -### Fields - -When querying for `seeds`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-snapshots.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-snapshots.mdx deleted file mode 100644 index b3f7071319f..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-snapshots.mdx +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: "Snapshots object schema" -sidebar_label: "Snapshots" -id: "discovery-schema-snapshots" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The snapshots object allows you to query information about all snapshots in a given job. - -### Arguments - -When querying for `snapshots`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this snapshots object. - -### Example Query -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific snapshot by providing `..` -- Find all of the snapshots in a database and/or schema by providing `` and/or `` - -#### Finding snapshots information for a job -The example query returns information about all snapshots in this job. - -```graphql -{ - snapshots(jobId: 123) { - uniqueId - name - executionTime - environmentId - executeStartedAt - executeCompletedAt - } -} - -``` - -### Fields -Snapshots has access to the *same fields* as the [Snapshot node](/docs/dbt-cloud-apis/discovery-schema-snapshots). The difference is that Snapshots can output a list, so instead of querying for fields for one specific snapshot, you can query for those parameters for all snapshots within a jobID, database, etc. - -When querying for `snapshots`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-source.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-source.mdx deleted file mode 100644 index 87d776282fe..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-source.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Source object schema" -sidebar_label: "Source" -id: "discovery-schema-source" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The source object allows you to query information about a particular source in a given job. - -### Arguments - -When querying for a `source`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this source object. - -### Example Queries - -#### Source information - -The query below pulls relevant information about a given source. For example, we could see the load time and the state (“pass”, “fail”, “error”) of that source. 
- -```graphql -{ - source(jobId: 123, uniqueId: "source.jaffle_shop.snowplow.event") { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } -} -``` - -### Fields - -When querying for a `source`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-sources.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-sources.mdx deleted file mode 100644 index a719c5caf92..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-sources.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: "Sources object schema" -sidebar_label: "Sources" -id: "discovery-schema-sources" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - -The sources object allows you to query information about all sources in a given job. - -### Arguments - -When querying for `sources`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this sources object. - -### Example Queries -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific source by providing `..` -- Find all of the sources in a database and/or schema by providing `` and/or `` - -#### Finding sources by their database, schema, and identifier -The example query below finds a source by its unique database, schema, and identifier. - -```graphql -{ - sources(jobId: 123, database:"analytics", schema: "analytics", identifier:"dim_customers") { - uniqueId - } -} -``` - -#### Finding sources by their schema -The example query below finds all sources in this schema, and their respective states (pass, error, fail). - -```graphql -{ - sources(jobId: 123, schema: "analytics") { - uniqueId - state - } -} -``` - -### Fields -Sources has access to the *same fields* as the [Source node](/docs/dbt-cloud-apis/discovery-schema-source). The difference is that Sources can output a list, so instead of querying for fields for one specific source, you can query for those parameters for all sources within a jobID, database, etc. - -When querying for `sources`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-test.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-test.mdx deleted file mode 100644 index 2ee915d27c7..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-test.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Test object schema" -sidebar_label: "Test" -id: "discovery-schema-test" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The test object allows you to query information about a particular test. - -### Arguments - -When querying for a `test`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this test object. - -### Example Queries -#### Test result - -The example query below outputs information about a test, including the state of the test result. This can be one of, in order of severity, "error", "fail", "warn", "pass." 
- -```graphql -{ - test(jobId: 123, uniqueId: "test.internal_analytics.not_null_metrics_id") { - runId - accountId - projectId - uniqueId - name - columnName - state - } -} -``` - -### Fields -When querying for a `test`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-tests.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-tests.mdx deleted file mode 100644 index 7f087c85fee..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-discovery-tests.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: "Tests object schema" -sidebar_label: "Tests" -id: "discovery-schema-tests" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The tests object allows you to query information about all tests in a given job. - - -### Arguments - -When querying for `tests`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this tests object. - -### Example Queries -#### Tests result - -The example query below finds all tests in this job, and includes information about those tests. - -```graphql -{ - tests(jobId: 123) { - runId - accountId - projectId - uniqueId - name - columnName - state - } -} -``` - -### Fields -When querying for `tests`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema.jsx b/website/docs/docs/dbt-cloud-apis/schema.jsx index 8b9bbc358f0..31568671573 100644 --- a/website/docs/docs/dbt-cloud-apis/schema.jsx +++ b/website/docs/docs/dbt-cloud-apis/schema.jsx @@ -1,9 +1,55 @@ -import React, { setState } from "react"; +import React from "react"; import { useState, useEffect } from 'react' -const queriesQuery = `{ + +const getTypeString = (typeStructure) => { + // Helper function to represent GraphQL type + if (!typeStructure) return '' + + if (typeStructure.kind === 'NON_NULL') { + return `${getTypeString(typeStructure.ofType)}!`; + } else if (typeStructure.kind === 'LIST') { + return `[${getTypeString(typeStructure.ofType)}]`; + } else if (['OBJECT', 'SCALAR', 'ENUM'].includes(typeStructure.kind)) { + return `${typeStructure.name}${getTypeString(typeStructure.ofType)}`; + } else { + return ''; + } +} + +export const ArgsTable = ({ data, name }) => { + return ( + + + + + + + + + + + {data.fields.find(d => d.name === name).args.map(function ({ name, description, type }) { + return ( + + + + + + + ) + })} + +
+ <th>Field</th> + <th>Type</th> + <th>Required?</th> + <th>Description</th>
+ <td>{name}</td> + <td>{getTypeString(type)}</td> + <td>{type.kind === 'NON_NULL' ? `Yes` : `No`}</td> + <td>{description || `No description provided`}</td>
    + ) +} + +const metadataUrl = 'https://metadata.cloud.getdbt.com/graphql' +const metadataBetaUrl = 'https://metadata.cloud.getdbt.com/beta/graphql' + +const queryArgsQuery = `{ __schema { queryType { - fields { + fields(includeDeprecated: true) { name type { name @@ -18,23 +64,22 @@ const queriesQuery = `{ name description kind - ofType { name description } + ofType { kind name description } } } } } } }` -const metadataUrl = 'https://metadata.cloud.getdbt.com/graphql' -const metadataBetaUrl = 'https://metadata.cloud.getdbt.com/beta/graphql' -export const ArgsTable = ({ queryName, useBetaAPI }) => { + +export const QueryArgsTable = ({ queryName, useBetaAPI }) => { const [data, setData] = useState(null) useEffect(() => { const fetchData = () => { fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query: queriesQuery }), + body: JSON.stringify({ query: queryArgsQuery }), }) .then((result) => result.json()) .then((data) => setData(data)) @@ -45,33 +90,89 @@ export const ArgsTable = ({ queryName, useBetaAPI }) => { return

    Fetching data...

    } return ( - - - - - - - - - - - {data.data.__schema.queryType.fields.find(d => d.name === queryName).args.map(function ({ name, description, type }) { - return ( - - - {type.ofType ? - : - + + ) +} + +export const NodeArgsTable = ({ parent, name, useBetaAPI }) => { + const [data, setData] = useState(null) + useEffect(() => { + const fetchData = () => { + fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query: ` + query { + __type(name: "${parent}") { + ...FullType + } + } + + fragment FullType on __Type { + kind + fields(includeDeprecated: true) { + name + description + args { + name + description + defaultValue + type { + ...TypeRef + } } - - - - ) - })} - -
- <th>Field</th> - <th>Type</th> - <th>Required?</th> - <th>Description</th>
- <td>{name}</td> - {type.ofType ? <td>{type.ofType.name}</td> : <td>{type.name}</td>} - <td>{type.kind === 'NON_NULL' ? `Yes` : `No`}</td> - <td>{description || `No description provided`}</td>
    + } + } + + # get several levels + fragment TypeRef on __Type { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + } + } + } + } + } + } + } + } + `}) + }) + .then((result) => result.json()) + .then((data) => setData(data)) + } + fetchData() + }, []) + if (!data) { + return

    Fetching data...

    + } + return ( + ) } + export const SchemaTable = ({ nodeName, useBetaAPI }) => { const [data, setData] = useState(null) useEffect(() => { @@ -80,27 +181,60 @@ export const SchemaTable = ({ nodeName, useBetaAPI }) => { method: "POST", headers: { "Content-Type": "application/json" }, body: JSON.stringify({ - query: `{ - __type(name: "${nodeName}") { - fields { + query: ` + query { + __type(name: "${nodeName}") { + ...FullType + } + } + + fragment FullType on __Type { + kind + name + description + fields(includeDeprecated: true) { name description - type { - name - description - kind - ofType { - name - description - ofType { - name - description - } - } + type { + ...TypeRef } } } - }`}), + + # get several levels + fragment TypeRef on __Type { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + } + } + } + } + } + } + } + } + `}), }) .then((result) => result.json()) .then((data) => setData(data)) @@ -124,13 +258,7 @@ export const SchemaTable = ({ nodeName, useBetaAPI }) => { return ( {name} - {type.kind === 'LIST' ? - [{type.ofType.ofType ? type.ofType.ofType.name : type.ofType.name}] : - (type.ofType ? - {type.ofType.name} : - {type.name} - ) - } + {getTypeString(type)} {description} ) @@ -138,4 +266,4 @@ export const SchemaTable = ({ nodeName, useBetaAPI }) => { ) -} \ No newline at end of file +} diff --git a/website/docs/docs/dbt-cloud-apis/service-tokens.md b/website/docs/docs/dbt-cloud-apis/service-tokens.md index 811bfaea29d..9553f48a013 100644 --- a/website/docs/docs/dbt-cloud-apis/service-tokens.md +++ b/website/docs/docs/dbt-cloud-apis/service-tokens.md @@ -9,8 +9,6 @@ If you have service tokens created on or before July 18, 2023, please read [this ::: -## About service tokens - Service account tokens enable you to securely authenticate with the dbt Cloud API by assigning each token a narrow set of permissions that more precisely manages access to the API. While similar to [User API tokens](user-tokens), service account tokens belong to an account rather than a user. You can use service account tokens for system-level integrations that do not run on behalf of any one user. Assign any permission sets available in dbt Cloud to your service account token, which can vary slightly depending on your plan: @@ -20,9 +18,9 @@ You can use service account tokens for system-level integrations that do not run You can assign as many permission sets as needed to one token. For more on permissions sets, see "[Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions)." -## Generating service account tokens +## Generate service account tokens -To make a service token in dbt Cloud, follow these steps: +You can generate service tokens if you have a Developer [license](/docs/cloud/manage-access/seats-and-users) and account admin [permissions](/docs/cloud/manage-access/about-user-access#permission-sets). To create a service token in dbt Cloud, follow these steps: 1. Open the **Account Settings** page by clicking the gear icon on the right-hand side. 2. On the left sidebar, click on **Service Tokens**. @@ -43,6 +41,9 @@ Account Admin service tokens have full `read + write` access to an account, so p **Metadata Only**
    Metadata-only service tokens authorize requests to the Discovery API. +**Semantic Layer Only**
    +Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. + **Job Admin**
    Job admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. @@ -68,6 +69,9 @@ Billing Admin service tokens have certain account-level permissions. For more o **Metadata Only**
    Metadata-only service tokens authorize requests to the Discovery API. +**Semantic Layer Only**
    +Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. + **Job Admin**
    Job Admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. For more on these permissions, see [Job Admin](/docs/cloud/manage-access/enterprise-permissions#job-admin). diff --git a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md new file mode 100644 index 00000000000..42416765904 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md @@ -0,0 +1,62 @@ +--- +title: "Semantic Layer APIs" +id: sl-api-overview +description: "Integrate and query metrics and dimensions in downstream tools using the Semantic Layer APIs" +tags: [Semantic Layer, API] +hide_table_of_contents: true +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +The rapid growth of different tools in the modern data stack has helped data professionals address the diverse needs of different teams. The downside of this growth is the fragmentation of business logic across teams, tools, and workloads. + +The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) allows you to define metrics in code (with [MetricFlow](/docs/build/about-metricflow)) and dynamically generate and query datasets in downstream tools based on their dbt governed assets, such as metrics and models. Integrating with the dbt Semantic Layer will help organizations that use your product make more efficient and trustworthy decisions with their data. It also helps you to avoid duplicative coding, optimize development workflow, ensure data governance, and guarantee consistency for data consumers. + +You can use the dbt Semantic Layer for a variety of tools and applications of data. Some common use cases are: + +* Business intelligence (BI), reporting, and analytics +* Data quality and monitoring +* Governance and privacy +* Data discovery and cataloging +* Machine learning and data science + + + +import Features from '/snippets/_sl-plan-info.md' + + + +
    + + diff --git a/website/docs/docs/dbt-cloud-apis/sl-graphql.md b/website/docs/docs/dbt-cloud-apis/sl-graphql.md new file mode 100644 index 00000000000..3e06df69f76 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-graphql.md @@ -0,0 +1,462 @@ +--- +title: "GraphQL" +id: sl-graphql +description: "Integrate and use the GraphQL API to query your metrics." +tags: [Semantic Layer, APIs] +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + + +[GraphQL](https://graphql.org/) (GQL) is an open-source query language for APIs. It offers a more efficient and flexible approach compared to traditional RESTful APIs. + +With GraphQL, users can request specific data using a single query, reducing the need for many server round trips. This improves performance and minimizes network overhead. + +GraphQL has several advantages, such as self-documenting, having a strong typing system, supporting versioning and evolution, enabling rapid development, and having a robust ecosystem. These features make GraphQL a powerful choice for APIs prioritizing flexibility, performance, and developer productivity. + +## dbt Semantic Layer GraphQL API + +The dbt Semantic Layer GraphQL API allows you to explore and query metrics and dimensions. Due to its self-documenting nature, you can explore the calls conveniently through the [schema explorer](https://semantic-layer.cloud.getdbt.com/api/graphql). + +dbt Partners can use the Semantic Layer GraphQL API to build an integration with the dbt Semantic Layer. + +## Requirements to use the GraphQL API +- A dbt Cloud project on dbt v1.6 or higher +- Metrics are defined and configured +- A dbt Cloud [service token](/docs/dbt-cloud-apis/service-tokens) with "Semantic Layer Only” and "Metadata Only" permissions +- Your dbt project is configured and connected to a data platform + + +## Using the GraphQL API + +If you're a dbt user or partner with access to dbt Cloud and the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), you can [setup](/docs/use-dbt-semantic-layer/setup-sl) and test this API with data from your own instance by configuring the Semantic Layer and obtaining the right GQL connection parameters described in this document. + +Refer to [Get started with the dbt Semantic Layer](docs/use-dbt-semantic-layer/quickstart-sl) for more info. + + +### Authentication + +Authentication uses a dbt Cloud [service account tokens](/docs/dbt-cloud-apis/service-tokens) passed through a header as follows. To explore the schema, you can enter this information in the "header" section. + +``` +{"Authorization": "Bearer "} +``` + +Each GQL request also requires a dbt Cloud `environmentId`. The API uses both the service token in the header and environmentId for authentication. + +### Metadata calls + +**Fetch data platform dialect** + +In some cases in your application, it may be useful to know the dialect or data platform that's internally used for the dbt Semantic Layer connection (such as if you are building `where` filters from a user interface rather than user-inputted SQL). + +The GraphQL API has an easy way to fetch this with the following query: + +```graphql +{ + environmentInfo(environmentId: BigInt!) { + dialect + } +} +``` + +**Fetch available metrics** + +```graphql +metrics(environmentId: BigInt!): [Metric!]! +``` + +**Fetch available dimensions for metrics** + +```graphql +dimensions( + environmentId: BigInt! + metrics: [MetricInput!]! +): [Dimension!]! 
+``` + +**Fetch available granularities given metrics** + +Note: This call for `queryableGranularities` returns only queryable granularities for metric time - the primary time dimension across all metrics selected. + +```graphql +queryableGranularities( + environmentId: BigInt! + metrics: [MetricInput!]! +): [TimeGranularity!]! +``` + +You can also get queryable granularities for all other dimensions using the `dimensions` call: + +```graphql +{ + dimensions(environmentId: BigInt!, metrics:[{name:"order_total"}]) { + name + queryableGranularities # --> ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"] + } +} +``` + +You can also optionally access it from the metrics endpoint: + +```graphql +{ + metrics(environmentId: BigInt!) { + name + dimensions { + name + queryableGranularities + } + } +} +``` + +**Fetch measures** + +```graphql +{ + measures(environmentId: BigInt!, metrics: [{name:"order_total"}]) { + name + aggTimeDimension + } +} +``` + +`aggTimeDimension` tells you the name of the dimension that maps to `metric_time` for a given measure. You can also query `measures` from the `metrics` endpoint, which allows you to see what dimensions map to `metric_time` for a given metric: + +```graphql +{ + metrics(environmentId: BigInt!) { + measures { + name + aggTimeDimension + } + } +} +``` + +**Fetch available metrics given a set of dimensions** + +```graphql +metricsForDimensions( + environmentId: BigInt! + dimensions: [GroupByInput!]! +): [Metric!]! +``` + +**Create Dimension Values query** + +```graphql + +mutation createDimensionValuesQuery( + environmentId: BigInt! + metrics: [MetricInput!] + groupBy: [GroupByInput!]! +): CreateDimensionValuesQueryResult! + +``` + +**Create Metric query** + +```graphql +createQuery( + environmentId: BigInt! + metrics: [MetricInput!]! + groupBy: [GroupByInput!] = null + limit: Int = null + where: [WhereInput!] = null + order: [OrderByInput!] = null +): CreateQueryResult +``` + +```graphql +MetricInput { + name: String! +} + +GroupByInput { + name: String! + grain: TimeGranularity = null +} + +WhereInput { + sql: String! +} + +OrderByInput { # -- pass one and only one of metric or groupBy + metric: MetricInput = null + groupBy: GroupByInput = null + descending: Boolean! = false +} +``` + +**Fetch query result** + +```graphql +query( + environmentId: BigInt! + queryId: String! +): QueryResult! +``` + +**Metric Types** + +```graphql +Metric { + name: String! + description: String + type: MetricType! + typeParams: MetricTypeParams! + filter: WhereFilter + dimensions: [Dimension!]! + queryableGranularities: [TimeGranularity!]! +} +``` + +``` +MetricType = [SIMPLE, RATIO, CUMULATIVE, DERIVED] +``` + +**Metric Type parameters** + +```graphql +MetricTypeParams { + measure: MetricInputMeasure + inputMeasures: [MetricInputMeasure!]! + numerator: MetricInput + denominator: MetricInput + expr: String + window: MetricTimeWindow + grainToDate: TimeGranularity + metrics: [MetricInput!] +} +``` + + +**Dimension Types** + +```graphql +Dimension { + name: String! + description: String + type: DimensionType! + typeParams: DimensionTypeParams + isPartition: Boolean! + expr: String + queryableGranularities: [TimeGranularity!]! +} +``` + +``` +DimensionType = [CATEGORICAL, TIME] +``` + +### Create Query examples + +The following section provides query examples for the GraphQL API, such as how to query metrics, dimensions, where filters, and more. + +**Query two metrics grouped by time** + +```graphql +mutation { + createQuery( + environmentId: BigInt!
+ metrics: [{name: "food_order_amount"}] + groupBy: [{name: "metric_time"}, {name: "customer__customer_type"}] + ) { + queryId + } +} +``` + +**Query with a time grain** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name: "order_total"}] + groupBy: [{name: "metric_time", grain: "month"}] + ) { + queryId + } +} +``` + +Note that when using granularity in the query, the output of a time dimension with a time grain applied to it always takes the form of a dimension name appended with a double underscore and the granularity level - `{time_dimension_name}__{DAY|WEEK|MONTH|QUARTER|YEAR}`. Even if no granularity is specified, the output will still have a granularity appended and will default to the lowest available (usually daily for most data sources). We encourage you to specify a granularity when using time dimensions so that there are no unexpected results in the output data. + +**Query two metrics with a categorical dimension** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name: "food_order_amount"}, {name: "order_gross_profit"}] + groupBy: [{name: "metric_time", grain: "month"}, {name: "customer__customer_type"}] + ) { + queryId + } +} +``` + +**Query with a where filter** + +The `where` filter takes a list argument (or a string for a single input). Depending on the object you are filtering, there are a couple of parameters: + +- `Dimension()` — Used for any categorical or time dimensions. If used for a time dimension, granularity is required. For example, `Dimension('metric_time').grain('week')` or `Dimension('customer__country')`. + +- `Entity()` — Used for entities like primary and foreign keys, such as `Entity('order_id')`. + +Note: If you prefer a more strongly typed `where` clause, you can optionally use `TimeDimension()` to separate out categorical dimensions from time ones. The `TimeDimension` input takes the time dimension name and also requires granularity. For example, `TimeDimension('metric_time', 'MONTH')`. + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics:[{name: "order_total"}] + groupBy:[{name: "customer__customer_type"}, {name: "metric_time", grain: "month"}] + where:[{sql: "{{ Dimension('customer__customer_type') }} = 'new'"}, {sql:"{{ Dimension('metric_time').grain('month') }} > '2022-10-01'"}] + ) { + queryId + } +} +``` + +**Query with Order** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name: "order_total"}] + groupBy: [{name: "metric_time", grain: "month"}] + orderBy: [{metric: {name: "order_total"}}, {groupBy: {name: "metric_time", grain: "month"}, descending:true}] + ) { + queryId + } +} +``` + + +**Query with Limit** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name:"food_order_amount"}, {name: "order_gross_profit"}] + groupBy: [{name:"metric_time", grain: "month"}, {name: "customer__customer_type"}] + limit: 10 + ) { + queryId + } +} +``` + +**Query with Explain** + +This takes the same inputs as the `createQuery` mutation. + +```graphql +mutation { + compileSql( + environmentId: BigInt! + metrics: [{name:"food_order_amount"}, {name:"order_gross_profit"}] + groupBy: [{name:"metric_time", grain:"month"}, {name:"customer__customer_type"}] + ) { + sql + } +} +``` + +### Output format and pagination + +**Output format** + +By default, the output is in Arrow format. You can switch to JSON format using the following parameter.
However, due to performance limitations, we recommend using the JSON parameter only for testing and validation. The JSON received is a base64 encoded string. To access it, you can decode it using a base64 decoder. The JSON is created from pandas, which means you can change it back to a dataframe using `pandas.read_json(json, orient="table")`. Or you can work with the data directly using `json["data"]`, and find the table schema using `json["schema"]["fields"]`. Alternatively, you can pass `encoded:false` to the jsonResult field to get a raw JSON string directly. + + +```graphql +{ + query(environmentId: BigInt!, queryId: Int!, pageNum: Int! = 1) { + sql + status + error + totalPages + arrowResult + jsonResult(orient: PandasJsonOrient! = TABLE, encoded: Boolean! = true) + } +} +``` + +The results default to the `table` orientation, but you can change it to any [pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) supported value. + +**Pagination** + +By default, we return 1024 rows per page. If your result set exceeds this, you need to increase the page number using the `pageNum` option. + +### Run a Python query + +The `arrowResult` in the GraphQL query response is a byte dump, which isn't visually useful. You can convert this byte data into an Arrow table using any Arrow-supported language. Refer to the following Python example explaining how to query and decode the arrow result: + + +```python +import base64 +import requests +import pyarrow as pa + +headers = {"Authorization": "Bearer <SERVICE_TOKEN>"} +query_result_request = """ +{ + query(environmentId: 70, queryId: "12345678") { + sql + status + error + arrowResult + } +} +""" + +gql_response = requests.post( + "http://localhost:8000/graphql", + json={"query": query_result_request}, + headers=headers, +) + +""" +gql_response.json() => +{ + "data": { + "query": { + "sql": "SELECT\n ordered_at AS metric_time__day\n , SUM(order_total) AS order_total\nFROM semantic_layer.orders orders_src_1\nGROUP BY\n ordered_at", + "status": "SUCCESSFUL", + "error": null, + "arrowResult": "arrow-byte-data" + } + } +} +""" + +def to_arrow_table(byte_string: str) -> pa.Table: + """Get a raw base64 string and convert to an Arrow Table.""" + with pa.ipc.open_stream(base64.b64decode(byte_string)) as reader: + return pa.Table.from_batches(reader, reader.schema) + + +arrow_table = to_arrow_table(gql_response.json()["data"]["query"]["arrowResult"]) + +# Perform whatever functionality is available, like convert to a pandas table. +print(arrow_table.to_pandas()) +""" +order_total ordered_at + 3 2023-08-07 + 112 2023-08-08 + 12 2023-08-09 + 5123 2023-08-10 +""" +``` diff --git a/website/docs/docs/dbt-cloud-apis/sl-jdbc.md b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md new file mode 100644 index 00000000000..02d26229794 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md @@ -0,0 +1,316 @@ +--- +title: "JDBC" +id: sl-jdbc +description: "Integrate and use the JDBC API to query your metrics." +tags: [Semantic Layer, API] +--- + + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +The dbt Semantic Layer Java Database Connectivity (JDBC) API enables users to query metrics and dimensions using the JDBC protocol, while also providing standard metadata functionality. + +A JDBC driver is a software component enabling a Java application to interact with a data platform. Here's some more information about our JDBC API: + +- The Semantic Layer JDBC API utilizes the open-source JDBC driver with ArrowFlight SQL protocol.
+- You can download the JDBC driver from [Maven](https://search.maven.org/remotecontent?filepath=org/apache/arrow/flight-sql-jdbc-driver/12.0.0/flight-sql-jdbc-driver-12.0.0.jar). +- The dbt Semantic Layer supports ArrowFlight SQL driver version 12.0.0 and higher. +- You can embed the driver into your application stack as needed, and you can use dbt Labs' [example project](https://github.com/dbt-labs/example-semantic-layer-clients) for reference. +- If you're a partner or user building a homegrown application, you'll need to install an AWS root CA to the Java TrustStore ([documentation](https://www.amazontrust.com/repository/)); this is specific to Java and JDBC calls. + +dbt Labs partners can use the JDBC API to build integrations in their tools with the dbt Semantic Layer. + +## Using the JDBC API + +If you are a dbt user or partner with access to dbt Cloud and the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), you can [set up](/docs/use-dbt-semantic-layer/setup-sl) and test this API with data from your own instance by configuring the Semantic Layer and obtaining the right JDBC connection parameters described in this document. + +You *may* be able to use our JDBC API with tools that do not have an official integration with the dbt Semantic Layer. If the tool you use allows you to write SQL and either supports a generic JDBC driver option (such as DataGrip) or supports Dremio and uses ArrowFlightSQL driver version 12.0.0 or higher, you can access the Semantic Layer API. + +Refer to [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) for more info. + +## Authentication + +dbt Cloud authorizes requests to the dbt Semantic Layer API. You need to provide an environment ID, host, and a [service account token](/docs/dbt-cloud-apis/service-tokens). + +## Connection parameters + +The JDBC connection requires a few different connection parameters. + +This is an example of a URL connection string and the individual components: + +``` +jdbc:arrow-flight-sql://semantic-layer.cloud.getdbt.com:443?&environmentId=202339&token=SERVICE_TOKEN +``` + +| JDBC parameter | Description | Example | +| -------------- | ----------- | ------- | +| `jdbc:arrow-flight-sql://` | The protocol for the JDBC driver. | `jdbc:arrow-flight-sql://` | +| `semantic-layer.cloud.getdbt.com` | The [access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your account's dbt Cloud region. You must always add the `semantic-layer` prefix before the access URL. | For dbt Cloud deployments hosted in North America, use `semantic-layer.cloud.getdbt.com` | +| `environmentId` | The unique identifier for the dbt production environment. You can retrieve this from the dbt Cloud URL
    when you navigate to **Environments** under **Deploy**. | If your URL ends with `.../environments/222222`, your `environmentId` is `222222`

| +| `SERVICE_TOKEN` | dbt Cloud [service token](/docs/dbt-cloud-apis/service-tokens) with "Semantic Layer Only" and "Metadata Only" permissions. Create a new service token on the **Account Settings** page. | `token=SERVICE_TOKEN` | + +*Note* — If you're testing locally on a tool like DataGrip, you may also have to provide the following parameter at the end or beginning of the JDBC URL: `&disableCertificateVerification=true`. + +## Querying the API for metric metadata + +The Semantic Layer JDBC API has built-in metadata calls that provide information about your metrics and dimensions. Here are some metadata commands and examples: + + + + + +Use this query to fetch all defined metrics in your dbt project: + +```bash +select * from {{ + semantic_layer.metrics() +}} +``` + + + + +Use this query to fetch all dimensions for a metric. + +Note, `metrics` is a required argument that lists one or multiple metrics. + +```bash +select * from {{ + semantic_layer.dimensions(metrics=['food_order_amount'])}} +``` + + + + + +Use this query to fetch dimension values for one or multiple metrics and a single dimension. + +Note, `metrics` is a required argument that lists one or multiple metrics, and a single dimension. + +```bash +select * from {{ +semantic_layer.dimension_values(metrics=['food_order_amount'], group_by=['customer__customer_name'])}} +``` + + + + + +Use this query to fetch queryable granularities for a list of metrics. This API request only returns the time granularities that make sense for the primary time dimension of the metrics (such as `metric_time`). If you want queryable granularities for other time dimensions, use the `dimensions()` call and look at the `queryable_granularities` column. + +Note, `metrics` is a required argument that lists one or multiple metrics. + +```bash +select * from {{ + semantic_layer.queryable_granularities(metrics=['food_order_amount', 'order_gross_profit'])}} +``` + + + + + + +Use this query to fetch available metrics given dimensions. This command is essentially the opposite of getting dimensions given a list of metrics. + +Note, `group_by` is a required argument that lists one or multiple dimensions. + +```bash +select * from {{ + semantic_layer.metrics_for_dimensions(group_by=['customer__customer_type']) + +}} +``` + + + + + +Use this example query to fetch available granularities for all time dimensions (the similar queryable granularities API call only returns granularities for the primary time dimensions for metrics). The following call is a derivative of the `dimensions()` call and specifically selects the granularities field. + +```bash +select NAME, QUERYABLE_GRANULARITIES from {{ + semantic_layer.dimensions( + metrics=["order_total"] + ) +}} + +``` + + + + + +It may be useful in your application to expose the names of the time dimensions that represent `metric_time` or the common thread across all metrics. +You can first query the `metrics()` argument to fetch a list of measures, then use the `measures()` call, which returns the name(s) of the time dimensions that make up metric time.
+ +```bash +select * from {{ + semantic_layer.measures(metrics=['orders']) +}} +``` + + + +## Querying the API for metric values + +To query metric values, the following parameters are available: + +| Parameter | Description | Example | Type | | --------- | -----------| ------------ | -------------------- | | `metrics` | The metric name as defined in your dbt metric configuration | `metrics=['revenue']` | Required | | `group_by` | Dimension names or entities to group by. We require a reference to the entity of the dimension (other than for the primary time dimension), which is pre-appended to the front of the dimension name with a double underscore. | `group_by=['user__country', 'metric_time']` | Optional | | `grain` | A parameter specific to any time dimension that changes the grain of the data from the default for the metric. | `group_by=[Dimension('metric_time')`
`grain('week\|day\|month\|quarter\|year')]` | Optional | | `where` | A where clause that allows you to filter on dimensions and entities using parameters - comes with `TimeDimension`, `Dimension`, and `Entity` objects. Granularity is required with `TimeDimension` | `where="{{ Dimension('customer__country') }} = 'US'"` | Optional | | `limit` | Limit the data returned | `limit=10` | Optional | |`order` | Order the data returned | `order_by=['-order_gross_profit']` (remove `-` for ascending order) | Optional | | `compile` | If true, returns generated SQL for the data platform but does not execute | `compile=True` | Optional | + + +## Note on time dimensions and `metric_time` + +You will notice that in the list of dimensions for all metrics, there is a dimension called `metric_time`. `Metric_time` is a reserved keyword for the measure-specific aggregation time dimensions. For any time-series metric, the `metric_time` keyword should always be available for use in queries. This is a common dimension across *all* metrics in a semantic graph. + +You can look at a single metric or hundreds of metrics, and if you group by `metric_time`, it will always give you the correct time series. + +Additionally, when performing granularity calculations that are global (not specific to a particular time dimension), we recommend you always operate on `metric_time` and you will get the correct answer. + +Note that `metric_time` should be available in addition to any other time dimensions that are available for the metric(s). In the case where you are looking at one metric (or multiple metrics from the same data source), the values in the series for the primary time dimension and `metric_time` are equivalent. + + +## Examples + +Refer to the following examples to help you get started with the JDBC API. + +### Fetch metadata for metrics + +You can filter/add any SQL outside of the templating syntax. For example, you can use the following query to fetch the name and dimensions for a metric: + +```bash +select name, dimensions from {{ + semantic_layer.metrics() + }} + WHERE name='food_order_amount' +``` + +### Query common dimensions + +You can select common dimensions for multiple metrics. Use the following query to fetch the name and dimensions for multiple metrics: + +```bash +select * from {{ + semantic_layer.dimensions(metrics=['food_order_amount', 'order_gross_profit']) + }} +``` + +### Query grouped by time + +The following example query uses the [shorthand method](#faqs) to fetch revenue and new customers grouped by time: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount','order_gross_profit'], + group_by=['metric_time']) + }} +``` + +### Query with a time grain + +Use the following example query to fetch multiple metrics with a change in time dimension granularities: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month')]) + }} +``` + +### Group by categorical dimension + +Use the following query to group by a categorical dimension: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month'), 'customer__customer_type']) + }} +``` + +### Query with where filters + +Where filters in the API accept either a filter list or a string. We recommend using the filter list format for production applications, as it allows the Semantic Layer to apply each filter as efficiently as possible.
+ +Where filters have the following components that you can use: + +- `Dimension()` - This is used for any categorical or time dimensions. If used for a time dimension, granularity is required - `Dimension('metric_time').grain('week')` or `Dimension('customer__country')` + +- `TimeDimension()` - This is used for all time dimensions and requires a granularity argument - `TimeDimension('metric_time', 'MONTH')` + +- `Entity()` - This is used for entities like primary and foreign keys - `Entity('order_id')` + + +Use the following example to query using a `where` filter with the string format: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], +group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], +where="{{ TimeDimension('metric_time', 'MONTH') }} >= '2017-03-09' AND {{ Dimension('customer__customer_type') }} in ('new') AND {{ Entity('order_id') }} = 10") +}} +``` + +Use the following example to query using a `where` filter with a filter list format: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], +group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], +where=[{{ TimeDimension('metric_time', 'MONTH')}} >= '2017-03-09', {{ Dimension('customer__customer_type') }} in ('new'), {{ Entity('order_id') }} = 10]) +}} +``` + +### Query with a limit and order by + +Use the following example to query using `limit` or `order_by` clauses: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + limit=10, + order_by=['order_gross_profit']) + }} +``` +### Query with compile keyword + +Use the following example to query using the `compile` keyword: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], + compile=True) + }} +``` + +## FAQs + +- **Why do some dimensions use different syntax, like `metric_time` versus `Dimension('metric_time')`?**
+ When you select a dimension on its own, such as `metric_time`, you can use the shorthand method, which doesn't need the `Dimension` syntax. However, when you perform operations on the dimension, such as adding granularity, the object syntax `Dimension('metric_time')` is required. + +- **What does the double underscore `"__"` syntax in dimensions mean?**
    + The double underscore `"__"` syntax indicates a mapping from an entity to a dimension, as well as where the dimension is located. For example, `user__country` means someone is looking at the `country` dimension from the `user` table. + +- **What is the default output when adding granularity?**
    + The default output follows the format `{time_dimension_name}__{granularity_level}`. So for example, if the time dimension name is `ds` and the granularity level is yearly, the output is `ds__year`. + +## Related docs + +- [dbt Semantic Layer integration best practices](/guides/dbt-ecosystem/sl-partner-integration-guide) + diff --git a/website/docs/docs/dbt-cloud-apis/sl-manifest.md b/website/docs/docs/dbt-cloud-apis/sl-manifest.md new file mode 100644 index 00000000000..47304accea3 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-manifest.md @@ -0,0 +1,99 @@ +--- +title: "Semantic manifest" +id: sl-manifest +description: "Learn about the semantic manifest.json file and how you can use artifacts to gain insights about your dbt Semantic Layer." +tags: [Semantic Layer, APIs] +sidebar_label: "Semantic manifest" +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +dbt creates an [artifact](/reference/artifacts/dbt-artifacts) file called the _Semantic Manifest_ (`semantic_manifest.json`), which MetricFlow requires to build and run metric queries properly for the dbt Semantic Layer. This artifact contains comprehensive information about your dbt Semantic Layer. It is an internal file that acts as the integration point with MetricFlow. + +By using the semantic manifest produced by dbt Core, MetricFlow will instantiate a data flow plan and generate SQL from Semantic Layer query requests. It's a valuable reference that you can use to understand the structure and details of your data models. + +Similar to the [`manifest.json` file](/reference/artifacts/manifest-json), the `semantic_manifest.json` also lives in the `/target` directory of your dbt project. This is where dbt stores various artifacts (such as compiled models and tests) generated during the execution of your project. + +## How it's produced + +The `semantic_manifest.json` is produced whenever your dbt project is parsed. The easiest way to generate the file yourself is to run `dbt parse`. Since `dbt run`, `dbt build`, and `dbt compile` all parse your dbt project, these commands will generate a semantic manifest as well. + + +## Top level keys + +Top-level keys for the semantic manifest are: +- `semantic_models` — Starting points of data with entities, dimensions, and measures, and correspond to models in your dbt project. +- `metrics` — Functions combining measures, constraints, and so on to define quantitative indicators. +- `project_configuration` — Contains information around your project configurations + +
+Example target/semantic_manifest.json file + +```json +{ + "semantic_models": [ + { + "name": "semantic model name", + "defaults": null, + "description": "semantic model description", + "node_relation": { + "alias": "model alias", + "schema_name": "model schema", + "database": "model db", + "relation_name": "Fully qualified relation name" + }, + "entities": ["entities in the semantic model"], + "measures": ["measures in the semantic model"], + "dimensions": ["dimensions in the semantic model"] + } + ], + "metrics": [ + { + "name": "name of the metric", + "description": "metric description", + "type": "metric type", + "type_params": { + "measure": { + "name": "name for measure", + "filter": "filter for measure", + "alias": "alias for measure" + }, + "numerator": null, + "denominator": null, + "expr": null, + "window": null, + "grain_to_date": null, + "metrics": ["metrics used in defining the metric. this is used in derived metrics"], + "input_measures": [] + }, + "filter": null, + "metadata": null + } + ], + "project_configuration": { + "time_spine_table_configurations": [ + { + "location": "fully qualified table name for timespine", + "column_name": "date column", + "grain": "day" + } + ], + "metadata": null, + "dsi_package_version": {} + } +} +``` + +
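If you want to inspect this artifact in your own project, the following is a minimal Python sketch that loads the file and lists its top-level keys. It assumes you've already run `dbt parse` (or another command that parses the project) from the project root and that the default `target/` path is in use.

```python
import json
from pathlib import Path

# The artifact dbt writes whenever the project is parsed (dbt parse, run, build, or compile).
# Adjust the path if your project writes artifacts somewhere other than target/.
manifest_path = Path("target/semantic_manifest.json")

with manifest_path.open() as f:
    manifest = json.load(f)

# The three top-level keys described above.
print(sorted(manifest.keys()))  # ['metrics', 'project_configuration', 'semantic_models']

# List the semantic models and metrics the project defines.
for semantic_model in manifest.get("semantic_models", []):
    print("semantic model:", semantic_model["name"])

for metric in manifest.get("metrics", []):
    print("metric:", metric["name"], "type:", metric.get("type"))
```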
    + +## Related docs + +- [dbt Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) +- [About dbt artifacts](/reference/artifacts/dbt-artifacts) + diff --git a/website/docs/docs/dbt-cloud-environments.md b/website/docs/docs/dbt-cloud-environments.md index 5eccf3e7400..f61ec5ef72b 100644 --- a/website/docs/docs/dbt-cloud-environments.md +++ b/website/docs/docs/dbt-cloud-environments.md @@ -42,6 +42,6 @@ To use the IDE, each developer will need to set up [personal development credent ## Deployment environment -Deployment environments in dbt Cloud are crucial for executing scheduled jobs. A dbt Cloud project can have multiple deployment environments, allowing for flexibility and customization. +Deployment environments in dbt Cloud are necessary to execute scheduled jobs and use other features. A dbt Cloud project can have multiple deployment environments, allowing for flexibility and customization. However, a dbt Cloud project can only have one deployment environment that represents the production source of truth. -To learn more about dbt Cloud deployments and how to configure deployment environments, visit the [Deployment environments](/docs/deploy/deploy-environments) page. For our best practices guide, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) for more info. +To learn more about dbt Cloud deployment environments and how to configure them, visit the [Deployment environments](/docs/deploy/deploy-environments) page. For our best practices guide, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) for more info. diff --git a/website/docs/docs/dbt-support.md b/website/docs/docs/dbt-support.md index a6e9262200c..f63e016b03e 100644 --- a/website/docs/docs/dbt-support.md +++ b/website/docs/docs/dbt-support.md @@ -5,7 +5,7 @@ id: "dbt-support" ## dbt Core support -If you're developing in the command line (CLI) and have questions or need some help — reach out to the helpful dbt community through [the Community Forum](https://discourse.getdbt.com/) or [dbt Community slack](https://www.getdbt.com/community/join-the-community/). +If you're developing on the command line (CLI) and have questions or need some help — reach out to the helpful dbt community through [the Community Forum](https://discourse.getdbt.com/) or [dbt Community slack](https://www.getdbt.com/community/join-the-community/). ## dbt Cloud support diff --git a/website/docs/docs/dbt-versions/experimental-features.md b/website/docs/docs/dbt-versions/experimental-features.md index 35c64146149..5ed0cf037ca 100644 --- a/website/docs/docs/dbt-versions/experimental-features.md +++ b/website/docs/docs/dbt-versions/experimental-features.md @@ -21,3 +21,7 @@ To enable or disable experimental features: 2. Find Experimental features at the bottom of Your Profile page. 3. Click **Beta** to toggle the features on or off as shown in the following image. ![Experimental features](/img/docs/dbt-versions/experimental-feats.png) + +## Beta terms and conditions + +By using or enabling features that are not yet in general release ("Beta Features"), you agree to the [Beta Features Terms and Conditions](/assets/beta-tc.pdf). 
diff --git a/website/docs/docs/dbt-versions/product-lifecycles.md b/website/docs/docs/dbt-versions/product-lifecycles.md index f676c6af2eb..71f33110eb2 100644 --- a/website/docs/docs/dbt-versions/product-lifecycles.md +++ b/website/docs/docs/dbt-versions/product-lifecycles.md @@ -14,7 +14,7 @@ Any dbt feature will fall into one of the following lifecycle states: ### dbt Cloud -- **Beta:** Beta features may be made available for the purpose of customer testing and evaluation. These may not be feature-complete or fully stable. There may still be some planned additions and modifications to product behaviors while in Beta. Breaking changes may occur – although we will do our best to communicate them in advance, we may not always be able to do so. Beta features may not be fully documented, technical support may be limited, and service level objectives (SLOs) may not be provided. +- **Beta:** Beta features may be made available for the purpose of customer testing and evaluation. These might not be feature-complete or fully stable. There might still be some planned additions and modifications to product behaviors while in beta. Breaking changes could occur — although we will do our best to communicate them in advance, we might not always be able to do so. Beta features might not be fully documented, technical support might be limited, and service level objectives (SLOs) might not be provided. Download the [Beta Features Terms and Conditions](/assets/beta-tc.pdf) for more details. - **Preview (Private or Public):** Preview features are stable and can be considered for production deployments. There may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality to Preview features that is not backward compatible. Preview features include documentation, technical support, and include service level objectives (SLOs). Features in Preview are generally provided at no extra cost, although they may become paid features in their Generally Available state. diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md new file mode 100644 index 00000000000..fd2d163b748 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md @@ -0,0 +1,42 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "September 2023: dbt Cloud now has two types of jobs -- deploy jobs and CI jobs -- with streamlined setup and improved efficiency. " +sidebar_label: "Update: Improvements to dbt jobs" +tags: [Sept-2023, CI] +date: 2023-09-11 +sidebar_position: 10 +--- + +dbt Cloud now has two distinct job types: [deploy jobs](/docs/deploy/deploy-jobs) for building production data assets, and [continuous integration (CI) jobs](/docs/deploy/ci-jobs) for checking code changes. These jobs perform fundamentally different tasks so dbt Labs improved the setup experience with better defaults for each. + +With two types of jobs, instead of one generic type, we can better guide you through the setup flow. Best practices are built into the default settings so you can go from curious to being set up in seconds. + + + +And, we now have more efficient state comparisons on CI checks: never waste a build or test on code that hasn’t been changed. 
We now diff between the Git pull request (PR) code and what’s running in production more efficiently with the introduction of deferral to an environment versus a job. To learn more, refer to [Continuous integration in dbt Cloud](/docs/deploy/continuous-integration). + +Below is a comparison table that describes how deploy jobs and CI jobs behave differently: + +| | Deploy Jobs | CI Jobs | +| --- | --- | --- | +| Purpose | Builds production data assets. | Builds and tests new code before merging changes into production. | +| Trigger types | Triggered by a schedule or by API. | Triggered by a commit to a PR or by API. | +| Destination | Builds into a production database and schema. | Builds into a staging database and ephemeral schema, lived for the lifetime of the PR. | +| Execution mode | Runs execute sequentially, so as to not have collisions on the underlying DAG. | Runs execute in parallel to promote team velocity. | +| Efficiency run savings | Detects over-scheduled jobs and cancels unnecessary runs to avoid queue clog. | Cancels existing runs when a newer commit is pushed to avoid redundant work. | +| State comparison | Only sometimes needs to detect state. | Almost always needs to compare state against the production environment to build on modified code and its dependents. | + + +## What you need to update + +- If you want to set up a CI environment for your jobs, dbt Labs recommends that you create your CI job in a dedicated [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. To learn more about these environment best practices, refer to the guide [Get started with continuous integration tests](/guides/orchestration/set-up-ci/overview). + +- If you had set up a CI job before October 2, 2023, the job might've been misclassified as a deploy job with this update. Below describes how to fix the job type: + + If you used the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) API endpoint but didn't set `"triggers":triggers.git_provider_webhook`, the job was misclassified as a deploy job and you must re-create it as described in [Trigger a CI job with the API](/docs/deploy/ci-jobs#trigger-a-ci-job-with-the-api). + + If you used the dbt Cloud UI but didn't enable the **Run on Pull Requests** option that was in the **Continuous Integration** (CI) tab, the job was misclassified as a deploy job and you must re-create it as described in [Set up CI jobs](/docs/deploy/ci-jobs#set-up-ci-jobs). + + To check for the job type, review your CI jobs in dbt Cloud's [Run History](/docs/deploy/run-visibility#run-history) and check for the **CI Job** tag below the job name. If it doesn't have this tag, it was misclassified and you need to re-create the job. 
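If you'd rather audit your jobs programmatically instead of scanning Run History by hand, here is a rough, hedged sketch of that check in Python. The `triggers.git_provider_webhook` flag is the one referenced in the note above; the endpoint path, auth header format, and response shape shown here are assumptions based on the dbt Cloud Administrative API v2 and should be confirmed against the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) and related API references before you rely on them.

```python
import os
import requests

# Rough sketch: list jobs and flag any that lack the git_provider_webhook trigger.
# Endpoint path, header, and response shape are assumptions -- verify them against
# the dbt Cloud Administrative API v2 reference.
ACCOUNT_ID = 1234  # illustrative; replace with your account ID
API_TOKEN = os.environ["DBT_CLOUD_API_TOKEN"]

resp = requests.get(
    f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}/jobs/",
    headers={"Authorization": f"Token {API_TOKEN}"},
)
resp.raise_for_status()

for job in resp.json().get("data", []):
    triggers = job.get("triggers") or {}
    if not triggers.get("git_provider_webhook"):
        # A job meant to run on pull requests should have this trigger set;
        # if yours doesn't, re-create it as described above.
        print(f"Check job {job.get('id')} ({job.get('name')!r}): no git_provider_webhook trigger")
```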
+ + diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md new file mode 100644 index 00000000000..174de2bdaaf --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md @@ -0,0 +1,16 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "September 2023: Improved deletion of temporary schemas" +sidebar_label: "Update: Improved automatic deletion of temporary schemas" +tags: [Sept-2023, CI] +date: 2023-09-18 +sidebar_position: 08 +--- + +Temporary schemas are now being automatically deleted (dropped) for all adapters (like Databricks), PrivateLink connections, and environment variables in connection strings. + +dbt Labs has rearchitected how schema deletion works for [continuous integration (CI)](/docs/deploy/continuous-integration) runs. We created a new service to delete any schema with a prefix of `dbt_cloud_pr_` that's been generated by a PR run. + +However, temporary schemas will not be automatically deleted if: +- Your project overrides the [generate_schema_name macro](/docs/build/custom-schemas) but it doesn't contain the required prefix `dbt_cloud_pr_`. For details, refer to [Troubleshooting](/docs/deploy/ci-jobs#troubleshooting). +- You're using a [non-native Git integration](/docs/deploy/ci-jobs#trigger-a-ci-job-with-the-api). This is because automatic deletion relies on incoming webhooks from Git providers, which is only available through the native integrations. diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md new file mode 100644 index 00000000000..a647bb5f585 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md @@ -0,0 +1,43 @@ +--- +title: "Summer 2023 product docs updates" +id: "product-docs-summer" +description: "Summer 2023: The Product docs team merged 256 PRs, made various updates to dbt Cloud and Core, such as adding What's New, writing Semantic Layer beta docs, releasing dbt 1.6 docs, and more!" +sidebar_label: "Update: Product docs changes" +tags: [July-2023, Aug-2023, product-docs] +date: 2023-09-13 +sidebar_position: 09 +--- + +Hello from dbt's Product Documentation team (the stewards of the docs.getdbt.com site): @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun. What a busy summer! We merged 256 PRs between July 1st and August 31. + +We'd like to recognize all of the docs and support from our partner team, Developer Experience: @jasnonaz @gwenwindflower @dbeatty10 @dataders @joellabes @Jstein77 @dave-connors-3! + +We'd also like to give a special thanks to the 22 community members who contributed to the [dbt Product docs](https://docs.getdbt.com) for the first time. :pray: Based on feedback from the dbt community, we made these changes: + +- Added a [permissions table](/docs/cloud/manage-access/enterprise-permissions) for Enterprise accounts +- Added a [browser session page](/docs/cloud/about-cloud/browsers#browser-sessions) that clarifies dbt Cloud’s browser session time and when it logs users off. + +You can provide feedback by opening a pull request or issue in [our repo](https://github.com/dbt-labs/docs.getdbt.com) or reaching out in the dbt community Slack channel [#dbt-product-docs](https://getdbt.slack.com/archives/C0441GSRU04)). 
+ +## :zap: General docs projects + +* Added the ability to collapse sections you’re not currently looking at. There were quite a few people who wanted this, and it bugged us too, so we were happy to get this shipped! +* Introduced the idea of [“Trusted” adapters](/docs/supported-data-platforms#types-of-adapters). + +## ☁ Cloud projects + +* The **What’s new?** product update widget is back in the dbt Cloud UI! The Docs team will begin updating the content to keep you informed about new features. +* Launched the re-released [Semantic Layer beta docs](/docs/use-dbt-semantic-layer/dbt-sl), which introduces users to the new API, new guide to set up MetricFlow and the new Semantic Layer, as well as revamp the ‘Use the dbt Semantic Layer’ section for users. +* Updated [Admin API v2 and v3](/docs/dbt-cloud-apis/admin-cloud-api) to help you understand the differences between them and which version includes the endpoints you use. +* To improve discoverability, the docs team made changes to the [deploy dbt sidebar](/docs/deploy/deployments). We added cards and aligned better with the dbt Cloud UI and the way it’s used. +* Deprecated legacy job schemas in the [Discovery API](/docs/dbt-cloud-apis/discovery-api). +* Added a page to describe [experimental and beta features](/docs/dbt-versions/experimental-features) in dbt Cloud and what you need to know about them. +* Added a section to introduce a new beta feature [**Extended Attributes**](/docs/dbt-cloud-environments#extended-attributes-beta), which allows users to set a flexible `profiles.yml` snippet in their dbt Cloud Environment settings. +## 🎯 Core projects + +* We released [dbt 1.6](/guides/migration/versions/upgrading-to-v1.6)! We added docs for the new commands `dbt retry` and `dbt clone` + +## New 📚 Guides, ✏️ blog posts, and FAQs +* Check out how these community members use the dbt community in the [Community spotlight](/community/spotlight). +* Blog posts published this summer include [Optimizing Materialized Views with dbt](/blog/announcing-materialized-views), [Data Vault 2.0 with dbt Cloud](/blog/data-vault-with-dbt-cloud), and [Create dbt Documentation and Tests 10x faster with ChatGPT](/blog/create-dbt-documentation-10x-faster-with-chatgpt) +* We now have two new best practice guides: [How we build our metrics](/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) and [Set up Continuous Integration](/guides/orchestration/set-up-ci/overview). diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md new file mode 100644 index 00000000000..0b588376c34 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md @@ -0,0 +1,15 @@ +--- +title: "Update: Removing old (prerelease) versions of dbt from dbt Cloud when (latest) is available" +description: "Sept 2023: Improving the version selection options by removing prerelease versions whenever the latest version is available." +sidebar_label: "Update: Removing old prerelease versions from dbt Cloud" +tags: [Sept-2023, Versions] +date: 2023-09-26 +sidebar_position: 07 +--- + +Previously, when dbt Labs released a new [version](/docs/dbt-versions/core#how-dbt-core-uses-semantic-versioning) in dbt Cloud, the older patch _prerelease_ version and the _latest_ version remained as options in the dropdown menu available in the **Environment settings**. 
Now, when the _latest_ version is released, the _prerelease_ version will be removed and all customers remaining on it will be migrated seamlessly. There will be no interruptions to service when this migration occurs. + +To see which version you are currently using and to upgrade, select **Deploy** in the top navigation bar and select **Environments**. Choose the preferred environment and click **Settings**. Click **Edit** to make a change to the current dbt version. dbt Labs recommends always using the latest version whenever possible to take advantage of new features and functionality. + + + \ No newline at end of file diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md new file mode 100644 index 00000000000..cd088b92fab --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md @@ -0,0 +1,126 @@ +--- +title: "Deprecation: Query patterns and endpoints in the dbt Cloud Discovery API" +description: "August 2023: Learn about the upcoming deprecation of certain endpoints and query patterns in the Discovery API." +sidebar_position: 6 +sidebar_label: "Deprecation: Certain Discovery API endpoints and query patterns" +tags: [Aug-2023, API] +date: 2023-08-31 +--- + +dbt Labs has deprecated and will be deprecating certain query patterns and replacing them with new conventions to enhance the performance of the dbt Cloud [Discovery API](/docs/dbt-cloud-apis/discovery-api). + +All these changes will be in effect on _September 7, 2023_. + +We understand that these changes might require adjustments to your existing integration with the Discovery API. Please [contact us](mailto:support@getdbt.com) with any questions. We're here to help you during this transition period. + +## Job-based queries + +Job-based queries that use the data type `Int` for IDs will be deprecated. They will be marked as deprecated in the [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql). The new convention will be for you to use the data type `BigInt` instead. + +This change will be in effect starting September 7, 2023. + + +Example of query before deprecation: + +```graphql +query ($jobId: Int!) { + models(jobId: $jobId){ + uniqueId + } +} +``` + +Example of query after deprecation: + +```graphql +query ($jobId: BigInt!) { + job(id: $jobId) { + models { + uniqueId + } + } +} +``` + +## modelByEnvironment queries + +The `modelByEnvironment` object has been renamed and moved into the `environment` object. This change is in effect and has been since August 15, 2023. + +Example of query before deprecation: + +```graphql +query ($environmentId: Int!, $uniqueId: String) { + modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } +} +``` + +Example of query after deprecation: + +```graphql +query ($environmentId: BigInt!, $uniqueId: String) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } + } + } +} +``` + + +## Environment and account queries + +Environment and account queries that use `Int` as a data type for ID have been deprecated. IDs must now be in `BigInt`. This change is in effect and has been since August 15, 2023. + + +Example of query before deprecation: + +```graphql +query ($environmentId: Int!, $first: Int!) 
{ + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + + +Example of query after deprecation: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + + diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md new file mode 100644 index 00000000000..10baa5cd6d7 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md @@ -0,0 +1,38 @@ +--- +title: "Update: Cloud IDE v1.2 includes a new service" +description: "August 2023: Cloud IDE now uses dbt-server to provide more reliable service and dbt Core feature parity, including support for commands like `dbt list`." +sidebar_label: "Update: Cloud IDE v1.2" +tags: [Aug-2023, IDE] +date: 2023-08-03 +sidebar_position: 8 +--- + +We're excited to announce that we replaced the backend service that powers the Cloud IDE with a more reliable server -- dbt-server. Because this release contains foundational changes, IDE v1.2 requires dbt v1.6 or higher. This significant update follows the rebuild of the IDE frontend last year. We're committed to improving the IDE to provide you with a better experience. + +Previously, the Cloud IDE used dbt-rpc, an outdated service that was unable to stay up-to-date with changes from dbt-core. The dbt-rpc integration used legacy dbt-core entry points and logging systems, causing it to be sluggish, brittle, and poorly tested. The Core team had been working around this outdated technology to avoid breaking it, which prevented them from developing with velocity and confidence. + +## New features + +- **Better dbt-core parity:** The Cloud IDE has better command parity with dbt-core, including support for commands like `dbt list` and improved treatment of flags like `--vars`, `--fail-fast`, etc. +- **Improved maintainability:** With the new dbt-server, it's easier to fix bugs and improve the overall quality of the product. With dbt-rpc, fixing bugs was a time-consuming and challenging process that required extensive testing. With the new service, we can identify and fix bugs more quickly, resulting in a more stable and reliable IDE. +- **A more reliable service:** Simplified architecture that's less prone to failure. + +### Product refinements + +- Improved `Preview` capabilities with Core v1.6 + IDE v1.2. [This Loom](https://www.loom.com/share/12838feb77bf463c8585fc1fc6aa161b) provides more information. 
+ +### Bug fixes + +- Global page can become "inert" and stop handling clicks +- Switching back and forth between files in the git diff view can cause overwrite +- Browser gets stuck during markdown preview for doc with large table +- Editor right click menu is offset +- Unable to Cancel on the Save New File component when Closing All Files in the IDE +- Mouse flicker in the modal's file tree makes it difficult to select a folder where you want to save a new file +- Snapshots not showing in Lineage when inside a subfolder and is mixed cased named +- Tooltips do not work for Format and Save +- When a dbt invocation is in progress or if parsing is ongoing, attempting to switch branches will cause the `Git Branch` dropdown to close automatically + +### Known issues + +- `{{this}}` function does not display properly in preview/compile with dbt-server diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md new file mode 100644 index 00000000000..921ed6dcd79 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md @@ -0,0 +1,65 @@ +--- +title: "Enhancement: Revamped dbt Semantic Layer available in public beta" +description: "August 2023: The revamped dbt Semantic Layer, now available in public beta, introduces new semantic components and evolves the semantic layer's capability." +sidebar_label: "Enhancement: Revamped dbt Semantic Layer in public beta" +tags: [Aug-2023, dbt Semantic Layer] +date: 2023-08-03 +sidebar_position: 7 +--- + +:::important +If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. +::: + +dbt Labs are thrilled to announce the re-release of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), now available in [public beta](#public-beta). It aims to bring the best of modeling and semantics to downstream applications by introducing: + +- [MetricFlow](/docs/build/about-metricflow) is a framework for constructing performant and legible SQL from an all new set of semantic constructs which include semantic models, entities, and metrics. +- New Semantic Layer infrastructure that enables support for more data platforms (Snowflake, Databricks, BigQuery, Redshift, and soon more), along with improved performance. +- New and improved [developer workflows](/guides/migration/sl-migration), governance, and collaboration features. +- New [Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) using JDBC to query metrics and build integrations. + +With semantics at its core, the dbt Semantic Layer marks a crucial milestone towards a new era of centralized logic and data applications. + + + +## Enhanced dbt Semantic Layer + +What sets the dbt Semantic Layer apart is its ability to centralize logic for many downstream data applications, streamlining access and governance and enabling more efficient utilization of data models. It provides a consistent view of data while simplifying complex tasks in downstream applications and reducing the costs of and barriers to data access. 
+ +We are excited to present several important capabilities with the enhanced dbt Semantic Layer: + +- **Consistent organization**: Provides a consistent view of data, ensuring that metrics and definitions match across the organization and the breadth of interfaces where data is consumed. This fosters trust in data and drives better decision-making by eliminating inconsistencies and errors that come up when individual users define metrics independently. + +- **Improved governance**: The dbt Semantic Layer ensures proper governance and auditing of data changes, providing an auditable record of modifications and clear ownership. This saves time by making it clear who can create and manage new metrics, ensuring accountability and data integrity. + +- **Reduce costs**: The dbt Semantic Layer simplifies complex tasks, such as bridging entities across a semantic graph. Often users duplicate slices and dice of data and make them available in a data platform, making it difficult to manage and causing high computation. The dbt Semantic Layer minimizes duplication of work and reduces computational costs - allowing users to focus on analyzing data rather than navigating intricate technical processes or duplicating work. + +- **Enhanced efficiency**: With the dbt Semantic Layer, data teams can create and update metrics using a new set of validations that make defining and iterating on metrics efficient. The streamlined development workflows makes it simpler for a data team to serve large organizations with broad data needs. + +- **Accessible data**: Defining common metrics and dimensions and making them joinable, makes access simpler for users with less expertise in the specifics of a company's data modeling work. This creates opportunities to leverage data insights, fostering collaboration and driving innovation in a more inclusive data environment. + +By bringing these enhancements to the dbt Semantic Layer, we enable organizations of all sizes and industries to leverage the power of semantics in their data workflows. + +## Public beta + +The dbt Semantic Layer is currently available as a public beta, which means: + +- **Who** — To experience the new dbt Semantic Layer, you must be on a dbt Cloud [Team and Enterprise](https://www.getdbt.com/pricing/) multi-tenant dbt Cloud plan, [hosted](/docs/cloud/about-cloud/regions-ip-addresses) in North America and on dbt v1.6 and higher. Look out for announcements on removing the location requirement soon. + + - Developer plans or dbt Core users can use MetricFlow to define and test metrics using the dbt MetricFlow CLI only. + +- **What** — Public beta provides early access to new features. The dbt Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to general availability later this year. We may also introduce new functionality that isn't backwards compatible. We provide support, and relevant service level objectives (SLOs) apply. If you have any questions on pricing, please reach out to your account representative. + +- **When** — Public beta starts on August 1st, 2023. + +- **Where** — You can experience the dbt Semantic Layer in dbt Cloud. Public beta is enabled at the account level so you don’t need to worry about enabling it per user. 
+ +## Next steps + +To experience the universal dbt Semantic Layer and its enhanced beta capabilities, check out: + +- [Introducing the new dbt Semantic Layer](https://www.getdbt.com/blog/introducing-new-look-dbt-semantic-layer) +- [dbt Semantic Layer docs](/docs/use-dbt-semantic-layer/dbt-sl) +- [dbt Semantic Layer get started guide](/docs/use-dbt-semantic-layer/quickstart-sl) +- [Build your metrics with MetricFlow](/docs/build/build-metrics-intro) + diff --git a/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md b/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md index 0f88f1d2fa8..ba82234c0b5 100644 --- a/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md +++ b/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md @@ -2,7 +2,7 @@ title: "Enhancement: Faster run starts and unlimited job concurrency" description: "We have enhanced the dbt Cloud Scheduler by reducing prep time for all accounts and provided unlimited job concurrency for Enterprise accounts." sidebar_label: "Enhancement: Faster run starts and unlimited job concurrency" -tags: [07-2023, scheduler] +tags: [July-2023, scheduler] date: 2023-07-06 sidebar_position: 10 --- diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md index c4caf42f355..fa02a6d9bd8 100644 --- a/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md @@ -1,17 +1,17 @@ --- title: "Update: Improvements to dbt Cloud continuous integration" -description: "dbt Cloud's CI checks now run in parallel, will not block production runs, and stale runs are automatically cancelled when a newer commit is pushed." +description: "dbt Cloud's CI checks now run in parallel, will not block production runs, and stale runs are automatically canceled when a newer commit is pushed." sidebar_label: "Update: Improvements to continuous integration" tags: [June-2023, CI] date: 2023-06-20 sidebar_position: 8 --- -dbt Cloud Slim CI is a critical part of the analytics engineering workflow. Large teams rely on process to ensure code quality is high, and they look to dbt Cloud CI to automate testing code changes in an efficient way, enabling speed while keep the bar high. With status checks directly posted to their dbt PRs, developers gain the confidence that their code changes will work as expected in production, and once you’ve grown accustomed to seeing that green status check in your PR, you won’t be able to work any other way. +dbt Cloud CI is a critical part of the analytics engineering workflow. Large teams rely on process to ensure code quality is high, and they look to dbt Cloud CI to automate testing code changes in an efficient way, enabling speed while keep the bar high. With status checks directly posted to their dbt PRs, developers gain the confidence that their code changes will work as expected in production, and once you’ve grown accustomed to seeing that green status check in your PR, you won’t be able to work any other way. -What separates dbt Cloud CI from other CI providers is its ability to keep track of state of what’s running in your production environment, so that when you run a Slim CI job, only the modified data assets in your pull request and their downstream dependencies get built and tested in a staging schema. 
dbt Cloud aims to make each CI check as efficient as possible, so as to not waste any data warehouse resources. As soon as the Slim CI run completes, its status posts directly back to the PR in GitHub, GitLab, or Azure DevOps, depending on which Git provider you’re using. Teams can set up guardrails to let only PRs with successful CI checks be approved for merging, and the peer review process is greatly streamlined because dbt Cloud does the first testing pass. +What separates dbt Cloud CI from other CI providers is its ability to keep track of state of what’s running in your production environment, so that when you run a CI job, only the modified data assets in your pull request and their downstream dependencies get built and tested in a staging schema. dbt Cloud aims to make each CI check as efficient as possible, so as to not waste any data warehouse resources. As soon as the CI run completes, its status posts directly back to the PR in GitHub, GitLab, or Azure DevOps, depending on which Git provider you’re using. Teams can set up guardrails to let only PRs with successful CI checks be approved for merging, and the peer review process is greatly streamlined because dbt Cloud does the first testing pass. We're excited to introduce a few critical capabilities to dbt Cloud CI that will improve productivity and collaboration in your team’s testing and integration workflow. As of this week, you can now: @@ -21,4 +21,4 @@ We're excited to introduce a few critical capabilities to dbt Cloud CI that will - **Run CI checks without blocking production runs**. CI checks will no longer consume run slots, meaning you can have as many CI checks running as you want, without impeding your production jobs. -To learn more, refer to [Continuous integration](/docs/deploy/continuous-integration) and [Slim CI jobs](/docs/deploy/slim-ci-jobs). +To learn more, refer to [Continuous integration](/docs/deploy/continuous-integration) and [CI jobs](/docs/deploy/ci-jobs). diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md index 9217736a2d8..469d2ac362b 100644 --- a/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md @@ -13,11 +13,11 @@ Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/) in June: ## ☁ Cloud projects -- We clarified the nuances of [CI and Slim CI jobs](/docs/deploy/continuous-integration), updated the [Scheduler content](/docs/deploy/job-scheduler), added two new pages for the job settings and run visibility, moved the project state page to the [Syntax page](/reference/node-selection/syntax), and provided a landing page for [Deploying with Cloud](/docs/deploy/dbt-cloud-job) to help readers navigate the content better. +- We clarified the nuances of [CI and CI jobs](/docs/deploy/continuous-integration), updated the [Scheduler content](/docs/deploy/job-scheduler), added two new pages for the job settings and run visibility, moved the project state page to the [Syntax page](/reference/node-selection/syntax), and provided a landing page for [Deploying with Cloud](/docs/deploy/jobs) to help readers navigate the content better. 
- We reformatted the [Supported data platforms page](/docs/supported-data-platforms) by adding dbt Cloud to the page, splitting it into multiple pages, using cards to display verified adapters, and moving the [Warehouse setup pages](/docs/core/connect-data-platform/about-core-connections) to the Docs section. - We launched a new [Lint and format page](/docs/cloud/dbt-cloud-ide/lint-format), which highlights the awesome new dbt Cloud IDE linting/formatting function. - We enabled a connection between [dbt Cloud release notes](/docs/dbt-versions/dbt-cloud-release-notes) and the dbt Slack community. This means new dbt Cloud release notes are automatically sent to the slack community [#dbt-cloud channel](https://getdbt.slack.com/archives/CMZ2V0X8V) via RSS feed, keeping users up to date with changes that may affect them. -- We’ve added two new docs links in the dbt Cloud Job settings user interface (UI). This will provide additional guidance and help users succeed when setting up a dbt Cloud job: [job commands](/docs/deploy/job-commands) and [job triggers](/docs/deploy/job-triggers). +- We’ve added two new docs links in the dbt Cloud Job settings user interface (UI). This will provide additional guidance and help users succeed when setting up a dbt Cloud job: [job commands](/docs/deploy/job-commands) and job triggers. - We added information related to the newly created [IT license](/docs/cloud/manage-access/about-user-access#license-based-access-control), available for Team and Enterprise plans. - We added a new [Supported browser page](/docs/cloud/about-cloud/browsers), which lists the recommended browsers for dbt Cloud. - We launched a new page informing users of [new Experimental features option](/docs/dbt-versions/experimental-features) in dbt Cloud. diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md index 5503b40576d..d85ffa154dd 100644 --- a/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md @@ -1,46 +1,46 @@ ---- -title: "May IDE updates and fixes" -id: "may-ide-updates" -description: "May 2023 release note: We've launched SQLFluff in beta, released an IDE UI page, significantly improved IDE performance, improved error messages, fixed bugs, and more." -sidebar_label: "Update and fixes: IDE" -sidebar_position: 2 -tags: [May-2023, IDE] ---- - -To continue improving your [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) development experience, the dbt Labs team continues to work on adding new features, fixing bugs, and increasing reliability ✨. - -Stay up-to-date with [IDE-related changes](/tags/ide). - -## New features -- Lint via SQL Fluff is now available in beta (GA over the next 2-3 weeks) -- Format markdown files with prettier -- Leverage developer experience shortcuts, including ``Ctrl + ` `` (toggle history drawer), `CMD + Option + /` (toggle block comment), `CMD + Shift + P` (open command palette), `Option + W` (close editor tab) -- Display parent folder name for files with same name in Changes section -- Navigate the new IDE features quickly using [the IDE User Interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) help page -- Use `top X` in SQL when previewing in the IDE -- Opt into the new IDE backend layer over the past month (still with dbt-rpc). Ready for beta later in June! 
- - -## Product refinements - -- Performance-related upgrades: - - Reduced cold start time by 60+% - - Improved render time of modals in the IDE by 98% - - Improved IDE performance with dbt Core v1.5+ (faster and snappier – highly encourage you to [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud)!) -- Upgraded sqlfmt (which powers the Format button) to 0.18.0 -- Updated Build button to change menu options based on file/model type (snapshot, macro, etc.) -- Display message to disable adblocker for file contents error -- Moved Format button to console bar -- Made many security enhancements in the IDE -## Bug fixes - -- File icon sizes no longer get wonky in small screen -- Toast notifications no longer take over command bar menu -- Hover info inside the text editor no longer gets cut off -- Transition between a file and a recently modified scratchpad no longer triggers a console error -- dbt v1.5+ now can access the IDE -- Confirm button on the Unsaved Changes modal now closes after clicking it -- Long node names no longer overflow in the parsed logs section in history drawer -- Status pill in history drawer no longer scales with longer command -- Tooltip for tab name with a long file name is no longer cut off -- Lint button should no longer available in main branch +--- +title: "May IDE updates and fixes" +id: "may-ide-updates" +description: "May 2023 release note: We've launched SQLFluff in beta, released an IDE UI page, significantly improved IDE performance, improved error messages, fixed bugs, and more." +sidebar_label: "Update and fixes: IDE" +sidebar_position: 2 +tags: [May-2023, IDE] +--- + +To continue improving your [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) development experience, the dbt Labs team continues to work on adding new features, fixing bugs, and increasing reliability ✨. + +Stay up-to-date with [IDE-related changes](/tags/ide). + +## New features +- Lint via SQL Fluff is now available in beta (GA over the next 2-3 weeks) +- Format markdown files with prettier +- Leverage developer experience shortcuts, including ``Ctrl + ` `` (toggle history drawer), `CMD + Option + /` (toggle block comment), `CMD + Shift + P` (open command palette), `Option + W` (close editor tab) +- Display parent folder name for files with same name in Changes section +- Navigate the new IDE features quickly using [the IDE User Interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) help page +- Use `top X` in SQL when previewing in the IDE +- Opt into the new IDE backend layer over the past month (still with dbt-rpc). Ready for beta later in June! + + +## Product refinements + +- Performance-related upgrades: + - Reduced cold start time by 60+% + - Improved render time of modals in the IDE by 98% + - Improved IDE performance with dbt Core v1.5+ (faster and snappier – highly encourage you to [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud)!) +- Upgraded sqlfmt (which powers the Format button) to 0.18.0 +- Updated Build button to change menu options based on file/model type (snapshot, macro, etc.) 
+- Display message to disable adblocker for file contents error +- Moved Format button to console bar +- Made many security enhancements in the IDE +## Bug fixes + +- File icon sizes no longer get wonky in small screen +- Toast notifications no longer take over command bar menu +- Hover info inside the text editor no longer gets cut off +- Transition between a file and a recently modified scratchpad no longer triggers a console error +- dbt v1.5+ now can access the IDE +- Confirm button on the Unsaved Changes modal now closes after clicking it +- Long node names no longer overflow in the parsed logs section in history drawer +- Status pill in history drawer no longer scales with longer command +- Tooltip for tab name with a long file name is no longer cut off +- Lint button should no longer available in main branch diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md index 991fc9be1f4..d30bcf85b99 100644 --- a/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md @@ -20,7 +20,7 @@ Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleo * [dbt Cloud quickstart guide](/quickstarts/starburst-galaxy),  * [connection page](/docs/cloud/connect-data-platform/connect-starburst-trino),  * [set up page](/docs/core/connect-data-platform/trino-setup), and [config page](/reference/resource-configs/trino-configs). -- Enhanced [dbt Cloud jobs page](/docs/deploy/dbt-cloud-job) and section to include conceptual info on the queue time, improvements made around it, and about failed jobs. +- Enhanced [dbt Cloud jobs page](/docs/deploy/jobs) and section to include conceptual info on the queue time, improvements made around it, and about failed jobs. - Check out the April dbt [Cloud release notes](/docs/dbt-versions/dbt-cloud-release-notes) ## 🎯 Core projects diff --git a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md index 0e26d8dc628..bdc89b4abde 100644 --- a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md +++ b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md @@ -8,7 +8,7 @@ tags: [Dec-2022] To help save compute time, new jobs will no longer be triggered to run by default. When you create a new job in dbt Cloud, you can trigger the job to run by selecting **Run on schedule** and completing the desired schedule and timing information. -For more information, refer to [Job triggers](/docs/deploy/job-triggers). +For more information, refer to [Deploy jobs](/docs/deploy/deploy-jobs). diff --git a/website/docs/docs/deploy/ci-jobs.md b/website/docs/docs/deploy/ci-jobs.md new file mode 100644 index 00000000000..fb603e2864e --- /dev/null +++ b/website/docs/docs/deploy/ci-jobs.md @@ -0,0 +1,158 @@ +--- +title: "Continuous integration jobs in dbt Cloud" +sidebar_label: "CI jobs" +description: "Learn how to create and set up CI checks to test code changes before deploying to production." +--- + +You can set up [continuous integration](/docs/deploy/continuous-integration) (CI) jobs to run when someone opens a new pull request (PR) in your dbt Git repository. 
By running and testing only _modified_ models, dbt Cloud ensures these jobs are as efficient and resource conscientious as possible on your data platform. + + +## Set up CI jobs {#set-up-ci-jobs} + +dbt Labs recommends that you create your CI job in a dedicated dbt Cloud [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. Having a separate environment dedicated for CI will provide better isolation between your temporary CI schema builds and your production data builds. Additionally, sometimes teams need their CI jobs to be triggered when a PR is made to a branch other than main. If your team maintains a staging branch as part of your release process, having a separate environment will allow you to set a [custom branch](/faqs/environments/custom-branch-settings) and, accordingly, the CI job in that dedicated environment will be triggered only when PRs are made to the specified custom branch. To learn more, refer to [Get started with CI tests](/guides/orchestration/set-up-ci/overview). + +### Prerequisites +- You have a dbt Cloud account. +- For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your dbt Cloud account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You must be connected using dbt Cloud’s native Git integration with [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or [Azure DevOps](/docs/cloud/git/connect-azure-devops). + - If you’re using GitLab, you must use a paid or self-hosted account which includes support for GitLab webhooks. + - If you previously configured your dbt project by providing a generic git URL that clones using SSH, you must reconfigure the project to connect through dbt Cloud's native integration. + + +To make CI job creation easier, many options on the **CI job** page are set to default values that dbt Labs recommends that you use. If you don't want to use the defaults, you can change them. + +1. On your deployment environment page, click **Create Job** > **Continuous Integration Job** to create a new CI job. + +2. Options in the **Job Description** section: + - **Job Name** — Specify the name for this CI job. + - **Environment** — By default, it’s set to the environment you created the CI job from. + - **Triggered by pull requests** — By default, it’s enabled. Every time a developer opens up a pull request or pushes a commit to an existing pull request, this job will get triggered to run. + +3. Options in the **Execution Settings** section: + - **Commands** — By default, it includes the `dbt build --select state:modified+` command. This informs dbt Cloud to build only new or changed models and their downstream dependents. Importantly, state comparison can only happen when there is a deferred environment selected to compare state to. Click **Add command** to add more [commands](/docs/deploy/job-commands) that you want to be invoked when this job runs. + - **Compare changes against an environment (Deferral)** — By default, it’s set to the **Production** environment if you created one. This option allows dbt Cloud to check the state of the code in the PR against the code running in the deferred environment, so as to only check the modified code, instead of building the full table or the entire DAG. 
+ + :::info + Older versions of dbt Cloud only allow you to defer to a specific job instead of an environment. Deferral to a job compares state against the project code that was run in the deferred job's last successful run. While deferral to an environment is more efficient as dbt Cloud will compare against the project representation (which is stored in the `manifest.json`) of the last successful deploy job run that executed in the deferred environment. By considering _all_ [deploy jobs](/docs/deploy/deploy-jobs) that run in the deferred environment, dbt Cloud will get a more accurate, latest project representation state. + ::: + + - **Generate docs on run** — Enable this option if you want to [generate project docs](/docs/collaborate/build-and-view-your-docs) when this job runs. This option is disabled by default since most teams do not want to test doc generation on every CI check. + + + +4. (optional) Options in the **Advanced Settings** section: + - **Environment Variables** — Define [environment variables](/docs/build/environment-variables) to customize the behavior of your project when this CI job runs. You can specify that a CI job is running in a _Staging_ or _CI_ environment by setting an environment variable and modifying your project code to behave differently, depending on the context. It's common for teams to process only a subset of data for CI runs, using environment variables to branch logic in their dbt project code. + - **Target Name** — Define the [target name](/docs/build/custom-target-names). Similar to **Environment Variables**, this option lets you customize the behavior of the project. You can use this option to specify that a CI job is running in a _Staging_ or _CI_ environment by setting the target name and modifying your project code to behave differently, depending on the context. + - **Run Timeout** — Cancel this CI job if the run time exceeds the timeout value. You can use this option to help ensure that a CI check doesn't consume too much of your warehouse resources. + - **dbt Version** — By default, it’s set to inherit the [dbt version](/docs/dbt-versions/core) from the environment. dbt Labs strongly recommends that you don't change the default setting. This option to change the version at the job level is useful only when you upgrade a project to the next dbt version; otherwise, mismatched versions between the environment and job can lead to confusing behavior. + - **Threads** — By default, it’s set to 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. + - **Run source freshness** — Enable this option to invoke the `dbt source freshness` command before running this CI job. Refer to [Source freshness](/docs/deploy/source-freshness) for more details. + + + + +## Trigger a CI job with the API + +If you're not using dbt Cloud’s native Git integration with [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or [Azure DevOps](/docs/cloud/git/connect-azure-devops), you can use the [Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) to trigger a CI job to run. However, dbt Cloud will not automatically delete the temporary schema for you. This is because automatic deletion relies on incoming webhooks from Git providers, which is only available through the native integrations. + +### Prerequisites + +- You have a dbt Cloud account. 
+- For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your dbt Cloud account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). + + +1. Set up a CI job with the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) API endpoint using `"job_type": ci` or from the [dbt Cloud UI](#set-up-ci-jobs). +1. Call the [Trigger Job Run](/dbt-cloud/api-v2#/operations/Trigger%20Job%20Run) API endpoint to trigger the CI job. Provide the pull request (PR) ID to the payload using one of these fields, even if you're using a different Git provider (like Bitbucket): + + - `github_pull_request_id` + - `gitlab_merge_request_id` + - `azure_devops_pull_request_id`  + + This can make your code less human-readable but it will _not_ affect dbt functionality. + +## Example pull requests + +The green checkmark means the dbt build and tests were successful. Clicking on the dbt Cloud section navigates you to the relevant CI run in dbt Cloud. + +### GitHub pull request example + + + +### GitLab pull request example + + + +### Azure DevOps pull request example + + + + +## Troubleshooting + +If you're experiencing any issues, review some of the common questions and answers below. + +
+**Temporary schemas aren't dropping**
+
+If your temporary schemas aren't dropping after a PR merges or closes, this typically indicates that you have overridden the `generate_schema_name` macro and it isn't using `dbt_cloud_pr_` as the prefix. To resolve this, change your macro so that the temporary PR schema name contains the required prefix (see the macro sketch at the end of this troubleshooting section). For example:
+
+- ✅ Temporary PR schema name contains the prefix `dbt_cloud_pr_` (like `dbt_cloud_pr_123_456_marketing`)
+- ❌ Temporary PR schema name doesn't contain the prefix `dbt_cloud_pr_` (like `marketing`)
+
+**Reconnecting your dbt project to use dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps**
+
+If your dbt project relies on the generic git clone method that uses SSH and deploy keys to connect to your dbt repo, you need to disconnect your repo and reconnect it using the native GitHub, GitLab, or Azure DevOps integration in order to enable dbt Cloud CI.
+
+First, make sure you have native GitHub, GitLab, or Azure DevOps authentication set up, depending on which Git provider you use. After you have gone through those steps, go to **Account Settings**, select **Projects**, and click the project you'd like to reconnect through native GitHub, GitLab, or Azure DevOps auth. Then click the repository link.
+
+Once you're on the repository page, select **Edit** and then **Disconnect Repository** at the bottom. Confirm that you'd like to disconnect your repository. You should then see a new **Configure a repository** link in your old repository's place; click through to the configuration page.
+
+Select the **GitHub**, **GitLab**, or **Azure DevOps** tab and reselect your repository. That completes the project setup and enables you to set up a dbt Cloud CI job.
+
+**Error messages that refer to schemas from previous PRs**
+
+If you receive a schema-related error message referencing a previous PR, this usually indicates that you are not deferring to a production job and are instead deferring to self. If the prior PR has already been merged, the prior PR's schema may have been dropped by the time the CI job for the current PR kicks off. To fix this issue, select a production job run to defer to instead of self.
+
+**Production job runs failing at the "Clone Git Repository" step**
+
+dbt Cloud can only check out commits that belong to the original repository; it cannot check out commits that belong to a fork of that repository. If you receive the following error message at the **Clone Git Repository** step of your job run, double-check that your PR isn't trying to merge using a commit that belongs to a fork of the repository attached to your dbt project:
+
+    Cloning into '/tmp/jobs/123456/target'...
+    Successfully cloned repository.
+    Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...
+    Failed to checkout to specified revision.
+    git checkout e845be54e6dc72342d5a8f814c8b3316ee220312
+    fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312
+
+**CI job not triggering for Virtual Private dbt users**
+
+To trigger jobs on dbt Cloud using the API, your Git provider needs to connect to your dbt Cloud account. If you're on a Virtual Private dbt Enterprise plan using security features like ingress PrivateLink or IP Allowlisting, registering CI hooks may not be available and can cause the job to fail silently.
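+
+For the first item above, a minimal sketch of a `generate_schema_name` override that keeps the dbt Cloud CI prefix could look like the following. This is an illustrative example rather than a drop-in implementation: it assumes your override normally uses the bare custom schema name, and it detects CI runs by checking for the `dbt_cloud_pr_` prefix on the target schema, so adapt the conditions to how your environments are actually configured.
+
+```sql
+{% macro generate_schema_name(custom_schema_name, node) -%}
+    {%- set default_schema = target.schema -%}
+    {%- if custom_schema_name is none -%}
+        {{ default_schema }}
+    {%- elif default_schema.startswith('dbt_cloud_pr_') -%}
+        {#- dbt Cloud CI run: keep the dbt_cloud_pr_ prefix so the temporary schema
+            (for example, dbt_cloud_pr_123_456_marketing) can be dropped later. -#}
+        {{ default_schema }}_{{ custom_schema_name | trim }}
+    {%- else -%}
+        {#- All other environments: use the bare custom schema name. -#}
+        {{ custom_schema_name | trim }}
+    {%- endif -%}
+{%- endmacro %}
+```
+
+With a guard like this, a model configured with `schema: marketing` builds into `dbt_cloud_pr_123_456_marketing` during a CI run, which dbt Cloud can drop when the PR closes, while other runs still resolve to `marketing`.
+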
    + + + diff --git a/website/docs/docs/deploy/continuous-integration.md b/website/docs/docs/deploy/continuous-integration.md index fbe28173ff6..cc856f97f22 100644 --- a/website/docs/docs/deploy/continuous-integration.md +++ b/website/docs/docs/deploy/continuous-integration.md @@ -1,57 +1,52 @@ --- title: "Continuous integration in dbt Cloud" sidebar_label: "Continuous integration" -description: "You can set up Slim continuous integration (CI) checks to test every single change prior to deploying the code to production just like in a software development workflow." +description: "You can set up continuous integration (CI) checks to test every single change prior to deploying the code to production just like in a software development workflow." --- -To implement a continuous integration (CI) workflow in dbt Cloud, you can set up automation that tests code changes by running [Slim CI jobs](/docs/deploy/slim-ci-jobs) before merging to production. dbt Cloud tracks the state of what’s running in your production environment so, when you run a Slim CI job, only the modified data assets in your pull request (PR) and their downstream dependencies are built and tested in a staging schema. You can also view the status of the CI checks (tests) directly from within the PR; this information is posted to your Git provider as soon as a Slim CI job completes. Additionally, you can enable settings in your Git provider that allow PRs only with successful CI checks be approved for merging. +To implement a continuous integration (CI) workflow in dbt Cloud, you can set up automation that tests code changes by running [CI jobs](/docs/deploy/ci-jobs) before merging to production. dbt Cloud tracks the state of what’s running in your production environment so, when you run a CI job, only the modified data assets in your pull request (PR) and their downstream dependencies are built and tested in a staging schema. You can also view the status of the CI checks (tests) directly from within the PR; this information is posted to your Git provider as soon as a CI job completes. Additionally, you can enable settings in your Git provider that allow PRs only with successful CI checks be approved for merging. -Using Slim CI helps: +Using CI helps: - Provide increased confidence and assurances that project changes will work as expected in production. - Reduce the time it takes to push code changes to production, through build and test automation, leading to better business outcomes. - Allow organizations to make code changes in a standardized and governed way that ensure code quality without sacrificing speed. -## How Slim CI works +## How CI works -When you [set up Slim CI jobs](/docs/deploy/slim-ci-jobs#set-up-slim-ci-jobs), dbt Cloud listens for webhooks from your Git provider indicating that a new PR has been opened or updated with new commits. When dbt Cloud receives one of these webhooks, it enqueues a new run of the Slim CI job. If you want CI checks to run on each new commit, you need to mark your PR as **Ready for review** in your Git provider — draft PRs _don't_ trigger CI jobs. +When you [set up CI jobs](/docs/deploy/ci-jobs#set-up-ci-jobs), dbt Cloud listens for webhooks from your Git provider indicating that a new PR has been opened or updated with new commits. When dbt Cloud receives one of these webhooks, it enqueues a new run of the CI job. If you want CI checks to run on each new commit, you need to mark your PR as **Ready for review** in your Git provider — draft PRs _don't_ trigger CI jobs. 
dbt Cloud builds and tests the models affected by the code change in a temporary schema, unique to the PR. This process ensures that the code builds without error and that it matches the expectations as defined by the project's dbt tests. The unique schema name follows the naming convention `dbt_cloud_pr__` (for example, `dbt_cloud_pr_1862_1704`) and can be found in the run details for the given run, as shown in the following image: -When the Slim CI run completes, you can view the run status directly from within the pull request. dbt Cloud updates the pull request in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run. The status message states whether the models and tests ran successfully or not. +When the CI run completes, you can view the run status directly from within the pull request. dbt Cloud updates the pull request in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run. The status message states whether the models and tests ran successfully or not. -dbt Cloud deletes the temporary schema from your  when you close or merge the pull request. If your project has database or schema customization using the [generate_database_name](/docs/build/custom-databases#generate_database_name) or [generate_schema_name](/docs/build/custom-schemas#how-does-dbt-generate-a-models-schema-name) macros, dbt Cloud might not drop the temporary schema from your data warehouse. For more information, refer to [Temp PR schema limitations](/docs/deploy/slim-ci-jobs#temp-pr-schema-limitations). +dbt Cloud deletes the temporary schema from your  when you close or merge the pull request. If your project has schema customization using the [generate_schema_name](/docs/build/custom-schemas#how-does-dbt-generate-a-models-schema-name) macro, dbt Cloud might not drop the temporary schema from your data warehouse. For more information, refer to [Troubleshooting](/docs/deploy/ci-jobs#troubleshooting). -## Differences between Slim CI jobs and other deployment jobs +## Differences between CI jobs and other deployment jobs -The [dbt Cloud scheduler](/docs/deploy/job-scheduler) executes Slim CI jobs differently from other deployment jobs in these important ways: +The [dbt Cloud scheduler](/docs/deploy/job-scheduler) executes CI jobs differently from other deployment jobs in these important ways: -- **Concurrent CI checks** — Slim CI runs triggered by the same dbt Cloud Slim CI job execute concurrently (in parallel), when appropriate -- **Smart cancellation of stale builds** — Automatically cancels stale, in-flight Slim CI runs when there are new commits to the PR -- **Run slot treatment** — Slim CI runs don't consume a run slot +- **Concurrent CI checks** — CI runs triggered by the same dbt Cloud CI job execute concurrently (in parallel), when appropriate +- **Smart cancellation of stale builds** — Automatically cancels stale, in-flight CI runs when there are new commits to the PR +- **Run slot treatment** — CI runs don't consume a run slot ### Concurrent CI checks -When you have teammates collaborating on the same dbt project creating pull requests on the same dbt repository, the same Slim CI job will get triggered. Since each run builds into a dedicated, temporary schema that’s tied to the pull request, dbt Cloud can safely execute Slim CI runs _concurrently_ instead of _sequentially_ (differing from what is done with deployment dbt Cloud jobs). 
Because no one needs to wait for one Slim CI run to finish before another one can start, with concurrent CI checks, your whole team can test and integrate dbt code faster. +When you have teammates collaborating on the same dbt project creating pull requests on the same dbt repository, the same CI job will get triggered. Since each run builds into a dedicated, temporary schema that’s tied to the pull request, dbt Cloud can safely execute CI runs _concurrently_ instead of _sequentially_ (differing from what is done with deployment dbt Cloud jobs). Because no one needs to wait for one CI run to finish before another one can start, with concurrent CI checks, your whole team can test and integrate dbt code faster. Below describes the conditions when CI checks are run concurrently and when they’re not: -- Slim CI runs with different PR numbers execute concurrently. -- Slim CI runs with the _same_ PR number and _different_ commit SHAs execute serially because they’re building into the same schema. dbt Cloud will run the latest commit and cancel any older, stale commits. For details, refer to [Smart cancellation of stale builds](#smart-cancellation). -- Slim CI runs with the same PR number and same commit SHA, originating from different dbt Cloud projects will execute jobs concurrently. This can happen when two CI jobs are set up in different dbt Cloud projects that share the same dbt repository. +- CI runs with different PR numbers execute concurrently. +- CI runs with the _same_ PR number and _different_ commit SHAs execute serially because they’re building into the same schema. dbt Cloud will run the latest commit and cancel any older, stale commits. For details, refer to [Smart cancellation of stale builds](#smart-cancellation). +- CI runs with the same PR number and same commit SHA, originating from different dbt Cloud projects will execute jobs concurrently. This can happen when two CI jobs are set up in different dbt Cloud projects that share the same dbt repository. ### Smart cancellation of stale builds {#smart-cancellation} -When you push a new commit to a PR, dbt Cloud enqueues a new Slim CI run for the latest commit and cancels any Slim CI run that is (now) stale and still in flight. This can happen when you’re pushing new commits while a CI build is still in process and not yet done. By cancelling runs in a safe and deliberate way, dbt Cloud helps improve productivity and reduce data platform spend on wasteful CI runs. - - - -### Run slot treatment - -Your Slim CI runs don't consume run slots so a CI check will never block a production run. +When you push a new commit to a PR, dbt Cloud enqueues a new CI run for the latest commit and cancels any CI run that is (now) stale and still in flight. This can happen when you’re pushing new commits while a CI build is still in process and not yet done. By cancelling runs in a safe and deliberate way, dbt Cloud helps improve productivity and reduce data platform spend on wasteful CI runs. + diff --git a/website/docs/docs/deploy/dbt-cloud-job.md b/website/docs/docs/deploy/dbt-cloud-job.md deleted file mode 100644 index fa9eead2d3b..00000000000 --- a/website/docs/docs/deploy/dbt-cloud-job.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: "dbt Cloud jobs" -id: "dbt-cloud-job" -description: "Manage, setup, and configure your dbt Cloud job using elegant job commands and triggers." -hide_table_of_contents: true -tags: ["scheduler"] ---- - -Manage, set up, and automate your dbt jobs using robust custom job settings. 
You can use the job scheduler to configure when and how your jobs run, helping you keep production data fresh on a timely basis. - -This portion of our documentation will go over dbt Cloud's various job settings using: - -- [Job settings](/docs/deploy/job-settings) — Intuitively navigate the user interface to create new dbt jobs or edit existing ones. -- [Job commands](/docs/deploy/job-commands) — Use job commands to configure dbt commands on a schedule. -- [Job triggers](/docs/deploy/job-triggers) — You can configure when and how dbt should run your job, such as: - * Running on scheduled days or cron schedules - * Setting up continuous integration (CI) to run when someone opens a new pull request in your dbt repository - * Using the API to trigger jobs - - - - - - - - diff --git a/website/docs/docs/deploy/deploy-environments.md b/website/docs/docs/deploy/deploy-environments.md index da54b918436..bdcf36b7a30 100644 --- a/website/docs/docs/deploy/deploy-environments.md +++ b/website/docs/docs/deploy/deploy-environments.md @@ -4,19 +4,19 @@ id: "deploy-environments" description: "Learn about dbt Cloud's deployment environment to seamlessly schedule jobs or enable CI." --- -Deployment environments in dbt Cloud are crucial for deploying dbt jobs. To execute dbt, environments determine the settings used during job runs, including: +Deployment environments in dbt Cloud are crucial for deploying dbt jobs in production and using features or integrations that depend on dbt metadata or results. To execute dbt, environments determine the settings used during job runs, including: - The version of dbt Core that will be used to run your project - The warehouse connection information (including the target database/schema settings) - The version of your code to execute -A dbt Cloud project can have multiple deployment environments, providing you the flexibility and customization to tailor the execution of dbt jobs. You can use deployment environments to [create and schedule jobs](/docs/deploy/job-settings#create-and-schedule-jobs), [enable continuous integration](/docs/deploy/continuous-integration), or more based on your specific needs or requirements. +A dbt Cloud project can have multiple deployment environments, providing you the flexibility and customization to tailor the execution of dbt jobs. You can use deployment environments to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs), [enable continuous integration](/docs/deploy/continuous-integration), or more based on your specific needs or requirements. :::tip Learn how to manage dbt Cloud environments To learn different approaches to managing dbt Cloud environments and recommendations for your organization's unique needs, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview). ::: - -This page will go over the different types of environments and how to intuitively configure your deployment environment in dbt Cloud. + +This page reviews the different types of environments and how to configure your deployment environment in dbt Cloud. import CloudEnvInfo from '/snippets/_cloud-environments-info.md'; @@ -28,9 +28,19 @@ To create a new dbt Cloud development environment, navigate to **Deploy** -> **E +### Set as production environment (Beta) + +import ExpBeta from '/snippets/_explorer-beta-banner.md'; + + + + + +In dbt Cloud, each project can have one designated deployment environment, which serves as its production environment. 
This production environment is _essential_ for using features like dbt Explorer and cross-project references. It acts as the source of truth for the project's production state in dbt Cloud. + ### Semantic Layer -For Semantic Layer-eligible customers, the next section of environment settings is the Semantic Layer configurations. [The Semantic Layer setup guide](/docs/use-dbt-semantic-layer/setup-dbt-semantic-layer) has the most up-to-date setup instructions! +For Semantic Layer-eligible customers, the next section of environment settings is the Semantic Layer configurations. [The Semantic Layer setup guide](/docs/use-dbt-semantic-layer/setup-sl) has the most up-to-date setup instructions! ### Deployment connection @@ -181,7 +191,7 @@ This section allows you to determine the credentials that should be used when co ## Related docs - [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) -- [Deploy dbt jobs](/docs/deploy/dbt-cloud-job) -- [Deploy CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [CI jobs](/docs/deploy/continuous-integration) - [Delete a job or environment in dbt Cloud](/faqs/Environments/delete-environment-job) diff --git a/website/docs/docs/deploy/deploy-jobs.md b/website/docs/docs/deploy/deploy-jobs.md new file mode 100644 index 00000000000..e43020bf66e --- /dev/null +++ b/website/docs/docs/deploy/deploy-jobs.md @@ -0,0 +1,101 @@ +--- +title: "Deploy jobs" +description: "Learn how to create and schedule deploy jobs in dbt Cloud for the scheduler to run. When you run with dbt Cloud, you get built-in observability, logging, and alerting." +tags: [scheduler] +--- + +You can use deploy jobs to build production data assets. Deploy jobs make it easy to run dbt commands against a project in your cloud data platform, triggered either by schedule or events. Each job run in dbt Cloud will have an entry in the job's run history and a detailed run overview, which provides you with: + +- Job trigger type +- Commit SHA +- Environment name +- Sources and documentation info, if applicable +- Job run details, including run timing, [model timing data](#model-timing), and [artifacts](/docs/deploy/artifacts) +- Detailed run steps with logs and their run step statuses + +You can create a deploy job and configure it to run on [scheduled days and times](#schedule-days) or enter a [custom cron schedule](#custom-cron-schedules). + + +## Prerequisites + +- You must have a dbt Cloud account and [Developer seat license](/docs/cloud/manage-access/seats-and-users). If you don't, you can [sign up](https://www.getdbt.com/signup/) for a [free account](https://www.getdbt.com/pricing/). +- You must have a dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections). +- You must have [access permission](/docs/cloud/manage-access/about-user-access) to view, create, modify, or run jobs. +- You must set up a [deployment environment](/docs/deploy/deploy-environments). + +## Create and schedule jobs {#create-and-schedule-jobs} + +1. On your deployment environment page, click **Create Job** > **Deploy Job** to create a new deploy job. +2. Options in the **Job Description** section: + - **Job Name** — Specify the name for the deploy job. For example, `Daily build`. + - **Environment** — By default, it’s set to the deployment environment you created the deploy job from. +3. 
Options in the **Execution Settings** section: + - **Commands** — By default, it includes the `dbt build` command. Click **Add command** to add more [commands](/docs/deploy/job-commands) that you want to be invoked when the job runs. + - **Generate docs on run** — Enable this option if you want to [generate project docs](/docs/collaborate/build-and-view-your-docs) when this deploy job runs. + - **Run source freshness** — Enable this option to invoke the `dbt source freshness` command before running the deploy job. Refer to [Source freshness](/docs/deploy/source-freshness) for more details. +4. Options in the **Schedule** section: + - **Run on schedule** — Enable this option to run the deploy job on a set schedule. + - **Timing** — Specify whether to [schedule](#schedule-days) the deploy job using **Frequency** that runs the job at specific times of day, **Specific Intervals** that runs the job every specified number of hours, or **Cron Schedule** that runs the job specified using [cron syntax](#custom-cron-schedule). + - **Days of the Week** — By default, it’s set to every day when **Frequency** or **Specific Intervals** is chosen for **Timing**. + + + +5. (optional) Options in the **Advanced Settings** section: + - **Environment Variables** — Define [environment variables](/docs/build/environment-variables) to customize the behavior of your project when the deploy job runs. + - **Target Name** — Define the [target name](/docs/build/custom-target-names) to customize the behavior of your project when the deploy job runs. Environment variables and target names are often used interchangeably. + - **Run Timeout** — Cancel the deploy job if the run time exceeds the timeout value. + - **Compare changes against** — By default, it’s set to **No deferral**. Select either **Environment** or **This Job** to let dbt Cloud know what it should compare the changes against. + + :::info + Older versions of dbt Cloud only allow you to defer to a specific job instead of an environment. Deferral to a job compares state against the project code that was run in the deferred job's last successful run. While deferral to an environment is more efficient as dbt Cloud will compare against the project representation (which is stored in the `manifest.json`) of the last successful deploy job run that executed in the deferred environment. By considering _all_ deploy jobs that run in the deferred environment, dbt Cloud will get a more accurate, latest project representation state. + ::: + + - **dbt Version** — By default, it’s set to inherit the [dbt version](/docs/dbt-versions/core) from the environment. dbt Labs strongly recommends that you don't change the default setting. This option to change the version at the job level is useful only when you upgrade a project to the next dbt version; otherwise, mismatched versions between the environment and job can lead to confusing behavior. + - **Threads** — By default, it’s set to 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. + + + +### Schedule days + +To set your job's schedule, use the **Schedule Days** option to choose specific days of the week, and select customized hours or intervals. + +Under **Timing**, you can either use customizable hours for jobs that need to run frequently throughout the day or exact intervals for jobs that need to run at specific times: + +- **Every n hours** — Use this option to set how often your job runs, in hours. 
Enter a number between 1 and 23 to represent the interval between job runs. For example, if you set it to "every 2 hours", the job will run every 2 hours from midnight UTC. This option is useful if you need to run jobs multiple times per day at regular intervals. + +- **At exact intervals** — Use this option to set specific times when your job should run. You can enter a comma-separated list of hours (in UTC) when you want the job to run. For example, if you set it to `0,12,23,` the job will run at midnight, noon, and 11 PM UTC. This option is useful if you want your jobs to run at specific times of day and don't need them to run more frequently than once a day. + +:::info + +dbt Cloud uses [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) (UTC) and does not account for translations to your specific timezone or take into consideration daylight savings time. For example: + +- 0 means 12am (midnight) UTC +- 12 means 12pm (afternoon) UTC +- 23 means 11pm UTC + +::: + +### Custom cron schedule + +To fully customize the scheduling of your job, choose the **Custom cron schedule** option and use the cron syntax. With this syntax, you can specify the minute, hour, day of the month, month, and day of the week, allowing you to set up complex schedules like running a job on the first Monday of each month. + + + + +Use tools such as [crontab.guru](https://crontab.guru/) to generate the correct cron syntax. This tool allows you to input cron snippets and returns their plain English translations. + +Refer to the following example snippets: + + +- `0 * * * *`: Every hour, at minute 0 +- `*/5 * * * *`: Every 5 minutes +- `5 4 * * *`: At exactly 4:05 AM UTC +- `30 */4 * * *`: At minute 30 past every 4th hour (e.g. 4:30AM, 8:30AM, 12:30PM, etc., all UTC) +- `0 0 */2 * *`: At midnight UTC every other day +- `0 0 * * 1`: At midnight UTC every Monday. + +## Related docs + +- [Artifacts](/docs/deploy/artifacts) +- [Continuous integration (CI) jobs](/docs/deploy/ci-jobs) +- [Webhooks](/docs/deploy/webhooks) diff --git a/website/docs/docs/deploy/deployment-overview.md b/website/docs/docs/deploy/deployment-overview.md index dddc252211e..5883ecaa3f1 100644 --- a/website/docs/docs/deploy/deployment-overview.md +++ b/website/docs/docs/deploy/deployment-overview.md @@ -29,21 +29,15 @@ Learn how to use dbt Cloud's features to help your team ship timely and quality icon="dbt-bit"/> -
    - -## dbt Cloud jobs - -
    - - -

    ## Monitor jobs and alerts @@ -77,7 +65,7 @@ Learn how to use dbt Cloud's features to help your team ship timely and quality icon="dbt-bit"/> diff --git a/website/docs/docs/deploy/deployment-tools.md b/website/docs/docs/deploy/deployment-tools.md index 26e9e4ea317..80622880c2c 100644 --- a/website/docs/docs/deploy/deployment-tools.md +++ b/website/docs/docs/deploy/deployment-tools.md @@ -4,7 +4,7 @@ id: "deployment-tools" sidebar_label: "Integrate with other tools" --- -Alongside [dbt Cloud](/docs/deploy/dbt-cloud-job), discover other ways to schedule and run your dbt jobs with the help of tools such as Airflow, Prefect, Dagster, automation server, Cron, and Azure Data Factory (ADF), +Alongside [dbt Cloud](/docs/deploy/jobs), discover other ways to schedule and run your dbt jobs with the help of tools such as Airflow, Prefect, Dagster, automation server, Cron, and Azure Data Factory (ADF), Build and install these tools to automate your data workflows, trigger dbt jobs (including those hosted on dbt Cloud), and enjoy a hassle-free experience, saving time and increasing efficiency. @@ -16,7 +16,7 @@ If your organization is using [Airflow](https://airflow.apache.org/), there are -Installing the [dbt Cloud Provider](https://registry.astronomer.io/providers/dbt-cloud) to orchestrate dbt Cloud jobs. This package contains multiple Hooks, Operators, and Sensors to complete various actions within dbt Cloud. +Installing the [dbt Cloud Provider](https://airflow.apache.org/docs/apache-airflow-providers-dbt-cloud/stable/index.html) to orchestrate dbt Cloud jobs. This package contains multiple Hooks, Operators, and Sensors to complete various actions within dbt Cloud. @@ -30,7 +30,7 @@ Invoking dbt Core jobs through the [BashOperator](https://registry.astronomer.io -For more details on both of these methods, including example implementations, check out [this guide](https://www.astronomer.io/guides/airflow-dbt). +For more details on both of these methods, including example implementations, check out [this guide](https://docs.astronomer.io/learn/airflow-dbt-cloud). ## Azure Data Factory @@ -109,6 +109,10 @@ If your organization is using [Prefect](https://www.prefect.io/), the way you wi If your organization is using [Dagster](https://dagster.io/), you can use the [dagster_dbt](https://docs.dagster.io/_apidocs/libraries/dagster-dbt) library to integrate dbt commands into your pipelines. This library supports the execution of dbt through dbt Cloud, dbt CLI and the dbt RPC server. Running dbt from Dagster automatically aggregates metadata about your dbt runs. Refer to the [example pipeline](https://dagster.io/blog/dagster-dbt) for details. +## Kestra + +If your organization uses [Kestra](http://kestra.io/), you can leverage the [dbt plugin](https://kestra.io/plugins/plugin-dbt) to orchestrate dbt Cloud and dbt Core jobs. Kestra's user interface (UI) has built-in [Blueprints](https://kestra.io/docs/user-interface-guide/blueprints), providing ready-to-use workflows. Navigate to the Blueprints page in the left navigation menu and [select the dbt tag](https://demo.kestra.io/ui/blueprints/community?selectedTag=36) to find several examples of scheduling dbt CLI commands and dbt Cloud jobs as part of your data pipelines. After each scheduled or ad-hoc workflow execution, the Outputs tab in the Kestra UI allows you to download and preview all dbt build artifacts. The Gantt and Topology view additionally render the metadata to visualize dependencies and runtimes of your dbt models and tests. 
The dbt Cloud task provides convenient links to easily navigate between Kestra and dbt Cloud UI. + ## Automation servers Automation servers, like CodeDeploy, GitLab CI/CD ([video](https://youtu.be/-XBIIY2pFpc?t=1301)), Bamboo and Jenkins, can be used to schedule bash commands for dbt. They also provide a UI to view logging to the command line, and integrate with your git repository. diff --git a/website/docs/docs/deploy/job-commands.md b/website/docs/docs/deploy/job-commands.md index acdc3a00228..ca26182fc7b 100644 --- a/website/docs/docs/deploy/job-commands.md +++ b/website/docs/docs/deploy/job-commands.md @@ -77,7 +77,6 @@ Job command failures can mean different things for different commands. Some comm ## Related docs - [Job creation best practices](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) - [dbt Command reference](/reference/dbt-commands) -- [Job triggers](/docs/deploy/job-triggers) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) - [Build and view your docs](/docs/collaborate/build-and-view-your-docs) diff --git a/website/docs/docs/deploy/job-notifications.md b/website/docs/docs/deploy/job-notifications.md index c240ca12183..8d242abac78 100644 --- a/website/docs/docs/deploy/job-notifications.md +++ b/website/docs/docs/deploy/job-notifications.md @@ -9,16 +9,24 @@ Setting up notifications in dbt Cloud will allow you to receive alerts via Email ### Email -There are two options for setting up email notifications. As a **user**, you can set up email notifications for yourself under your Profile. As an **admin**, you can set up notifications on behalf of your team members. +These are the following options for setting up email notifications. Refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users) for info on license types eligible for email notifications. -1. Click the gear in the top right and select **Notification settings**. +- As a **user** — You can set up email notifications for yourself under your Profile. +- As an **admin** — You can set up notifications on behalf of your team members. -2. **As a user:** Select **Edit** and select the type of Notification (Succeeds, Fails, or Is Cancelled) for each Job for which you would like to be notified, or +To set up job notifications, follow these steps: - **As an admin:** Select one or more users you'd like to set notifications for. If you only see your own name, then you might not have admin privileges. Select **Edit** and select the type of Notification (Succeeds, Fails, or Is Cancelled) for each Job for which they will be notified. +1. Click the gear menu in the top right corner and select **Notification Settings**. + +2. Select **Edit** to begin editing the **Email Notifications** settings. + - **As a user:** Choose the Notification type (Succeeds, Fails, or Is Cancelled) for each Job you want to receive notifications for. + + - **As an admin:** Under **Configure notifications for**, use the dropdown to select one or more users you'd like to set notifications for. If you only see your own name, then you might not have admin privileges.

    + Choose the Notification type (Succeeds, Fails, or Is Cancelled) for each Job you want them to receive notifications for. 3. Click **Save**. - + + ### Slack diff --git a/website/docs/docs/deploy/job-scheduler.md b/website/docs/docs/deploy/job-scheduler.md index 03eeb6fb377..8ade670f1cc 100644 --- a/website/docs/docs/deploy/job-scheduler.md +++ b/website/docs/docs/deploy/job-scheduler.md @@ -82,7 +82,7 @@ The scheduler prevents queue clog by canceling runs that aren't needed, ensuring -To prevent over-scheduling, users will need to take action by either refactoring the job so it runs faster or modifying its [schedule](/docs/deploy/job-triggers). +To prevent over-scheduling, users will need to take action by either refactoring the job so it runs faster or modifying its [schedule](/docs/deploy/deploy-jobs#schedule-days). ## Related docs - [dbt Cloud architecture](/docs/cloud/about-cloud/architecture#about-dbt-cloud-architecture) diff --git a/website/docs/docs/deploy/job-settings.md b/website/docs/docs/deploy/job-settings.md deleted file mode 100644 index 3b53880bddf..00000000000 --- a/website/docs/docs/deploy/job-settings.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Job settings" -description: "Learn how to create and schedule jobs in dbt Cloud for the scheduler to run. Jobs help you build observability into transformation workflows with the in-app scheduling, logging, and alerting." -tags: [scheduler] ---- - -Jobs make it easy to run dbt commands against a project in your cloud data platform, triggered either by schedule or events. Each job run in dbt Cloud will have a run history, run status, and a run overview, which provides you with: - -- Job trigger type -- Commit SHA -- Environment name -- Sources and documentation info -- Job run details, including run timing, [model timing data](#model-timing), and [artifacts](/docs/deploy/artifacts) -- Detailed run steps with logs and their statuses - -You can create a job and configure it to run on [scheduled days and times](/docs/deploy/job-triggers#schedule-days) or enter a [custom cron schedule](/docs/deploy/job-triggers#custom-cron-schedules). - -## Prerequisites - -- You must have a dbt Cloud account and [Developer seat license](/docs/cloud/manage-access/seats-and-users). If you don't, you can [sign up](https://www.getdbt.com/signup/) for a [free account](https://www.getdbt.com/pricing/). -- You must have a dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections). -- You must [create and schedule a dbt Cloud job](#create-and-schedule-jobs). -- You must have [access permission](/docs/cloud/manage-access/about-user-access) to view, create, modify, or run jobs. -- You must set up a [deployment environment](/docs/deploy/deploy-environments). - -## Create and schedule jobs {#create-and-schedule-jobs} - -1. Create a new job by clicking **Deploy** in the header, click **Jobs** and then **Create job**. -1. Provide a job name, for example "Hourly Customer Job". -1. Under **Environment**, add the following: - * **Environment** — Link to an existing deployment environment. - * **dbt Version** — Select the dbt [version](/docs/dbt-versions/core). dbt Labs recommends inheriting the version from the environment settings. - * **Target Name** — Define the [target name](/docs/build/custom-target-names) for any dbt cloud job to correspond to settings in your project. - * **Threads** — The default value is 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). 
Increase the thread count to increase model execution concurrency. - -1. Define [environment variables](/docs/build/environment-variables) if you want to customize the behavior of your project. - - - -5. Under **Execution Settings**, you can configure the fields needed to execute your job: - - * **Run Timeout** — Configure the number of seconds a run will execute before dbt Cloud cancels it. Setting this to 0 means it'll never time out runs for that job. - * **Defer to a previous run state** — Select a production job you want to defer to. This enables dbt Cloud to examine the artifacts from the most recent, successful run of that deferred job, enabling state comparison and rewiring of upstream dependencies to any model that doesn’t exist in the current run's schema.  - * **Generate docs on run** checkbox — Configure the job to automatically [generate project docs](/docs/collaborate/build-and-view-your-docs) each time this job runs. - * **Run on source freshness** checkbox — Configure [dbt source freshness](/docs/deploy/source-freshness) as the first step of the job without breaking subsequent steps. - * **Commands** — Add or remove [job commands](/docs/deploy/job-commands), which are specific tasks you set in your dbt Cloud jobs. - - - -6. Under the **Triggers** section, you can configure when and how dbt will trigger the job. Refer to [job triggers](/docs/deploy/job-triggers) for more details. - - * **Schedule** tab — Use the **Run on schedule** toggle to configure your job to run on [scheduled](/docs/deploy/job-triggers#schedule-days) days and time, or enter a [custom cron schedule](/docs/deploy/job-triggers#custom-cron-schedules). - * **Continuous Integration** tab — Configure [continuous integration (CI)](/docs/deploy/continuous-integration) to run when someone opens a new pull request in your dbt repository. - * **API** tab — Use the [dbt API](/docs/dbt-cloud-apis/overview) to trigger a job. - - - -7. Select **Save**, then click **Run Now** to run your job. Click the run and watch its progress under **Run history**. diff --git a/website/docs/docs/deploy/job-triggers.md b/website/docs/docs/deploy/job-triggers.md deleted file mode 100644 index cb7a1a48088..00000000000 --- a/website/docs/docs/deploy/job-triggers.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -title: "Job triggers" -id: "job-triggers" -description: "You can configure when and how dbt should run your job" ---- - -In dbt Cloud, you can use the options under **Triggers** to configure when and how dbt should [run your job](/docs/deploy/job-triggers#schedule-job): - -- **Schedule** tab — Use the **Run on schedule** toggle to configure your job to run on either [scheduled days](#schedule-days) or [custom cron-powered schedule](#custom-cron-schedule) -- **Continuous Integration (CI)** tab — Configure [continuous integration](/docs/deploy/continuous-integration) to run when someone opens a new pull request in your dbt repository -- **API** tab — Use the [API](/docs/dbt-cloud-apis/admin-cloud-api) to trigger a job or send events to other systems - - - -## Schedule jobs - -To schedule your job to run at specific days, times, and intervals: -1. Go to the specific job settings, click **Edit**, then go to the **Triggers** section -2. Go to the **Schedule** tab, and toggle **Run on schedule** -3. Use either the [scheduled days](#schedule-days) or the [custom cron-powered schedule](#custom-cron-schedule) method to customize your desired days, times, and intervals. 
- -### Schedule days - -To set your job's schedule, use the **Schedule Days** option to choose specific days of the week, and select customized hours or intervals. - -Under **Timing**, you can either use customizable hours for jobs that need to run frequently throughout the day or exact intervals for jobs that need to run at specific times: - -- **Every n hours** — Use this option to set how often your job runs, in hours. Enter a number between 1 and 23 to represent the interval between job runs. For example, if you set it to "every 2 hours", the job will run every 2 hours from midnight UTC. This option is useful if you need to run jobs multiple times per day at regular intervals. - -- **At exact intervals** — Use this option to set specific times when your job should run. You can enter a comma-separated list of hours (in UTC) when you want the job to run. For example, if you set it to `0,12,23,` the job will run at midnight, noon, and 11 PM UTC. This option is useful if you want your jobs to run at specific times of day and don't need them to run more frequently than once a day. - -:::info - -dbt Cloud uses [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) (UTC) and does not account for translations to your specific timezone or take into consideration daylight savings time. For example: - -- 0 means 12am (midnight) UTC -- 12 means 12pm (afternoon) UTC -- 23 means 11pm UTC - -::: - -### Custom cron schedule - -To fully customize the scheduling of your job, choose the **Custom cron schedule** option and use the "cron" syntax. With this syntax, you can specify the minute, hour, day of the month, month, and day of the week, allowing you to set up complex schedules like running a job on the first Monday of each month. - - - - -Use tools such as [crontab.guru](https://crontab.guru/) to generate the correct cron syntax. This tool allows you to input cron snippets and returns their plain English translations. - -Refer to the following example snippets: - - -- `0 * * * *`: Every hour, at minute 0 -- `*/5 * * * *`: Every 5 minutes -- `5 4 * * *`: At exactly 4:05 AM UTC -- `30 */4 * * *`: At minute 30 past every 4th hour (e.g. 4:30AM, 8:30AM, 12:30PM, etc., all UTC) -- `0 0 */2 * *`: At midnight UTC every other day -- `0 0 * * 1`: At midnight UTC every Monday. - - -## Related docs - -- [Artifacts](/docs/deploy/artifacts) -- [Build and view your docs with dbt Cloud](/docs/collaborate/build-and-view-your-docs) -- [Source freshness](/docs/deploy/source-freshness) -- [Job commands](/docs/deploy/job-commands) -- [Webhooks for your jobs](/docs/deploy/webhooks) \ No newline at end of file diff --git a/website/docs/docs/deploy/jobs.md b/website/docs/docs/deploy/jobs.md new file mode 100644 index 00000000000..e8ca864d65f --- /dev/null +++ b/website/docs/docs/deploy/jobs.md @@ -0,0 +1,21 @@ +--- +title: "Jobs in dbt Cloud" +sidebar_label: "Jobs" +description: "Learn about deploy jobs and continuous integration (CI) jobs in dbt Cloud and what their differences are." +tags: [scheduler] +--- + +In dbt Cloud, there are two types of jobs: +- [Deploy jobs](/docs/deploy/deploy-jobs) — To create and set up triggers for building production data assets +- [Continuous integration (CI) jobs](/docs/deploy/continuous-integration) — To create and set up triggers for checking code changes + +Below is a comparison table that describes how deploy jobs and CI jobs behave differently: + +| | Deploy Jobs | CI Jobs | +| --- | --- | --- | +| Purpose | Builds production data assets. 
| Builds and tests new code before merging changes into production. | +| Trigger types | Triggered by a schedule or by API. | Triggered by a commit to a PR or by API. | +| Destination | Builds into a production database and schema. | Builds into a staging database and ephemeral schema, lived for the lifetime of the PR. | +| Execution mode | Runs execute sequentially, so as to not have collisions on the underlying DAG. | Runs execute in parallel to promote team velocity. | +| Efficiency run savings | Detects over-scheduled jobs and cancels unnecessary runs to avoid queue clog. | Cancels existing runs when a newer commit is pushed to avoid redundant work. | +| State comparison | Only sometimes needs to detect state. | Almost always needs to compare state against the production environment to build on modified code and its dependents. | \ No newline at end of file diff --git a/website/docs/docs/deploy/slim-ci-jobs.md b/website/docs/docs/deploy/slim-ci-jobs.md deleted file mode 100644 index 35fa3eff46c..00000000000 --- a/website/docs/docs/deploy/slim-ci-jobs.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -title: "Slim CI jobs in dbt Cloud" -sidebar_label: "Slim CI jobs" -description: "Learn how to create and set up Slim CI checks to test code changes before deploying to production." ---- - -You can set up Slim [continuous integration](/docs/deploy/continuous-integration) (CI) jobs to run when someone opens a new pull request in your dbt repository. By running and testing only _modified_ models — which is what _slim_ refers to — dbt Cloud ensures these jobs are as efficient and resource conscientious as possible on your data platform. - -## Prerequisites - -- You have a dbt Cloud account. - - For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). -- You must be connected using dbt Cloud’s native integration with [GitHub account](/docs/cloud/git/connect-github), [GitLab account](/docs/cloud/git/connect-gitlab), or [Azure DevOps account](/docs/cloud/git/connect-azure-devops). - - If you’re using GitLab, you must use a paid or self-hosted account which includes support for GitLab webhooks. - - If you previously configured your dbt project by providing a generic git URL that clones using SSH, you must reconfigure the project to connect through dbt Cloud's native integration. - -## Set up Slim CI jobs - -dbt Labs recommends that you create your Slim CI job in a dedicated dbt Cloud [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. Having a separate environment dedicated for CI will provide better isolation between your temporary CI schemas builds and your production data builds. Additionally, sometimes teams need their Slim CI jobs to be triggered when a PR is made to a branch other than main. If your team maintains a staging branch in your release process, having a separate environment will allow you to set a [custom branch](/faqs/environments/custom-branch-settings), and accordingly the CI job in that dedicated environment will be triggered only when PRs are made to the specified, custom branch. - -1. On your deployment environment page, click **Create One** to create a new CI job. -2. 
In the **Execution Settings** section: - - For the option **Defer to a previous run state**, choose whichever production job that's set to run often. If you don't see any jobs to select from the dropdown, you first need to run a production job successfully. Deferral tells dbt Cloud to compare the manifest of the current CI job against the project representation that was materialized the last time the deferred job was run successfully. By setting this option, dbt Cloud only checks the modified code and compares the changes against what’s running in production, instead of building the full table or the entire DAG. - - - - - For the option **Commands**, enter `dbt build --select state:modified+` in the field. This informs dbt Cloud to build only new or changed models and their downstream dependents. Importantly, state comparison can only happen when there is a deferred job selected to compare state to. - - -3. In the **Triggers** section, choose the **Continuous Integration** (CI) tab. Then, enable the **Run on Pull Requests** option. This configures pull requests and new commits to be a trigger for the Slim CI job. - - -## Example pull requests - -The green checkmark means the dbt build and tests were successful. Clicking on the dbt Cloud section navigates you to the relevant CI run in dbt Cloud. - -### GitHub pull request example - - - -### GitLab pull request example - - - -### Azure DevOps pull request example - - - - -## Troubleshooting - -If you're experiencing any issues, review some of the common questions and answers below. - -
    - Reconnecting your dbt project to use dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps -
    -
    If your dbt project relies the generic git clone method that clones using SSH and deploy keys to connect to your dbt repo, you need to disconnect your repo and reconnect it using the native GitHub, GitLab, or Azure DevOps integration in order to enable dbt Cloud Slim CI.



    - First, make sure you have the native GitHub authentication, native GitLab authentication, or native Azure DevOps authentication set up depending on which git provider you use. After you have gone through those steps, go to Account Settings, select Projects and click on the project you'd like to reconnect through native GitHub, GitLab, or Azure DevOps auth. Then click on the repository link.



    - - Once you're in the repository page, select Edit and then Disconnect Repository at the bottom.

    - -

    - Confirm that you'd like to disconnect your repository. You should then see a new Configure a repository link in your old repository's place. Click through to the configuration page:

    - -

    - Select the GitHub, GitLab, or AzureDevOps tab and reselect your repository. That should complete the setup of the project and enable you to set up a dbt Cloud CI job.
    -
    -
    -
    - Error messages that refer to schemas from previous PRs -
    -
    If you receive a schema-related error message referencing a previous PR, this is usually an indicator that you are not using a production job for your deferral and are instead using self. If the prior PR has already been merged, the prior PR's schema may have been dropped by the time the Slim CI job for the current PR is kicked off.



    - - To fix this issue, select a production job run to defer to instead of self. -
    -
    -
    -
    - Production job runs failing at the Clone Git Repository step -
    -
    dbt Cloud can only checkout commits that belong to the original repository. dbt Cloud _cannot_ checkout commits that belong to a fork of that repository.



    - - If you receive the following error message at the Clone Git Repository step of your job run:

    - - Error message:

    - Cloning into '/tmp/jobs/123456/target'...

    - Successfully cloned repository.

    - Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...

    - Failed to checkout to specified revision.

    - git checkout e845be54e6dc72342d5a8f814c8b3316ee220312

    - fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312

    -




    - - Double-check that your PR isn't trying to merge using a commit that belongs to a fork of the repository attached to your dbt project.
    -
    -
    -
    - CI job not triggering for Virtual Private dbt users -
    -
    To trigger jobs on dbt Cloud using the API, your Git provider needs to connect to your dbt Cloud account.



    - - If you're on a Virtual Private dbt Enterprise plan using security features like ingress PrivateLink or IP Allowlisting, registering CI hooks may not be available and can cause the job to fail silently.
    -
    -
    - -### Temp PR schema limitations - -If your temporary pull request schemas aren't dropping after a merge or close of the PR, it's likely due to the below scenarios. Open and review the toggles below for recommendations on how to resolve this: - -
    - You used dbt Cloud environment variables in your connection settings page -
    -
    To resolve this, remove environment variables in your connections settings.
    -
    -
    -
    - You have an empty/blank default schema -
    -
    To change this, edit and fill in your default schema.
    -
    -
    -
    - You have overridden the generate_schema_name macro -
    -
    To resolve this, change your macro so that the temporary PR schema name contains the default prefix and review the guidance below: -

    - • ✅ Temporary PR schema name contains the prefix dbt_cloud_pr_ (like dbt_cloud_pr_123_456_marketing)

    - • ❌ Temporary PR schema name doesn't contain the prefix dbt_cloud_pr_ (like marketing).

    -
    -
    -
    -
    - You have overridden the generate_database_name macro -
    -
    If you assume that the project's default connection is to a database named analytics, review the guidance below to resolve this: -

    - • ✅ Database remains the same as the connection default (like analytics)

    - • ❌ Database has changed from the default connection (like dev).

    -
    -
    -
    diff --git a/website/docs/docs/deploy/webhooks.md b/website/docs/docs/deploy/webhooks.md index 4ce089daa89..069e7a3e283 100644 --- a/website/docs/docs/deploy/webhooks.md +++ b/website/docs/docs/deploy/webhooks.md @@ -18,7 +18,7 @@ You can create webhooks for these events from the [dbt Cloud web-based UI](#crea dbt Cloud retries sending each event five times. dbt Cloud keeps a log of each webhook delivery for 30 days. Every webhook has its own **Recent Deliveries** section, which lists whether a delivery was successful or failed at a glance. -A webhook in dbt Cloud has a timeout of 10 seconds. This means that if the endpoint doesn't respond within 10 seconds, the webhook processor will time out. +A webhook in dbt Cloud has a timeout of 10 seconds. This means that if the endpoint doesn't respond within 10 seconds, the webhook processor will time out. This can result in a situation where the client responds successfully after the 10 second timeout and records a success status while the dbt cloud webhooks system will interpret this as a failure. :::tip Videos If you're interested in course learning with videos, check out the [Webhooks on-demand course](https://courses.getdbt.com/courses/webhooks) from dbt Labs. @@ -167,7 +167,7 @@ An example of a webhook payload for an errored run: You can use the dbt Cloud API to create new webhooks that you want to subscribe to, get detailed information about your webhooks, and to manage the webhooks that are associated with your account. The following sections describe the API endpoints you can use for this. :::info Access URLs -dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. People on Enterprise plans can choose to have their account hosted in any one of these regions. This section uses `cloud.getdbt.com` (which is for North America) as part of the endpoint but your access URL might be different. For a complete list of available dbt Cloud access URLs, refer to [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. People on Enterprise plans can choose to have their account hosted in any one of these regions. For a complete list of available dbt Cloud access URLs, refer to [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). ::: ### List all webhook subscriptions @@ -175,12 +175,13 @@ List all webhooks that are available from a specific dbt Cloud account. #### Request ```shell -GET https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscriptions +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscriptions ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhooks are associated with. | #### Response sample @@ -265,11 +266,12 @@ Get detailed information about a specific webhook. #### Request ```shell -GET https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. 
| | `webhook_id` | The webhook you want detailed information on. | @@ -322,7 +324,7 @@ Create a new outbound webhook and specify the endpoint URL that will be subscrib #### Request sample ```shell -POST https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscriptions +POST https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscriptions ``` ```json @@ -344,6 +346,7 @@ POST https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription #### Path parameters | Name | Description | | --- | --- | +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | #### Request parameters @@ -407,7 +410,7 @@ Update the configuration details for a specific webhook. #### Request sample ```shell -PUT https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +PUT https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} ``` ```json @@ -429,6 +432,7 @@ PUT https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/ #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want to update. | @@ -491,12 +495,13 @@ Test a specific webhook. #### Request ```shell -GET https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id}/test +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id}/test ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want to test. | @@ -518,12 +523,13 @@ Delete a specific webhook. #### Request ```shell -DELETE https://cloud.getdbt.com/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +DELETE https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} ``` #### Path parameters | Name | Description | |------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | | `account_id` | The dbt Cloud account the webhook is associated with. | | `webhook_id` | The webhook you want to delete. | diff --git a/website/docs/docs/supported-data-platforms.md b/website/docs/docs/supported-data-platforms.md index be6c454d746..2d9197ddf22 100644 --- a/website/docs/docs/supported-data-platforms.md +++ b/website/docs/docs/supported-data-platforms.md @@ -8,73 +8,35 @@ pagination_next: "docs/connect-adapters" pagination_prev: null --- -dbt connects to and runs SQL against your database, warehouse, lake, or query engine. These SQL-speaking platforms are collectively referred to as _data platforms_. dbt connects with data platforms by using a dedicated adapter plugin for each. Plugins are built as Python modules that dbt Core discovers if they are installed on your system. Read [What are Adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) for more info. +dbt connects to and runs SQL against your database, warehouse, lake, or query engine. These SQL-speaking platforms are collectively referred to as _data platforms_. dbt connects with data platforms by using a dedicated adapter plugin for each. 
Plugins are built as Python modules that dbt Core discovers if they are installed on your system. Read [What are Adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) for more info. -You can [connect](/docs/connect-adapters) to adapters and data platforms either directly in the dbt Cloud user interface (UI) or install them manually using the command line (CLI). There are two types of adapters available and to evaluate quality and maintenance, we recommend you consider their verification status. You can also [further configure](/reference/resource-configs/postgres-configs) your specific data platform to optimize performance. +You can [connect](/docs/connect-adapters) to adapters and data platforms either directly in the dbt Cloud user interface (UI) or install them manually using the command line (CLI). -- **Verified** — dbt Labs' strict [adapter program](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter) assures users of trustworthy, tested, and regularly updated adapters for production use. Verified adapters earn a "Verified" status, providing users with trust and confidence. -- **Community** — [Community adapters](/docs/community-adapters) are open-source and maintained by community members. +You can also further customize how dbt works with your specific data platform via configuration: see [Configuring Postgres](/reference/resource-configs/postgres-configs) for an example. + +## Types of Adapters + +There are three types of adapters available today: + +- **Verified** — [Verified adapters](verified-adapters) are those that have completed a rigorous verification process in collaboration with dbt Labs. +- **Trusted** — [Trusted adapters](trusted-adapters) are those where the adapter maintainers have agreed to meet a higher standard of quality. +- **Community** — [Community adapters](community-adapters) are open-source and maintained by community members. ### Verified adapters The following are **Verified adapters** ✓ you can connect to either in dbt Cloud or dbt Core: -
    - - - - - - - - - - - - - - - - - - - - - -
    - -
    -* Install these adapters using the CLI as they're not currently supported in dbt Cloud.
    +import AdaptersVerified from '/snippets/_adapters-verified.md'; + + + +### Trusted adapters + +The following are **Trusted adapters** ✓ you can connect to in dbt Core: + +import AdaptersTrusted from '/snippets/_adapters-trusted.md'; + + + +
    * Install these adapters using the CLI as they're not currently supported in dbt Cloud.
+ diff --git a/website/docs/docs/trusted-adapters.md new file mode 100644 index 00000000000..e19bb40785f --- /dev/null +++ b/website/docs/docs/trusted-adapters.md @@ -0,0 +1,41 @@ +--- +title: "Trusted adapters" +id: "trusted-adapters" +hide_table_of_contents: true +--- + +Trusted adapters are adapters that are not maintained by dbt Labs but that we feel comfortable recommending to users for use in production. + +Free and open-source tools for the data professional are increasingly abundant. This is by and large a *good thing*; however, it requires due diligence that wasn't required in a paid-license, closed-source software world. As a user, there are important questions to answer before taking a dependency on an open-source project. The trusted adapter designation is meant to streamline this process for end users. + 
Considerations for depending on an open-source project + +1. Does it work? +2. Does anyone "own" the code, or is anyone liable for ensuring it works? +3. Do bugs get fixed quickly? +4. Does it stay up-to-date with new Core features? +5. Is the usage substantial enough to self-sustain? +6. What risks do I take on by taking a dependency on this library? + 
    + +### Trusted adapter specifications + +See [Building a Trusted Adapter](/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter) for more information, particularly if you are an adapter maintainer considering having your adapter be added to the trusted list. + +### Trusted vs Verified + +The Verification program exists to highlight adapters that meets both of the following criteria: + +- the guidelines given in the Trusted program, +- formal agreements required for integration with dbt Cloud + +For more information on the Verified Adapter program, reach out the [dbt Labs partnerships team](mailto:partnerships@dbtlabs.com) + +### Trusted adapters + +The following are **Trusted adapters** ✓ you can connect to in dbt Core: + +import AdaptersTrusted from '/snippets/_adapters-trusted.md'; + + diff --git a/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md b/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md index 8c004d865bb..b084dedc305 100644 --- a/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md +++ b/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md @@ -1,17 +1,47 @@ --- title: "Available integrations" id: avail-sl-integrations -description: "Review a wide range of partners you can integrate and query with the dbt Semantic Layer." +description: "Discover the diverse range of partners that seamlessly integrate with the powerful dbt Semantic Layer, allowing you to query and unlock valuable insights from your data ecosystem." +tags: [Semantic Layer] sidebar_label: "Available integrations" +meta: + api_name: dbt Semantic Layer APIs --- -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. + -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. +import NewSLChanges from '/snippets/_new-sl-changes.md'; -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. -::: + + + +There are a number of data applications that seamlessly integrate with the dbt Semantic Layer, powered by MetricFlow, from business intelligence tools to notebooks, spreadsheets, data catalogs, and more. These integrations allow you to query and unlock valuable insights from your data ecosystem. + +Use the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) to simplify metric queries, optimize your development workflow, and reduce coding. This approach also ensures data governance and consistency for data consumers. + + + + +import AvailIntegrations from '/snippets/_sl-partner-links.md'; + + + +### Custom integration + +You can create custom integrations using different languages and tools. We support connecting with JDBC, ADBC, and a GraphQL API. For more info, check out [our examples on GitHub](https://github.com/dbt-labs/example-semantic-layer-clients/). + +## Related docs + +- {frontMatter.meta.api_name} to learn how to integrate with JDBC and GraphQL to query your metrics in downstream tools. 
+- [dbt Semantic Layer APIs query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) + + + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + A wide variety of data applications across the modern data stack natively integrate with the dbt Semantic Layer and dbt metrics — from Business Intelligence tools to notebooks, data catalogs, and more. @@ -19,13 +49,10 @@ The dbt Semantic Layer integrations are capable of querying dbt metrics, importi For information on the partner integrations, their documentation, and more — refer to the [dbt Semantic Layer integrations](https://www.getdbt.com/product/semantic-layer-integrations) page. - - + ## Related docs -- [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer) to learn more about the dbt Semantic Layer. -- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more information on plan availability. -- [Public Preview information](/docs/use-dbt-semantic-layer/quickstart-semantic-layer#public-preview) to understand what Public Preview for the dbt Semantic Layer means. -- [dbt Semantic Layer partner integration guide](/guides/dbt-ecosystem/sl-partner-integration-guide) for information about partner integration guidelines, product roadmap, and API connectivity. -- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) to understand best practices for designing and structuring metrics in your dbt project. +- [About the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) + + diff --git a/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md deleted file mode 100644 index 95962610f8d..00000000000 --- a/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md +++ /dev/null @@ -1,146 +0,0 @@ ---- -title: "dbt Semantic Layer" -id: dbt-semantic-layer -description: "Introducing the dbt Semantic Layer" -sidebar_label: "dbt Semantic Layer" ---- - -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. - -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. - -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. -::: - -The dbt Semantic Layer allows data teams to centrally define essential business metrics like `revenue`, `customer`, and `churn` in the modeling layer (your dbt project) for consistent self-service within downstream data tools like BI and metadata management solutions. The dbt Semantic Layer provides the flexibility to define metrics on top of your existing models and then query those metrics and models in your analysis tools of choice. - -The result? You have less duplicate coding for data teams and more consistency for data consumers. 
- -The dbt Semantic Layer has four main parts: - -- Define your metrics in version-controlled dbt project code using MetricFlow -- Import your metric definitions via the [Discovery API](/docs/dbt-cloud-apis/discovery-api) -- Query your metric data via the dbt Proxy Server -- Explore and analyze dbt metrics in downstream tools - - - - -### What makes the dbt Semantic Layer different? - -The dbt Semantic Layer reduces code duplication and inconsistency regarding your business metrics. By moving metric definitions out of the BI layer and into the modeling layer, data teams can feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. You can also use the dbt Semantic Layer to query models and use macros. - - -## Prerequisites -To use the dbt Semantic Layer, you’ll need to meet the following: - - - - - -## Public Preview - -The dbt Semantic Layer is currently available for Public Preview, which means: - -— **Who?** The dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise) during Public Preview. Review [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more info on plan availability. - -- Team and Enterprise accounts will be able to set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated -partner tool to import metric definition. -- Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse dbt metrics in external tools, which requires access to the Discovery API. - -— **What?** Public Previews provide early access to new features. The Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality that is not backwards compatible. dbt Labs provides support, and relevant service level objectives (SLOs) apply. We will introduce pricing for the dbt Semantic Layer alongside the General Available (GA) release (future GA date to be announced). - -— **When?** Public Preview will end once the dbt Semantic Layer is available for GA. After GA, the dbt Semantic Layer will only be available to dbt Cloud **Team** and **Enterprise** plans. - -— **Where?** Public Preview is enabled at the account level so you don’t need to worry about enabling it per user. - -## Product architecture - -The dbt Semantic Layer product architecture includes four primary components: - -| Components | Information | Developer plans | Team plans | Enterprise plans | License | -| --- | --- | :---: | :---: | :---: | --- | -| **[dbt project](/docs/build/metrics)** | Define models and metrics in dbt Core. | ✅ | ✅ | ✅ | Open source, Core | -| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | -| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅

    _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | -| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise | - - - -dbt Semantic Layer integrations will: - -- Leverage the Discovery API to fetch a list of objects and their attributes, like metrics -- Generate a dbt-SQL statement -- Then query the SQL proxy to evaluate the results of this statement - - -## Manage metrics - -:::info 📌 - -New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. - -::: - -If you're not sure whether to define a metric in dbt or not, ask yourself the following: - -> *Is this something our teams consistently need to report on?* - -An important business metric should be: - -- Well-defined (the definition is agreed upon throughout the entire organization) -- Time-bound (able to be compared across time) - -A great example of this is **revenue** — it can be aggregated on multiple levels (weekly, monthly, etc) and is key for the broader business to understand. - -- ✅ `Monthly recurring revenue` or `Weekly active users` or `Average order value` -- ❌ `1-off experimental metric` - - -### Design and define metrics - -**Design metrics** -To read about best practices on structuring and organizing your metrics, review our [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog post first. - -**Define metrics** -You can define your metrics in `.yml` files nested under a metrics key and to design or define your own metrics in your dbt project, review the following documents:
    - -- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog to understand best practices for designing and structuring metrics in your dbt project -- [dbt metrics](docs/build/metrics) for in-depth detail on attributes, filters, how to define and query your metrics and [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics) -- [dbt Semantic Layer quickstart](/docs/use-dbt-semantic-layer/quickstart-semantic-layer) to get started -- [Understanding the components of the dbt Semantic Layer](https://docs.getdbt.com/blog/understanding-the-components-of-the-dbt-semantic-layer) blog post to see further examples - -Review our helpful metrics video below, which explains what metrics are, why they're important and how you can get started: - - - -## Related questions - -
    - How are you storing my data? -
    -
    The dbt Semantic Layer does not store, or cache, or log your data. On each query to the Semantic Layer, the resulting data passes through dbt Cloud servers where it is never stored, cached, or logged. The data from your data platform gets routed through dbt Cloud servers, to your connecting data tool.
    -
    -
    -
    - Is the dbt Semantic Layer open source? -
    -
    Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    - -During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise).



    - -
      -
    • dbt Core users can define metrics in their dbt Core projects and calculate them using macros from the metrics package. To use the dbt Semantic Layer integrations, users will need to have a dbt Cloud account.


    • -
    • Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API.


    • -
    • Team and Enterprise accounts will be able to set up the Semantic Layer and Discovery API in the integrated partner tool to import metric definition.
    • -
    -
    -
    - Is there a dbt Semantic Layer discussion hub? -
    -
    Yes absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer slack channel for all things related to the dbt Semantic Layer. -
    -
    -
    -

    diff --git a/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md b/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md new file mode 100644 index 00000000000..76753b41ffa --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md @@ -0,0 +1,163 @@ +--- +title: "dbt Semantic Layer" +id: dbt-sl +description: "Learn how the dbt Semantic Layer enables data teams to centrally define and query metrics." +sidebar_label: "About the dbt Semantic Layer" +tags: [Semantic Layer] +hide_table_of_contents: true +--- + + + +import NewSLChanges from '/snippets/_new-sl-changes.md'; + + + + +The dbt Semantic Layer, powered by [MetricFlow](/docs/build/about-metricflow), simplifies the process of defining and using critical business metrics, like `revenue` in the modeling layer (your dbt project). By centralizing metric definitions, data teams can ensure consistent self-service access to these metrics in downstream data tools and applications. The dbt Semantic Layer eliminates duplicate coding by allowing data teams to define metrics on top of existing models and automatically handles data joins. + +Moving metric definitions out of the BI layer and into the modeling layer allows data teams to feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. + +Refer to the [Why we need a universal semantic layer](https://www.getdbt.com/blog/universal-semantic-layer/) blog post to learn more. + +## Explore the dbt Semantic Layer + + +import Features from '/snippets/_sl-plan-info.md' + + + +
    + + + + + + + + + + + +
    + +
    + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + +The dbt Semantic Layer allows your data teams to centrally define essential business metrics like `revenue`, `customer`, and `churn` in the modeling layer (your dbt project) for consistent self-service within downstream data tools like BI and metadata management solutions. The dbt Semantic Layer provides the flexibility to define metrics on top of your existing models and then query those metrics and models in your analysis tools of choice. + +Resulting in less duplicate coding for data teams and more consistency for data consumers. + +The dbt Semantic Layer has these main parts: + +- Define your metrics in version-controlled dbt project code using [MetricFlow](/docs/build/about-metricflow) + * dbt_metrics is now deprecated +- Import your metric definitions using the [Discovery API](/docs/dbt-cloud-apis/discovery-api) +- Query your metric data with the dbt Proxy Server +- Explore and analyze dbt metrics in downstream tools + +### What makes the dbt Semantic Layer different? + +The dbt Semantic Layer reduces code duplication and inconsistency regarding your business metrics. By moving metric definitions out of the BI layer and into the modeling layer, your data teams can feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. You can also use the dbt Semantic Layer to query models and use macros. + + +## Prerequisites + + + + + + +## Manage metrics + +:::info 📌 + +New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. + +::: + +If you're not sure whether to define a metric in dbt or not, ask yourself the following: + +> *Is this something our teams consistently need to report on?* + +An important business metric should be: + +- Well-defined (the definition is agreed upon throughout the entire organization) +- Time-bound (able to be compared across time) + +A great example of this is **revenue**. It can be aggregated on multiple levels (weekly, monthly, and so on) and is key for the broader business to understand. + +- ✅ `Monthly recurring revenue` or `Weekly active users` or `Average order value` +- ❌ `1-off experimental metric` + + +### Design and define metrics + +You can design and define your metrics in `.yml` files nested under a metrics key in your dbt project. For more information, refer to these docs:
+ +- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, filters, how to define and query your metrics, and the [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics) +- [dbt Semantic Layer quickstart](/docs/use-dbt-semantic-layer/quickstart-semantic-layer) to get started + +## Related questions + +
    + How do I migrate from the legacy Semantic Layer to the new one? +
    +
    If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
    +
    +
    + +
    + How are you storing my data? +
    +
    The dbt Semantic Layer doesn't store, cache, or log your data. On each query to the Semantic Layer, the resulting data passes through dbt Cloud servers where it's never stored, cached, or logged. The data from your data platform gets routed through dbt Cloud servers to your connecting data tool.
    +
    +
    +
    + Is the dbt Semantic Layer open source? +
    +
    Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL-licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    + +During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers — Developer, Team, and Enterprise.



    + +
    +
    +
    + Is there a dbt Semantic Layer discussion hub? +
    +
    Yes, absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer slack channel for all things related to the dbt Semantic Layer. +
    +
    +
    +

    +
    diff --git a/website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md similarity index 58% rename from website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md rename to website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md index af8de189fa9..3bbc11cea3f 100644 --- a/website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md +++ b/website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md @@ -1,45 +1,148 @@ --- -title: "Quickstart" -id: quickstart-semantic-layer -description: "Define metrics and set up the dbt Semantic Layer" -sidebar_label: "Quickstart" +title: "Get started with the dbt Semantic Layer" +id: quickstart-sl +description: "Use this guide to build and define metrics, set up the dbt Semantic Layer, and query them using the Semantic Layer APIs." +sidebar_label: "Get started with the dbt Semantic Layer" +tags: [Semantic Layer] +meta: + api_name: dbt Semantic Layer APIs --- -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. + -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. +import NewSLChanges from '/snippets/_new-sl-changes.md'; +import InstallMetricFlow from '/snippets/_sl-install-metricflow.md'; +import CreateModel from '/snippets/_sl-create-semanticmodel.md'; +import DefineMetrics from '/snippets/_sl-define-metrics.md'; +import ConfigMetric from '/snippets/_sl-configure-metricflow.md'; +import TestQuery from '/snippets/_sl-test-and-query-metrics.md'; -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. + + + + +The dbt Semantic Layer, powered by [MetricFlow](/docs/build/about-metricflow), simplifies defining and using critical business metrics. It centralizes metric definitions, eliminates duplicate coding, and ensures consistent self-service access to metrics in downstream tools. + +MetricFlow, a powerful component of the dbt Semantic Layer, simplifies the creation and management of company metrics. It offers flexible abstractions, SQL query generation, and enables fast retrieval of metric datasets from a data platform. + +Use this guide to fully experience the power of the universal dbt Semantic Layer. Here are the following steps you'll take: + +- [Create a semantic model](#create-a-semantic-model) in dbt Cloud using MetricFlow +- [Define metrics](#define-metrics) in dbt Cloud using MetricFlow +- [Test and query metrics locally](#test-and-query-metrics) using MetricFlow +- [Run a production job](#run-a-production-job) in dbt Cloud +- [Set up dbt Semantic Layer](#setup) in dbt Cloud +- [Connect and query API](#connect-and-query-api) with dbt Cloud + + +MetricFlow allows users to define metrics in their dbt project whether in dbt Cloud or in dbt Core. dbt Core users can use the [MetricFlow CLI](/docs/build/metricflow-cli) to define metrics in their local dbt Core project. 
+ +However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. +## Prerequisites + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +:::tip +New to dbt or metrics? Try our [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) to help you get started! ::: ## Create a semantic model + + + +## Define metrics + + + +## Test and query metrics + + + +## Run a production job + +Once you’ve defined metrics in your dbt project, you can perform a job run in your deployment environment in dbt Cloud to materialize your metrics. At the moment, the dbt Semantic Layer supports only the deployment environment. + +1. Go to **Deploy** in the navigation header. +2. Select **Jobs** to re-run the job with the most recent code in the deployment environment. +3. Your metric should appear as a red node in the dbt Cloud IDE and dbt directed acyclic graphs (DAG). + + + +
+ +What’s happening internally? +- Merging the code into your main branch allows dbt Cloud to pull those changes and build the definition in the manifest produced by the run.
    +- Re-running the job in the deployment environment helps materialize the models, which the metrics depend on, in the data platform. It also makes sure that the manifest is up to date.
+- The Semantic Layer APIs pull in the most recent manifest and allow your integration to extract metadata from it.
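If you prefer to script this step, the following is a minimal sketch that kicks off the same deploy job through the dbt Cloud Administrative API instead of rerunning it from the UI. The account ID, job ID, host, and token handling shown here are placeholder assumptions rather than values from this guide, and the `cause` text is arbitrary.

```python
# Hedged sketch: trigger an existing dbt Cloud deploy job over the Administrative API.
# ACCOUNT_ID, JOB_ID, and the host are placeholders; the API token is read from an
# environment variable rather than hard-coded.
import os

import requests

ACCOUNT_ID = 1234   # placeholder dbt Cloud account ID
JOB_ID = 5678       # placeholder deploy job ID
API_TOKEN = os.environ["DBT_CLOUD_API_TOKEN"]

response = requests.post(
    f"https://cloud.getdbt.com/api/v2/accounts/{ACCOUNT_ID}/jobs/{JOB_ID}/run/",
    headers={"Authorization": f"Token {API_TOKEN}"},
    json={"cause": "Materialize semantic models and metrics"},
)
response.raise_for_status()
run_id = response.json()["data"]["id"]  # ID of the queued run, useful for polling its status
print(run_id)
```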
    + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + + +## Connect and query API -— **Who?** The dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise) during Public Preview. Review [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more info on plan availability. +You can query your metrics in a JDBC-enabled tool or use existing first-class integrations with the dbt Semantic Layer. -- Team and Enterprise accounts will be able to set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated -partner tool to import metric definition. -- Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse dbt metrics in external tools, which requires access to the Discovery API. +You must have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment, hosted in North America (Additional region support coming soon). -— **What?** Public Previews provide early access to new features. The Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality that is not backwards compatible. dbt Labs provides support, and relevant service level objectives (SLOs) apply. We will introduce pricing for the dbt Semantic Layer alongside the General Available (GA) release (future GA date to be announced). +- To learn how to use the JDBC or GraphQL API and what tools you can query it with, refer to the {frontMatter.meta.api_name}.
+ + * To authenticate, you need to [generate a service token](/docs/dbt-cloud-apis/service-tokens) with Semantic Layer Only and Metadata Only permissions. + * Refer to the [SQL query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the APIs. + +- To learn more about the sophisticated integrations that connect to the dbt Semantic Layer, refer to [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations). + + +## FAQs + +If you're encountering issues when defining your metrics or setting up the dbt Semantic Layer, review the following answers to common questions and problems.
    + How do I migrate from the legacy Semantic Layer to the new one? +
    +
    If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
    +
    +
    +
    +How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
    +
+Is the dbt Semantic Layer open source? +The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, like dbt-core and MetricFlow.

dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE or the MetricFlow CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.
    +

-— **Where?** Public Preview is enabled at the account level so you don’t need to worry about enabling it per user. +## Next steps + +- [Build your metrics](/docs/build/build-metrics-intro) +- [Set up dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) + +
    + -## Introduction +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + To try out the features of the dbt Semantic Layer, you first need to have a dbt project set up. This quickstart guide will lay out the following steps, and recommends a workflow that demonstrates some of its essential features: -- Install dbt metrics package +- Install dbt metrics package + * Note: this package will be deprecated very soon and we highly recommend you to use the new [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl?version=1.6), available in dbt v 1.6 or higher. - Define metrics - Query, and run metrics - Configure the dbt Semantic Layer ## Prerequisites + To use the dbt Semantic Layer, you’ll need to meet the following: @@ -54,8 +157,8 @@ New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build ::: ## Installing dbt metrics package -The dbt Semantic Layer supports the calculation of metrics by using the [dbt metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/). You can install the dbt metrics package in your dbt project by copying the below code blocks. +The dbt Semantic Layer supports the calculation of metrics by using the [dbt metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/). You can install the dbt metrics package in your dbt project by copying the below code blocks. @@ -77,16 +180,6 @@ packages: - - -```yml -packages: - - package: dbt-labs/metrics - version: [">=0.2.0", "<0.3.0"] -``` - - - 1. Paste the dbt metrics package code in your `packages.yml` file. 2. Run the [`dbt deps` command](/reference/commands/deps) to install the package. @@ -101,11 +194,6 @@ Review our helpful metrics video below, which explains what metrics are, why the -### Design metrics - -To read about best practices on structuring and organizing your metrics, review our [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog post first. - -### Define metrics Now that you've organized your metrics folder and files, you can define your metrics in `.yml` files nested under a `metrics` key. 1. Add the metric definitions found in the [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example to your dbt project. For example, to add an expenses metric, reference the following metrics you can define directly in your metrics folder: @@ -176,9 +264,7 @@ metrics: 2. Commit and merge the code changes that contain the metric definitions. 3. If you'd like to further design and define your own metrics, review the following documentation: - - [dbt metrics](/docs/build/metrics) will povide you in-depth detail on attributes, properties, filters, and how to define and query metrics. - - - Review [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog to understand best practices for designing and structuring metrics in your dbt project. + - [dbt metrics](/docs/build/metrics) will provide you in-depth detail on attributes, properties, filters, and how to define and query metrics. ## Develop and query metrics @@ -226,7 +312,7 @@ If you're encountering some issues when defining your metrics or setting up the
    Is the dbt Semantic Layer open source?
    -
    Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



    +
Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL-licensed dbt-server. The dbt Proxy Server (which actually compiles the dbt code) and the Discovery API are not open source.



    During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise).



      @@ -295,7 +381,7 @@ The reason you're experiencing this error is because we changed the type diff --git a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md deleted file mode 100644 index 9d0c1eee752..00000000000 --- a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "Set up the dbt Semantic Layer" -id: setup-dbt-semantic-layer -description: "You can set up the dbt Semantic Layer in dbt Cloud." -sidebar_label: "Set up the dbt Semantic Layer" ---- - -:::info Coming soon -The dbt Semantic Layer is undergoing a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. - -**What’s changing?** The dbt_metrics package will be [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new way framework for defining metrics in dbt. - -**What's new?** Learn how to [Build your metrics](/docs/build/build-metrics-intro?version=1.6) using MetricFlow, one of the key components that makes up the revamped dbt Semantic Layer. It handles SQL query construction and defines the specification for dbt semantic models and metrics. -::: - -With the dbt Semantic Layer, you'll be able to centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. - -## Prerequisites - -Before you set up the dbt Semantic Layer, make sure you meet the following: - - - - - - - -## Set up dbt Semantic Layer - - - -
      - - -## Related docs - -- [Integrated partner tools](https://www.getdbt.com/product/semantic-layer-integrations) for info on the different integration partners and their documentation -- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) page for more information on plan availability -- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, properties, filters, and how to define and query metrics -- [dbt Server repo](https://github.com/dbt-labs/dbt-server), which is a persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations diff --git a/website/docs/docs/use-dbt-semantic-layer/setup-sl.md b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md new file mode 100644 index 00000000000..a2395d367e7 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md @@ -0,0 +1,102 @@ +--- +title: "Set up the dbt Semantic Layer" +id: setup-sl +description: "Seamlessly set up the dbt Semantic Layer in dbt Cloud using intuitive navigation." +sidebar_label: "Set up your Semantic Layer" +tags: [Semantic Layer] +--- + + + +import NewSLChanges from '/snippets/_new-sl-changes.md'; + + + +With the dbt Semantic Layer, you can centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. + +## Prerequisites + + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + + + + + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + +With the dbt Semantic Layer, you can define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. + +## Prerequisites + + + + +## Set up dbt Semantic Layer + +:::tip +If you're using the legacy Semantic Layer, dbt Labs strongly recommends that you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the latest dbt Semantic Layer. Refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. + +::: + + * Team and Enterprise accounts can set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated partner tool to import metric definitions. + * Developer accounts can query the Proxy Server using SQL but won't be able to browse dbt metrics in external tools, which requires access to the Discovery API. + + +1. Log in to your dbt Cloud account. +2. Go to **Account Settings**, and then **Service Tokens** to create a new [service account API token](/docs/dbt-cloud-apis/service-tokens). Save your token somewhere safe. +3. Assign permissions to service account tokens depending on the integration tool you choose. Refer to the [integration partner documentation](https://www.getdbt.com/product/semantic-layer-integrations) to determine the permission sets you need to assign. +4. Go to **Deploy** > **Environments**, and select your **Deployment** environment. +5. Click **Settings** on the top right side of the page. +6. Click **Edit** on the top right side of the page. +7. Select dbt version 1.2 or higher. +8. Toggle the Semantic Layer **On**. +9. 
Copy the full proxy server URL (like `https://eagle-hqya7.proxy.cloud.getdbt.com`) to connect to your [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations). +10. Use the URL in the data source configuration of the integrated partner tool. +11. Use the data platform login credentials that make sense for how the data is consumed. + +:::info📌 + +It is _not_ recommended that you use your dbt Cloud credentials due to elevated permissions. Instead, you can use your specific integration tool permissions. + +::: + +12. Set up the [Discovery API](/docs/dbt-cloud-apis/discovery-api) (Team and Enterprise accounts only) in the integrated partner tool to import the metric definitions. The [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations) will treat the dbt Server as another data source (like a data platform). This requires: + +- The account ID, environment ID, and job ID (which is visible in the job URL) +- An [API service token](/docs/dbt-cloud-apis/service-tokens) with job admin and metadata permissions +- Add the items above to the relevant fields in your integration tool + + +
      + +
      + +## Related docs + +- [Build your metrics](/docs/build/build-metrics-intro) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) +- [Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) +- [Migrate your legacy Semantic Layer](/guides/migration/sl-migration) +- [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) diff --git a/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md new file mode 100644 index 00000000000..89cd9bc6ddc --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md @@ -0,0 +1,80 @@ +--- +title: "dbt Semantic Layer architecture" +id: sl-architecture +description: "dbt Semantic Layer product architecture and related questions." +sidebar_label: "Architecture" +tags: [Semantic Layer] +--- + +import NewSLChanges from '/snippets/_new-sl-changes.md'; + + + + + + +The dbt Semantic Layer allows you to define metrics and use various interfaces to query them. The Semantic Layer does the heavy lifting to find where the queried data exists in your data platform and generates the SQL to make the request (including performing joins). + + + +## dbt Semantic Layer components + +The dbt Semantic Layer includes the following components: + + +| Components | Information | Developer plans | Team plans | Enterprise plans | License | +| --- | --- | :---: | :---: | :---: | --- | +| **[MetricFlow](/docs/build/about-metricflow)** | MetricFlow in dbt allows users to centrally define their semantic models and metrics with YAML specifications. | ✅ | ✅ | ✅ | BSL package (code is source available) | +| **MetricFlow Server**| A proprietary server that takes metric requests and generates optimized SQL for the specific data platform. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)| +| **Semantic Layer Gateway** | A service that passes queries to MetricFlow server and executes the SQL generated by MetricFlow against the data platform|

      ❌| ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | +| **Semantic Layer API** | The interfaces that allow users to submit metric queries include the MetricFlow CLI and JDBC API. They also serve as the foundation for building first-class integrations with various tools. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)| + + +## Related questions + +
      + How do I migrate from the legacy Semantic Layer to the new one? +
      +
      If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
      +
      +
      + +
      +How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
      +
+Is the dbt Semantic Layer open source? +The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, like dbt-core and MetricFlow.

The universal dbt Semantic Layer is available to all Team and Enterprise plans during public beta. Users on dbt Cloud Developer plans or dbt Core users can use MetricFlow only to define and test metrics locally.
      +
      + Is there a dbt Semantic Layer discussion hub? +
      +
+Yes, absolutely! Join the dbt Slack community and the #dbt-cloud-semantic-layer Slack channel for all things related to the dbt Semantic Layer.
      +
      +
      + +
      + + + +## Product architecture + +The dbt Semantic Layer product architecture includes four primary components: + +| Components | Information | Developer plans | Team plans | Enterprise plans | License | +| --- | --- | :---: | :---: | :---: | --- | +| **[dbt project](/docs/build/metrics)** | Define models and metrics in dbt Core.
      *Note, we will deprecate and no longer support the dbt_metrics package. | ✅ | ✅ | ✅ | Open source, Core | +| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | +| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅

      _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | +| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | + + + +dbt Semantic Layer integrations will: + +- Leverage the Discovery API to fetch a list of objects and their attributes, like metrics +- Generate a dbt-SQL statement +- Then query the SQL proxy to evaluate the results of this statement + +
      diff --git a/website/docs/docs/verified-adapters.md b/website/docs/docs/verified-adapters.md index 9604d05391c..a2d28a612d6 100644 --- a/website/docs/docs/verified-adapters.md +++ b/website/docs/docs/verified-adapters.md @@ -1,30 +1,20 @@ --- title: "Verified adapters" id: "verified-adapters" +hide_table_of_contents: true --- -The dbt Labs has a rigorous verified adapter program which provides reassurance to users about which adapters can be trusted to use in production, has been tested, and is actively maintained and updated. The process covers aspects of development, documentation, user experience, and maintenance. +The dbt Labs has a rigorous verified adapter program that provides reassurance to users about which adapters can be trusted to use in production, has been tested, and is actively maintained and updated. The process covers development, documentation, user experience, and maintenance aspects. These adapters then earn a "Verified" status so that users can have a certain level of trust and expectation when they use them. The adapters also have maintainers and we recommend using the adapter's verification status to determine its quality and health. -Here's the list of the verified data platforms that can connect to dbt and its latest version. - -| dbt Cloud setup | CLI installation | latest verified version | -| ---------------- | ----------------------------------------- | ------------------------ | -| [Setup AlloyDB](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install AlloyDB](/docs/core/connect-data-platform/alloydb-setup) | (same as `dbt-postgres`) | -| Not supported | [Install Azure Synapse](/docs/core/connect-data-platform/azuresynapse-setup) | 1.3 :construction: | -| [Set up BigQuery](/docs/cloud/connect-data-platform/connect-bigquery) | [Install BigQuery](/docs/core/connect-data-platform/bigquery-setup) | 1.4 | -| [Set up Databricks ](/docs/cloud/connect-data-platform/connect-databricks)| [ Install Databricks](/docs/core/connect-data-platform/databricks-setup) | 1.4 | -| Not supported | [Install Dremio](/docs/core/connect-data-platform/dremio-setup) | 1.4 :construction: | -| [Set up Postgres](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install Postgres](/docs/core/connect-data-platform/postgres-setup) | 1.4 | -| [Set up Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) | [Install Redshift](/docs/core/connect-data-platform/redshift-setup) | 1.4 | -| [Set up Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) | [ Install Snowflake](/docs/core/connect-data-platform/snowflake-setup) | 1.4 | -| [Set up Spark](/docs/cloud/connect-data-platform/connect-apache-spark) | [Install Spark](/docs/core/connect-data-platform/spark-setup) | 1.4 | -| [Set up Starburst & Trino](/docs/cloud/connect-data-platform/connect-starburst-trino)| [Installl Starburst & Trino](/docs/core/connect-data-platform/trino-setup) | 1.4 | - -:construction:: Verification in progress +The verification process serves as the on-ramp to integration with dbt Cloud. As such, we restrict applicants to data platform vendors with whom we are already engaged. To learn more, see [Verifying a new adapter](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter). +Here are the verified data platforms that connect to dbt and its latest version. 
+ +import AdaptersVerified from '/snippets/_adapters-verified.md'; + diff --git a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md index 76d03870478..f8daf393f9b 100644 --- a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md +++ b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md @@ -38,7 +38,7 @@ To unlock your account and select a plan, review the following guidance per plan 2. To unlock your account and continue using the Team plan, you need to enter your payment details. 3. Go to **Payment Information** and click **Edit** on the right. 4. Enter your payment details and click **Save**. -5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉 +5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉 @@ -59,7 +59,7 @@ For commonly asked billings questions, refer to the dbt Cloud [pricing page](htt
      How does billing work?
      -
      Team plans are billed monthly on the credit card used to sign up, based on developer seat count. You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings -> Billing page.



      +
      Team plans are billed monthly on the credit card used to sign up, based on [developer seat count and usage](/docs/cloud/billing). You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings > Billing page.



      Enterprise plan customers are billed annually based on the number of developer seats, as well as any additional services + features in your chosen plan.
      @@ -75,7 +75,7 @@ For commonly asked billings questions, refer to the dbt Cloud [pricing page](htt
      Can I pay by invoice?
      -
      At present, dbt Cloud Team plan payments must be made via credit card, and by default they will be billed monthly based on the number of developer seats.



      +
      Currently, dbt Cloud Team plan payments must be made with a credit card, and by default they will be billed monthly based on the number of [developer seats and usage](/docs/cloud/billing).



      We don’t have any plans to do invoicing for Team plan accounts in the near future, but we do currently support invoices for companies on the dbt Cloud Enterprise plan. Feel free to contact us to build your Enterprise pricing plan.
      diff --git a/website/docs/faqs/Accounts/payment-accepted.md b/website/docs/faqs/Accounts/payment-accepted.md index 2e26063c684..c0e949833a2 100644 --- a/website/docs/faqs/Accounts/payment-accepted.md +++ b/website/docs/faqs/Accounts/payment-accepted.md @@ -5,6 +5,6 @@ sidebar_label: 'Can I pay invoice' id: payment-accepted --- -Presently for Team plans, self-service dbt Cloud payments must be made via credit card and by default, they will be billed monthly based on the number of active developer seats. +Currently for Team plans, self-service dbt Cloud payments must be made with a credit card and by default, they will be billed monthly based on the number of [active developer seats and usage](/docs/cloud/billing). We don't have any plans to do invoicing for self-service teams in the near future, but we *do* currently support invoices for companies on the **dbt Cloud Enterprise plan.** Feel free to [contact us](https://www.getdbt.com/contact) to build your Enterprise pricing. diff --git a/website/docs/faqs/Core/install-python-compatibility.md b/website/docs/faqs/Core/install-python-compatibility.md index d24466f4990..5c536101f0c 100644 --- a/website/docs/faqs/Core/install-python-compatibility.md +++ b/website/docs/faqs/Core/install-python-compatibility.md @@ -17,18 +17,12 @@ The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3. - + The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.9, and 3.10 - - -As of v1.0, `dbt-core` is compatible with Python versions 3.7, 3.8, and 3.9. - - - Adapter plugins and their dependencies are not always compatible with the latest version of Python. For example, dbt-snowflake v0.19 is not compatible with Python 3.9, but dbt-snowflake versions 0.20+ are. New dbt minor versions will add support for new Python3 minor versions as soon as all dependencies can support it. In turn, dbt minor versions will drop support for old Python3 minor versions right before they reach [end of life](https://endoflife.date/python). diff --git a/website/docs/faqs/Docs/documenting-macros.md b/website/docs/faqs/Docs/documenting-macros.md index cbc12b988c6..9a2036cd6bf 100644 --- a/website/docs/faqs/Docs/documenting-macros.md +++ b/website/docs/faqs/Docs/documenting-macros.md @@ -5,8 +5,6 @@ sidebar_label: 'Document macros' id: documenting-macros --- -The `macros:` key is new in 0.16.0. - To document macros, use a [schema file](/reference/macro-properties) and nest the configurations under a `macros:` key ## Example diff --git a/website/docs/faqs/Docs/modify-owner-column.md b/website/docs/faqs/Docs/modify-owner-column.md index db06e5af6cf..8395a182bb9 100644 --- a/website/docs/faqs/Docs/modify-owner-column.md +++ b/website/docs/faqs/Docs/modify-owner-column.md @@ -8,7 +8,7 @@ id: modify-owner-column Due to the nature of the field, you won't be able to change the owner column in your generated documentation. -The _owner_ field in `dbt-docs` is pulled from database metdata (`catalog.json`), meaning the owner of that table in the database. With the exception of exposures, it's not pulled from an `owner` field set within dbt. +The _owner_ field in `dbt-docs` is pulled from database metadata (`catalog.json`), meaning the owner of that table in the database. With the exception of exposures, it's not pulled from an `owner` field set within dbt. Generally, dbt's database user owns the tables created in the database. Source tables are usually owned by the service responsible for ingesting/loading them. 
diff --git a/website/docs/faqs/Environments/beta-release.md b/website/docs/faqs/Environments/beta-release.md deleted file mode 100644 index 5eef07d3510..00000000000 --- a/website/docs/faqs/Environments/beta-release.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: What is a beta release? -description: "How to try out beta features" -sidebar_label: 'What is a beta release?' -id: beta-release ---- -This is a chance to try out brand-new functionality. You get to start planning for use cases that the next minor version will unlock. We get to hear from you about unexpected behavior and nasty bugs, so that the release candidate has more polish and fewer surprises. diff --git a/website/docs/faqs/Environments/delete-environment-job.md b/website/docs/faqs/Environments/delete-environment-job.md index b649769f070..eb9ac511a7c 100644 --- a/website/docs/faqs/Environments/delete-environment-job.md +++ b/website/docs/faqs/Environments/delete-environment-job.md @@ -8,16 +8,7 @@ id: delete-environment-job To delete an environment or job in dbt Cloud, you must have a `developer` [license](/docs/cloud/manage-access/seats-and-users) and have the necessary [access permissions](/docs/cloud/manage-access/about-user-access). -:::info 📌 Delete a job first before deleting environment - -Deleting an environment doesn't automatically delete its associated job(s). If you delete an environment first without deleting the job, you won't be able to delete the job since it's without an environment. - -To completely delete your environment, you _must_: -1. First delete all jobs associated with that environment, -2. Then, delete the environment. -::: - -**Delete a job** +## Delete a job To delete a job or multiple jobs in dbt Cloud: @@ -33,11 +24,11 @@ To delete a job or multiple jobs in dbt Cloud: 5. Confirm your action in the **Confirm Delete** pop-up by clicking **Confirm Delete** in the bottom right to delete the job immediately. This action cannot be undone. However, you can create a new job with the same information if the deletion was made in error. -Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each individual job. +Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each job. -**Delete an environment** +## Delete an environment -To delete an environment in dbt Cloud: +Deleting an environment automatically deletes its associated job(s). If you want to keep those jobs, move them to a different environment first. To delete an environment in dbt Cloud: 1. Click **Deploy** on the navigation header and then click **Environments** 2. Select the Environment you want to delete. @@ -54,4 +45,4 @@ To delete an environment in dbt Cloud: Refresh your page, and the deleted environment should now be gone. If you want to delete multiple environments, you'll need to perform these steps to delete each one. -If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help. \ No newline at end of file +If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help. 
diff --git a/website/docs/faqs/Git/git-migration.md b/website/docs/faqs/Git/git-migration.md new file mode 100644 index 00000000000..775ae3679e3 --- /dev/null +++ b/website/docs/faqs/Git/git-migration.md @@ -0,0 +1,26 @@ +--- +title: "How to migrate git providers" +sidebar_label: "How to migrate git providers" +id: "git-migration" +hide_table_of_contents: true +description: "Learn how to migrate git providers in dbt Cloud with minimal disruption." +tags: [Git] +--- + +To migrate from one git provider to another, refer to the following steps to avoid minimal disruption: + +1. Outside of dbt Cloud, you'll need to import your existing repository into your new provider. + + As an example, if you're migrating from GitHub to Azure DevOps, you'll need to import your existing repository (GitHub) into your new git provider (Azure DevOps). For detailed steps on how to do this, refer to your git provider's documentation (Such as [GitHub](https://docs.github.com/en/migrations/importing-source-code/using-github-importer/importing-a-repository-with-github-importer), [GitLab](https://docs.gitlab.com/ee/user/project/import/repo_by_url.html), [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/repos/git/import-git-repository?view=azure-devops)) + +2. Go back to dbt Cloud and set up your [integration for the new git provider](/docs/cloud/git/connect-github), if needed. +3. Disconnect the old repository in dbt Cloud by going to **Account Settings** and then **Projects**. Click on the **Repository** link, then click **Edit** and **Disconnect**. + + + +4. On the same page, connect to the new git provider repository by clicking **Configure Repository** + - If you're using the native integration, you may need to OAuth to it. + +5. That's it, you should now be connected to the new git provider! 🎉 + +Note — As a tip, we recommend you refresh your page and dbt Cloud IDE before performing any actions. diff --git a/website/docs/faqs/Models/available-materializations.md b/website/docs/faqs/Models/available-materializations.md index 25ba745a2b2..011d3ba3fb0 100644 --- a/website/docs/faqs/Models/available-materializations.md +++ b/website/docs/faqs/Models/available-materializations.md @@ -5,6 +5,7 @@ sidebar_label: 'Materializations available' id: available-materializations --- -dbt ships with four materializations: `view`, `table`, `incremental` and `ephemeral`. Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options. +dbt ships with five materializations: `view`, `table`, `incremental`, `ephemeral` and `materialized_view`. +Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options. You can also create your own [custom materializations](/guides/advanced/creating-new-materializations), if required however this is an advanced feature of dbt. diff --git a/website/docs/faqs/Models/configurable-model-path.md b/website/docs/faqs/Models/configurable-model-path.md index 6e8861a0693..c34112a5fe1 100644 --- a/website/docs/faqs/Models/configurable-model-path.md +++ b/website/docs/faqs/Models/configurable-model-path.md @@ -6,12 +6,6 @@ id: configurable-model-path --- - - -- **v1.0.0:** The config 'source-path' has been deprecated in favor of [`model-paths`](/reference/project-configs/model-paths). - - - By default, dbt expects the files defining your models to be located in the `models` subdirectory of your project. 
To change this, update the [model-paths](reference/project-configs/model-paths.md) configuration in your `dbt_project.yml` diff --git a/website/docs/faqs/Tests/configurable-data-path.md b/website/docs/faqs/Tests/configurable-data-path.md index 7c4e92f7226..7663d2d3f11 100644 --- a/website/docs/faqs/Tests/configurable-data-path.md +++ b/website/docs/faqs/Tests/configurable-data-path.md @@ -6,12 +6,6 @@ id: configurable-data-path --- - - -- **v1.0.0:** The config 'data-paths' has been deprecated in favor of [`seed-paths`](/reference/project-configs/seed-paths). - - - By default, dbt expects your seed files to be located in the `seeds` subdirectory of your project. diff --git a/website/docs/faqs/Tests/testing-seeds.md b/website/docs/faqs/Tests/testing-seeds.md index 93afcab2fa4..3b1b3e0df56 100644 --- a/website/docs/faqs/Tests/testing-seeds.md +++ b/website/docs/faqs/Tests/testing-seeds.md @@ -6,8 +6,6 @@ id: testing-seeds --- -The `seeds:` key is new in 0.16.0. Prior to this, use a `models:` key instead. - To test and document seeds, use a [schema file](/reference/configs-and-properties) and nest the configurations under a `seeds:` key ## Example diff --git a/website/docs/guides/best-practices/custom-generic-tests.md b/website/docs/guides/best-practices/custom-generic-tests.md index dc23770423e..f2d84e38853 100644 --- a/website/docs/guides/best-practices/custom-generic-tests.md +++ b/website/docs/guides/best-practices/custom-generic-tests.md @@ -6,13 +6,6 @@ displayText: Writing custom generic tests hoverSnippet: Learn how to define your own custom generic tests. --- - - -* `v0.20.0`: Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. They return a number of failing rows, rather than a single numeric value. -* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory - - - dbt ships with [Not Null](/reference/resource-properties/tests#not-null), [Unique](/reference/resource-properties/tests#unique), [Relationships](/reference/resource-properties/tests#relationships), and [Accepted Values](/reference/resource-properties/tests#accepted-values) generic tests. (These used to be called "schema tests," and you'll still see that name in some places.) Under the hood, these generic tests are defined as `test` blocks (like macros) in a globally accessible dbt project. You can find the source code for these tests in the [global project](https://github.com/dbt-labs/dbt-core/tree/main/core/dbt/include/global_project/macros/generic_test_sql). :::info diff --git a/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md b/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md deleted file mode 100644 index 17811b14ca3..00000000000 --- a/website/docs/guides/best-practices/environment-setup/1-env-guide-overview.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "dbt Cloud environment best practices" -id: 1-env-guide-overview -description: Learn how to configure environments in dbt Cloud. -displayText: "dbt Cloud environment best practices" -hoverSnippet: Learn how to configure environments in dbt Cloud. ---- - -> *How do I manage environments in my dbt Cloud project? How many do I need?* -> -> *How does my structure map to environments in dbt Cloud?* -> -> *What do git branches have to do with my dbt Cloud environments?* -> - -If these questions keep you up at night, you’ve come to the right place! 
When it comes to managing your dbt Cloud environments, there is not a one-size-fits-all solution for all teams. In this guide we’ll walk you through a few environment architecture options for dbt Cloud that we’d recommend, and hopefully you find an option that works for you. - -## Learning goals - -This guide has three main goals: - -- Provide our recommendations on managing dbt Cloud environments -- Illustrate these recommendations with comprehensive examples -- At each stage, explain *why* we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs - -:::info -☁️ This guide focuses on architecture for **dbt Cloud**. However, similar principles apply for developers using dbt Core. Before diving into this guide we recommend taking a look at our **[dbt Cloud environments](/docs/dbt-cloud-environments)** page for more context. - -::: - -### How many environments do I really need? - -Environments define the way that dbt will execute your code, including: - -- The **version of dbt** that will run. -- The **version of your code** to be executed. -- The **connection information** for your warehouse. -- In dbt Cloud, there are **two types of environments:** - - **Development** — the environment settings in which you work in the IDE on a development branch. - - **Deployment** — the environment settings in which a dbt Cloud job runs. - -In this guide, we’re going to focus on **deployment environments**, which determine how your project is executed when a **dbt Cloud job executes**. - -Depending on your git workflow and testing strategy, you'll be choosing between one deployment environment or many deployment environments. We provide a high-level overview of how these two deployment strategies work here, but use each section of this guide to get a deep-dive into how these setups differ. - -| Setup option | Works well if you | Relative complexity level | -| --- | --- | --- | -| One deployment environment | - only scheduled runs for one set of data objects
      - development branches are merged directly to main | Low | -| Many deployment environments | - feature branches move through several promotion stages | High | - -### TL;DR — One deployment environment - -We usually recommended folks start with the basics; having one deployment environment is usually the simplest and most maintainable approach to start. This approach works well if: - -- You only need to have **scheduled jobs running in a single environment** within your data warehouse. -- You use a **single primary branch** and follow a direct promotion (**Dev —> Prod**) strategy - -With this option, your production jobs and your [Slim CI jobs](/docs/deploy/continuous-integration) that ensure code integrity are managed within one single deployment environment. - -### TL;DR — Many deployment environments -This approach adds a bit more complexity and may slow down the development process, but adds a layer of security that can be worth the tradeoff. This approach works well if: - -- Your organization maintains **several long-lived git branches** to control how and when changes are tested and promoted to production. - - Some orgs follow a **Dev —> QA —> Prod release cycle** — if that sounds like your org, this approach is probably right for you. -- The **output of your dbt project is an input to other systems** and you need to test and validate many changes on a stable, long-lived staging dataset in a pre-production environment. - -The two options are explored in more detail in the following sections, including the benefits, trade-offs, the steps required to implement the setup in dbt Cloud. diff --git a/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md b/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md deleted file mode 100644 index d7d64eda548..00000000000 --- a/website/docs/guides/best-practices/environment-setup/2-one-deployment-environment.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "One deployment environment" -id: 2-one-deployment-environment -description: Learn how to configure a single deployment environment setup in dbt Cloud. -displayText: "dbt Cloud environment best practices" -hoverSnippet: Learn how to configure a single deployment environment setup in dbt Cloud. ---- - - - -## What this looks like - -1. You have a **single *development* environment** where dbt users can access the dbt Cloud IDE and make changes to their code on feature branches created off of your default branch in your repository (most often the `main` branch). -2. You have a **single *deployment* environment** (let’s call it “Production”) where your scheduled jobs run referencing the `main` branch. -3. You also have a [**Slim CI job**](/docs/deploy/continuous-integration) that kicks off anytime you open a PR to merge a feature branch into `main`. This Slim CI job can run in your dbt “Production” environment. - -:::info - -☁️ Slim CI jobs run in a dedicated custom schema for each PR, so there will no collision with your production schemas. - -::: - - - -### Git workflow - - - - -1. In the dbt Cloud IDE, developers work on feature branches, created from the `main` branch (`feature_a`, `feature_b`, `feature_c` above) -2. When code is ready, developer opens a PR to merge feature branch into `main` -3. [**Slim CI Job**](/docs/deploy/continuous-integration) automatically kicks off, and tests the changes made in the PR -4. 
When Slim CI Job is successful and team is ready to deploy changes to Production, the PR is merged directly into the `main` branch. The next time a production job runs, these changes will be incorporated and executed. - -### dbt Cloud setup - -1. Create your [**development environment**](/docs/dbt-cloud-environments) to power the dbt Cloud IDE. No extra customization needed! -2. Create your **[production deployment environment](/docs/deploy/deploy-environments)**. -3. Define your **dbt Cloud jobs** in the production deployment environment from step 2. - 1. **Production job(s)**: You will need to set up **at least one scheduled job** that deploys your project to your production databases/schemas. You may create multiple jobs based on your business SLAs. - 2. **Slim CI Job**: Unlike the production jobs, which are triggered via the scheduler, this job will be triggered when PRs are opened in your repository. Refer to [Slim CI jobs](/docs/deploy/slim-ci-jobs) for details. - - -### When this works well - -This approach is recommended for most use cases because it enables you to quickly and safely implement code changes in the production environment. It also gives developers the confidence to trust and rely on these changes. With this option, multiple developers can easily contribute to and collaborate on the same codebase with confidence. - -:::info -💡 Check out [Sunrun's Coalesce 2022 talk](https://www.youtube.com/watch?v=vmBAO2XN-fM) on Automating CI/CD in dbt Cloud, where they simplified their CI/CD process from several long-lived branches to a single long-lived main branch with feature branches. - -::: - -### When this doesn’t work so well - -- You have a **formal QA process** before merging code into production. -- You want to **control when features are released** to production. -- You need to have scheduled **jobs running in many environments** due to dependencies on outside systems. - - e.g. Your organization has many applications that consume and test data changes in a lower non-Production environment before changes should be promoted to Production. diff --git a/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md b/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md deleted file mode 100644 index cf9f6954ca7..00000000000 --- a/website/docs/guides/best-practices/environment-setup/3-many-deployment-environments.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -title: "Many deployment environments" -id: 3-many-deployment-environments -description: Learn how to configure a many deployment environment setup in dbt Cloud. -displayText: "dbt Cloud environment best practices" -hoverSnippet: Learn how to configure a many deployment environment setup in dbt Cloud. ---- - - -## What this looks like - -1. You have a **single *development* environment** where dbt users can access the dbt Cloud IDE and make changes to their code. However, you’ll want to update the **[custom branch settings](faqs/Environments/custom-branch-settings)** to ensure that developers create feature branches off of the a non-production branch. For this example, we’ll refer to this as the `qa` branch. -2. You have a **QA deployment environment**, running scheduled jobs from the `qa` branch that deploys your dbt project to a pre-production warehouse location. -3. You have a **Production deployment environment,** running scheduled jobs from the `main` branch that deploys your dbt project to your production warehouse location. -4. 
You have **multiple Slim CI jobs** (one in each deployment environment) to ensure changes to each branch are tested. - - - -### Git workflow - - - -1. In the dbt Cloud IDE, developers work on feature branches, **created from the `qa` branch** (`feature_a`, `feature_b`, `feature_c` above). -2. When code is ready, developer opens a PR to merge feature branch into `qa`. -3. The **first Slim CI Job** automatically kicks off to test the changes introduced in the PR. This job will *defer to a regularly-scheduled job in the QA environment* and run in the QA deployment environment. -4. When **Slim CI Job is successful** and team is ready to deploy changes, the **PR is merged into `qa`.** -5. Scheduled jobs run in the QA deployment environment, running on `qa` branch to ensure the new changes work as intended. -6. When **all feature branches** for a given release (e.g. sprint) have been **successfully merged** to `qa` and are **running without error** in the QA deployment environment, a team member opens a **PR to merge `qa` → `main`.** -7. The **second Slim CI Job** automatically kicks off to test changes in PR. This job will *defer to a regularly-scheduled job in the Production environment* and run in the Production deployment environment. -8. When **second Slim CI Job** is successful and team is ready to deploy changes, the **PR is merged into `main`**. -9. Monitor scheduled jobs in the Production deployment environment that are running on `main` branch. Voila! All changes are released and ready for your stakeholders. - -:::info -💡 Considering a different branching strategy that involves cherry picking? [Maybe reconsider!](https://docs.getdbt.com/blog/the-case-against-git-cherry-picking) - -::: - -### dbt Cloud setup - -1. Create your [**development environment**](/docs/dbt-cloud-environments) to power the dbt Cloud IDE. - - Here, we’ll set a **custom branch** so that users in the IDE create their feature branches from `qa` instead of `main`. Click **Only run on a custom branch** in **General settings**, enter `qa` into **Custom Branch.** - -2. Set up your **QA [deployment environment](/docs/deploy/deploy-environments)** - - Here, we’ll apply the same custom branch settings as the development environment in Step 1. All scheduled jobs in the QA deployment environment will use the code from the `qa` branch during execution. - -3. **Define QA jobs** - 1. **QA job(s)**: You’ll want to create at least one scheduled job, running on a roughly daily cadence. This will allow us to make sure all the code executes without error before you release it to production, and will also power the first Slim CI job. - 2. **Slim CI Job**: As above, this job will be triggered when PRs are opened in your repository. Enable this option by selecting **Run on Pull Requests?** under the **Continuous Integration(CI)** tab under the **Triggers** section. Since we’re using the custom branch setting in the QA environment, you'll also want to be sure to select the second option **Run only on Custom Branch** (selected by default) — this means that only PRs created against the `qa` branch will trigger this job, rather than any PR at all. - - This job will also need to defer to one of the QA jobs created in step 3a. This enables the use of the `state` modifier in your selection syntax to only run changes introduced by your PR. - -4. 
Set up your **Production [deployment environment](/docs/deploy/deploy-environments)** - - Here, we’ll *also* use the same custom branch settings as the other environments, but set the custom branch as `main`. Even thought the `main` branch is the default, setting this value enables us to properly set up the CI Job in the next step. - -5. **Define production jobs** - 1. **Production job(s)**: You will need to set up at least one scheduled job that deploys your project to your production databases/schemas. You may create multiple jobs based on your business SLAs. - 2. **Production Slim CI Job**: As above, this job will be triggered when PRs are opened in your repository. Enable this option by selecting **Run on Pull Requests?** under the **Continuous Integration(CI)** tab under the **Triggers** section. Since we’re using the custom branch setting in the QA environment, we’ll also want to select the second option **Run only on Custom Branch** — this means that only PRs created against the `main` branch will trigger this job, rather than any PR at all. - - This job will also need to defer to one of the QA jobs created in step 5a. This enables the use of the `state` modifier in your selection syntax to only run changes introduced by your PR. - -### When this works well - -This approach works well when it’s critical to **apply user acceptance and integration testing to your project in a pre-production environment**. This approach allows you to have scheduled jobs running in **many environments** on your data warehouse. - -### When this doesn’t work so well - -This approach may slow down the time it takes to get new feature into production, since it requires additional steps in the deployment process and additional branches to maintain. Keep in mind that adding complexity to your deployment process might cause some slowdown in your release cycle. - -## Conclusion - -While there’s no single correct answer to how to setup your dbt Cloud environments, they are flexible enough to enable just about any code promotion workflow your organization uses. We would love to hear how you’ve set up your deployment infrastructure in dbt Cloud! diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md new file mode 100644 index 00000000000..19c6717063c --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md @@ -0,0 +1,38 @@ +--- +title: "Intro to MetricFlow" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +:::tip +**This is a guide for a beta product.** We anticipate this guide will evolve alongside the Semantic Layer through community collaboration. We welcome discussions, ideas, issues, and contributions to refining best practices. +::: + +Flying cars, hoverboards, and true self-service analytics: this is the future we were promised. The first two might still be a few years out, but real self-service analytics is here today. With dbt Cloud's Semantic Layer, you can resolve the tension between accuracy and flexibility that has hampered analytics tools for years, empowering everybody in your organization to explore a shared reality of metrics. Best of all for analytics engineers, building with these new tools will significantly [DRY](https://docs.getdbt.com/terms/dry) up and simplify your codebase. 
As you'll see, the deep interaction between your dbt models and the Semantic Layer make your dbt project the ideal place to craft your metrics. + +## Learning goals + +- ❓ Understand the **purpose and capabilities** of the **dbt Semantic Layer**, particularly MetricFlow as the engine that powers it. +- 🧱 Familiarity with the core components of MetricFlow — **semantic models and metrics** — and how they work together. +- 🛠️ Hands-on **experience building** semantic models and metrics in dbt Cloud. +- 🔁 Know how to **refactor** models for MetricFlow. +- 🏅 Aware of new **best practices** to take maximum advantage of the Semantic Layer. + +## Guide structure overview + +We'll work through our learning goals via an [example project](https://github.com/dbt-labs/jaffle-sl-template), we encourage you to follow along and try the code out for yourself if you'd like on the `start-here` branch, or you can just follow along with the completed state of the codebase on the `main` branch. + +1. Getting **setup** with MetricFlow in your dbt project. +2. Building your first **semantic model** and its fundamental parts: **entities, dimensions, and measures**. +3. Building your first **metric**. +4. **Refactoring** a mart into the Semantic Layer. +5. Defining **advanced metrics**: `ratio` and `derived` types. +6. Review **best practices**. + +If you're ready to ship your users more power with less code, let's dive in! + +:::info +MetricFlow is a new way to define metrics in dbt and one of the key components of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). It handles SQL query construction and defines the specification for dbt semantic models and metrics. + +To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/). +::: diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md new file mode 100644 index 00000000000..34c0e813725 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md @@ -0,0 +1,43 @@ +--- +title: "Set up MetricFlow" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## Getting started + +First, if you want to follow along, we'll need to clone the [example project](https://github.com/dbt-labs/jaffle-sl-template). You will need access to a Snowflake, BigQuery, Databricks, or Postgres warehouse for this, for the time being. The project is our classic Jaffle Shop, a simulated chain restaurant serving [jaffles](https://en.wikipedia.org/wiki/Pie_iron) and tasty beverages. + +```shell +git clone git@github.com:dbt-labs/jaffle-sl-template.git +cd path/to/project +``` + +Next before we start writing code, we'll need to install the MetricFlow CLI as an extension of a dbt adapter from PyPI. The MetricFlow CLI is compatible with Python versions 3.8 through 3.11. + +We'll use pip to install MetricFlow and our dbt adapter: + +```shell +# activate a virtual environment for your project, +# if you don't have a name you like to use we suggest .venv +python -m venv [virtual environment name] +source [virtual environment name]/bin/activate +# install dbt and MetricFlow +pip install "dbt-metricflow[adapter name]" +# e.g. 
pip install "dbt-metricflow[snowflake]" +``` + +Lastly, to get to the pre-Semantic Layer starting state, checkout the `start-here` branch. + +```shell +git checkout start-here +``` + +For more information you can [look at the docs](/docs/build/metricflow-cli) or checkout a [Quickstart](https://docs.getdbt.com/quickstarts) to get more familiar with setting up a dbt project. + +## Basic commands + +- 💻 This package will install both `dbt` and `mf` as CLIs in our virtual environment. All the regular `dbt` commands like `run`, `build`, and `test` are available. +- 🔍 A less common one that will come in handy with the Semantic Layer is `dbt parse`. This will parse your project and generate a **semantic manifest**, a representation of meaningful connections described by your project. This file gives MetricFlow a **state of the world from which to generate queries**. +- 🧰 In addition to `dbt`, you'll have access to `mf` commands like `query` and `validate-configs`, which operate based on that semantic manifest. We'll dig more into all of these as we go along. +- 🛠️ Lets start off by running a `dbt build` to get the **starting state** of our project built. diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md new file mode 100644 index 00000000000..a2dc55e37ae --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md @@ -0,0 +1,296 @@ +--- +title: "Building semantic models" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## How to build a semantic model + +A semantic model is the MetricFlow equivalent to a logical layer model (what historically has just been called a 'model' in dbt land). Just as configurations for models are defined on the `models:` YAML key, configurations for semantic models are housed under `semantic models:`. A key difference is that while a logical model consists of configuration and SQL or Python code, a **semantic model is defined purely via YAML**. Rather than encoding a specific dataset, a **semantic model describes relationships** that let your end users select and refine their own datasets reliably. + +- ⚙️ Semantic models are **comprised of three components**: + - 🫂 **entities**: these describe the **relationships** between various semantic models (think ids) + - 🪣 **dimensions**: these are the columns you want to **slice, dice, group, and filter by** (think timestamps, categories, booleans). + - 📏 **measures**: these are the **quantitative values you want to aggregate** +- 📚 We define **columns as being an entity, dimension, or measure**. + +:::tip +**File per model**. Given the interdependence of logical and semantic models, and semantic models and metrics, we've updated our best practice recommendation to a one YAML file per model approach if you're using the Semantic Layer. This houses everything related to a model in one place and preserves unique file names for quickly getting to the code you want. +::: + +## Defining orders + +- 🥪 The semantic model we're going to define is _orders_. +- 📗 We define it as a **YAML dictionary in the semantic models list**. +- 📑 It will have a **name, entities list, dimensions list, and measures list**. +- ⏬ We recommend defining them **in this order consistently** as a style best practice. 
+ +```YAML +semantic_models: + - name: orders + entities: + ... + dimensions: + ... + measures: + ... +``` + +- Next we'll point to the corresponding logical model by supplying a [`ref`](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) in the `model:` property, and a `description` for documentation. + +```YAML +semantic_models: + - name: orders + description: | + Model containing order data. The grain of the table is the order id. + model: ref('stg_orders') + entities: + ... + dimensions: + ... + measures: + ... +``` + +## Establishing our entities + +- 🫂 Entities are the **objects and concepts** in our data that _have_ dimensions and measures. You can think of them as the **nouns** of our project, the **spines** of our queries that we may want to aggregate by, or simply the **join keys**. +- 🔀 Entities help MetricFlow understand **how various semantic models relate to one another**. +- ⛓️ Unlike many other semantic layers, in MetricFlow **we do not need to describe joins explicitly**, instead the **relationships are implicitly described by entities**. +- 1️⃣ Each semantic model should have **one primary entity** defined for itself, and **any number of foreign entities** for other semantic models it may join to. +- 🫂 Entities require a **name and type** + - 🔑 Types available are **primary**, **foreign**, **unique** or **natural** — we'll be focused on the first two for now, but you can [read more about unique and natural keys](https://docs.getdbt.com/docs/build/entities#entity-types). + +### Entities in action + +If we look at the staging model for orders, we see that it has 3 id columns, so we'll need three entities. + +```SQL +renamed as ( + + select + + ---------- ids + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- properties + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps + ordered_at + + from source +``` + +- 👉 We add them with a **`name`, `type`, and optional `expr`** (expression). The expression can be any valid SQL expression on your platform. +- 📛 If you **don't add an expression**, MetricFlow will **assume the name is equal to the column name** in the underlying logical model. +- 👍 Our best practices pattern is to, whenever possible, provide a `name` that is the singular form of the subject or grain of the table, and use `expr` to specify the precise column name (with `_id` etc). This will let us write **more readable metrics** on top of these semantic models. + +```YAML +semantic_models: + - name: orders + ... + entities: + # we use the column for the name here because order is a reserved word in SQL + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + ... + measures: + ... + +``` + +## Defining our dimensions + +- 🧮 Dimensions are the columns that we want to **filter and group by**, **the adjectives of our project**. They come in three types: + - **categorical** + - **time** + - slowly changing dimensions — [these are covered in the documentation](https://docs.getdbt.com/docs/build/dimensions#scd-type-ii), and a little more complex. To focus on building your mental models of MetricFlow's fundamentals, we won't be using SCDs in this guide. +- ➕ We're **not limited to existing columns**, we can use the `expr` property to add simple computations in our dimensions. +- 📛 Categorical dimensions are the simplest, they simply require a `name` and `type` (type being categorical). 
**If the `name` property matches the name of the dimension column**, that's it, you're done. If you want or need to use a `name` other than the column name, or do some filtering or computation, **you can supply an optional `expr` property** to evaluate for the dimension. + +### Dimensions in action + +- 👀 Let's look at our staging model again and see what fields we have available. + +```SQL +select + + ---------- ids -> entities + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- numerics -> measures + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps -> dimensions + ordered_at + +from source +``` + +- ⏰ For now the only dimension to add is a **time dimension**. +- 🕰️ At least one **primary time dimension** is **required** for any semantic models that **have measures**. +- 1️⃣ We denote this with the `is_primary` property, or if only one time dimension is supplied, it is primary by default. Below, we only have `ordered_at` as a timestamp, so we don't need to specify anything except the maximum granularity we're bucketing to (in this case, day). + +```YAML +dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using [BigQuery](/docs/build/dimensions#time) + type: time + type_params: + time_granularity: day +``` + +:::tip +**Dimensional models**. You may have some models that do not contain measures, just dimensional data that enriches other facts. That's totally fine: a semantic model does not require dimensions or measures, it just needs a primary entity, and, if you do have measures, a primary time dimension. + +We'll discuss an alternate situation, dimensional tables that have static numeric values like supply costs or tax rates but no time dimensions, later in the Guide. +::: + +- 🔢 We can also **make a dimension out of a numeric column** that would typically be a measure. +- 🪣 Using `expr` we can **create buckets of values that we label** for our dimension. We'll add one of these in for labeling 'large orders' as any order total over $50. + +```YAML +... +dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end +... +``` + +## Making our measures + +- 📏 Measures are the final component of a semantic model. They describe the **numeric values that we want to aggregate**. +- 🧱 Measures form **the building blocks of metrics**, with entities and dimensions helping us combine, group, and filter those metrics correctly. +- 🏃 You can think of them as something like the **verbs of a semantic model**. + +### Measures in action + +- 👀 Let's look at **our staging model** one last time and see what **fields we want to measure**. + +```SQL +select + + ---------- ids -> entities + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- numerics -> measures + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps -> dimensions + ordered_at + +from source +``` + +- ➕ Here `order_total` and `tax_paid` are the **columns we want as measures**. +- 📝 We can describe them via the code below, specifying a **name, description, aggregation, and expression**. +- 👍 As before, MetricFlow defaults to the **name being the name of a column when no expression is supplied**.
+- 🧮 [Many different aggregations](https://docs.getdbt.com/docs/build/measures#aggregation) are available to us. Here we just want sums. + +```YAML +measures: + - name: order_total + description: The total amount for each order including taxes. + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + +- 🆕 We can also **create new measures using expressions**, for instance adding a count of individual orders as below. + +```YAML + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum +``` + +## Validating configs + +Our completed code should look like this: our first semantic model! + +```YAML +semantic_models: + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + + model: ref('stg_orders') + + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end + + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + +- 🦺 We can check that it's a valid configuration and works with the real data our dbt project is generating by using the `mf validate-configs` command. This will: + 1. **Parse the semantic manifest** our configuration describes out of the dbt project. + 2. Validate the **internal semantics** of the manifest as described by our code. + 3. Validate the **external semantics** of the manifest against your data warehouse (e.g. making sure that a column specified as a dimension exists on the proper table). + +## Review and next steps + +Let's review the basics of semantic models: + +- 🧱 Consist of **entities, dimensions, and measures**. +- 🫂 Describe the **semantics and relationships of objects** in the warehouse. +- 1️⃣ Correspond to a **single logical model** in your dbt project. + +Next up, let's use our new semantic model to **build a metric**! diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md new file mode 100644 index 00000000000..cd0efdc9e64 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md @@ -0,0 +1,41 @@ +--- +title: "Building metrics" +description: Getting started with dbt and MetricFlow +hoverSnippet: Learn how to get started with dbt and MetricFlow +--- + +## How to build metrics + +- 💹 We'll start with one of the most important metrics for any business: **revenue**. +- 📖 For now, our metric for revenue will be **defined as the sum of order totals excluding tax**. +- 🆕 Let's create a file called `metrics.yml` in our marts folder for now to write our first metric in. + +## Defining revenue + +- 🔢 Metrics have four basic properties: + - `name:` We'll use 'revenue' to reference this metric. + - `description:` For documentation. + - `label:` The display name for the metric in downstream tools.
+ - `type:` One of `simple`, `ratio`, or `derived`. +- 🎛️ Each type has different `type_params`. +- 🛠️ We'll build a **simple metric** first to get the hang of it, and move on to ratio and derived metrics later. +- 📏 Simple metrics are built on a **single measure defined as a type parameter**. +- 🔜 Defining **measures as their own distinct component** on semantic models is critical to allowing the **flexibility of more advanced metrics**, though simple metrics act mainly as **pass-throughs that provide filtering** and labeling options. A `create_metric` option for measures is coming in the next version of MetricFlow to **save you writing extra code** for simple metrics that make no changes to the underlying measure. + +```YAML +metrics: + - name: revenue + description: Sum of the order total. + label: Revenue + type: simple + type_params: + measure: order_total +``` + +## Query your metric + +- It's best practice to run `dbt parse && mf validate-configs` any time we update our semantic layer code. +- If everything passes, we can start querying this metric with `mf query`! +- `mf query` is not how you would use the tool in production; that's handled by the dbt Cloud Semantic Layer's features. It's available for testing results of various metric queries in development, exactly as we're using it now. +- Try `mf query --metrics revenue --group-by metric_time__day` and see a preview of the data come back. +- Note the structure of the above query. We select the metric(s) we want and the dimensions to group them by — we use dunders (double underscores, e.g. `metric_time__[time bucket]`) to designate time dimensions or other non-unique dimensions that need a specified entity path to resolve (e.g. if you have an orders location dimension and an employee location dimension both named 'location', you would need dunders to specify `orders__location` or `employee__location`). diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md new file mode 100644 index 00000000000..b2efb39e9fc --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md @@ -0,0 +1,242 @@ +--- +title: "Refactor an existing mart" +description: Getting started with dbt and MetricFlow +hoverSnippet: Learn how to get started with dbt and MetricFlow +--- + +## A new approach + +We've covered the basics; now it's time to dig into the fun and messy part: how do we refactor an existing mart in dbt into semantic models and metrics? + +Let's look at the differences we can observe in how we might approach this with MetricFlow supercharging dbt versus how we work without a Semantic Layer. These differences can then inform our structure. + +- 🍊 In dbt, we tend to create **highly denormalized datasets** that bring **everything you want around a certain entity or process into a single table**. +- 💜 The problem is, this **limits the dimensionality available to MetricFlow**. The more we pre-compute and 'freeze' into place, the less flexible our data is. +- 🚰 In MetricFlow, we ideally want **highly normalized**, star schema-like data that then allows MetricFlow to shine as a **denormalization engine**.
+- ∞ Another way to think about this is that instead of moving down a list of requested priorities trying to pre-make as many combinations of our marts as possible — increasing lines of code and complexity — we can **let MetricFlow present every combination possible without specifically coding it**. +- 🏗️ To resolve these approaches optimally, we'll need to shift some **fundamental aspects of our modeling strategy**. + +## Refactor steps outlined + +We recommend an incremental implementation process that looks something like this: + +1. 👉 Identify **an important output** (a revenue chart on a dashboard, for example) and the mart model(s) that supplies this output. +2. 🔍 Examine all the **entities that are components** of this mart (for instance, an orders mart may include customers, shipping, and product data). +3. 🛠️ **Build semantic models and metrics** for all the required components. +4. 👯 Create a **clone of the output** on top of the Semantic Layer. +5. 💻 Audit to **ensure you get accurate outputs**. +6. 💎 Use `mf list dimensions --metrics [metric_name]` to check that your refactoring is increasing dimensionality (flexibility). +7. 👉 Identify **any other outputs** that point to the mart and **move them to the Semantic Layer**. +8. ✌️ Put a **deprecation plan** in place for the mart. + +You would then **continue this process** on other outputs and marts moving down a list of **priorities**. Each model as you go along will be faster and easier, as you'll **reuse many of the same components** that will already have been semantically modeled. + +## Let's make a `revenue` metric + +So far, to simplify things as we build new mental models for MetricFlow, we've been working in new files pointing at a staging model. In reality, unless you're implementing MetricFlow in a green-field dbt project, you probably are going to have some refactoring to do. So let's get into that in detail. + +1. 📚 Per the above steps, we've identified our target. Now we need to identify all the components we need: these will be all the 'import' CTEs at the top of our mart. Looking at `orders` and `order_items`, the likely models to generate revenue, we see we'll need: `orders`, `order_items`, `products`, `locations`, and `supplies`. +2. 🗺️ We'll next make semantic models for all of these. Let's walk through a straightforward conversion first with `locations`. +3. ⛓️ We'll want to first decide if we need to do any joining to get this into the shape we want for our semantic model. The biggest determinants of this are two factors: + - 📏 Does this semantic model **contain measures**? + - 🕥 Does this semantic model have a **primary timestamp**? + - 🫂 If a semantic model **has measures but no timestamp** (for example, supplies in the example project, which has static costs of supplies), you'll likely want to **sacrifice some normalization and join it onto another model** that has a primary timestamp to allow for metric aggregation. +4. 🔄 If we _don't_ need any joins, we'll just go straight to the staging model for our semantic model's `ref`. Locations does have a `tax_rate` measure, but it also has an `opened_at` timestamp, so we can go **straight to the staging model** here. +5. 🥇 We specify our **primary entity** (based on `location_id`), dimensions (one categorical, `location_name`, and one **primary time dimension** `opened_at`), and lastly our measures, in this case just `average_tax_rate`. + + ```YAML + semantic_models: + - name: locations + description: | + Location dimension table.
The grain of the table is one row per location. + model: ref('stg_locations') + entities: + - name: location + type: primary + expr: location_id + dimensions: + - name: location_name + type: categorical + - name: opened_at + expr: date_trunc('day', opened_at) + type: time + type_params: + time_granularity: day + measures: + - name: average_tax_rate + description: Average tax rate. + expr: tax_rate + agg: avg + ``` + +## Semantic and logical interaction + +Now, let's tackle a thornier situation. Products and supplies both have dimensions and measures but no time dimension. Products has a one-to-one relationship with `order_items`, enriching that table, which is itself just a mapping table of products to orders. Additionally, products have a one-to-many relationship with supplies. The high-level ERD looks like the diagram below. + + + +So to calculate, for instance, the cost of ingredients and supplies for a given order, we'll need to do some joining and aggregating, but again we **lack a time dimension for products and supplies**. This is the signal to us that we'll **need to build a logical mart** and point our semantic model at that. + +:::tip +**dbt 🧡 MetricFlow.** This is where integrating your semantic definitions into your dbt project really starts to pay dividends. The interaction between the logical and semantic layers is so dynamic, you either need to house them in one codebase or facilitate a lot of cross-project communication and dependency. +::: + +1. 🎯 To start, let's aim at building a table at the `order_items` grain. We can aggregate supply costs up, map over the fields we want from products, such as price, and bring the `ordered_at` timestamp we need over from the orders table. We'll write the following code in `models/marts/order_items.sql`. + + ```SQL + {{ + config( + materialized = 'table', + ) + }} + + with + + order_items as ( + + select * from {{ ref('stg_order_items') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders')}} + + ), + + products as ( + + select * from {{ ref('stg_products') }} + + ), + + supplies as ( + + select * from {{ ref('stg_supplies') }} + + ), + + order_supplies_summary as ( + + select + product_id, + sum(supply_cost) as supply_cost + + from supplies + + group by 1 + ), + + joined as ( + + select + order_items.*, + products.product_price, + order_supplies_summary.supply_cost, + products.is_food_item, + products.is_drink_item, + orders.ordered_at + + from order_items + + left join orders on order_items.order_id = orders.order_id + + left join products on order_items.product_id = products.product_id + + left join order_supplies_summary on order_items.product_id = order_supplies_summary.product_id + + ) + + select * from joined + ``` + +2. 🏗️ Now we've got a table that looks more like what we want to feed into MetricFlow. Next, we'll **build a semantic model on top of this new mart** in `models/marts/order_items.yml`. Again, we'll identify our **entities, then dimensions, then measures**. + + ```YAML + semantic_models: + # The name of the semantic model. + - name: order_items + defaults: + agg_time_dimension: ordered_at + description: | + Items contained in each order. The grain of the table is one row per order item.
+ model: ref('order_items') + entities: + - name: order_item + type: primary + expr: order_item_id + - name: order_id + type: foreign + expr: order_id + - name: product + type: foreign + expr: product_id + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + type: time + type_params: + time_granularity: day + - name: is_food_item + type: categorical + - name: is_drink_item + type: categorical + measures: + - name: revenue + description: The revenue generated for each order item. Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: product_price + - name: food_revenue + description: The revenue generated from food for each order item. Revenue is calculated as a sum of revenue associated with each food product in an order. + agg: sum + expr: case when is_food_item = 1 then product_price else 0 end + - name: drink_revenue + description: The revenue generated from drinks for each order item. Revenue is calculated as a sum of revenue associated with each drink product in an order. + agg: sum + expr: case when is_drink_item = 1 then product_price else 0 end + - name: median_revenue + description: The median revenue generated for each order item. + agg: median + expr: product_price + ``` + +3. 📏 Finally, let's **build a simple revenue metric** on top of our semantic model now. + + ```YAML + metrics: + - name: revenue + description: Sum of the product revenue for each order item. Excludes tax. + type: simple + label: Revenue + type_params: + measure: revenue + ``` + +## Checking our work + +- 🔍 We'll always start our **auditing** with a `dbt parse && mf validate-configs` to **ensure our code works** before we examine its output. +- 👯 If everything's working there, we'll move on to trying out an `mf query` that **replicates the logic of the output** we're trying to refactor. +- 💸 For our example, we want to **audit monthly revenue**. To do that, we'd run the query below. You can [read more about the MetricFlow CLI](https://docs.getdbt.com/docs/build/metricflow-cli). + +### Example query + +```shell +mf query --metrics revenue --group-by metric_time__month +``` + +### Example query results + +```shell +✔ Success 🦄 - query completed after 1.02 seconds +| METRIC_TIME__MONTH | REVENUE | +|:---------------------|----------:| +| 2016-09-01 00:00:00 | 17032.00 | +| 2016-10-01 00:00:00 | 20684.00 | +| 2016-11-01 00:00:00 | 26338.00 | +| 2016-12-01 00:00:00 | 10685.00 | +``` + +- Try introducing some other dimensions from the semantic models into the `group-by` arguments to get a feel for this command. + +## An alternate approach + +If you **don't have capacity to refactor** some of your marts, they can **still benefit from the Semantic Layer**. The above process is about **maximizing dimensionality** for the long term. In the short term, making your **marts as-is available to MetricFlow** unlocks greatly increased functionality. For an example of this quicker approach, check out the `customers` SQL and YAML files on the `main` branch. This displays a **typical denormalized dbt mart** being hooked into MetricFlow.
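To give a rough sense of the shape this quicker approach takes, below is a minimal sketch of a semantic model defined directly on top of an already-denormalized mart. The model and column names used here (`customers`, `customer_id`, `customer_type`, `first_ordered_at`, `count_lifetime_orders`) are hypothetical stand-ins for illustration, not the exact code you'll find on the `main` branch.

```YAML
semantic_models:
  - name: customers
    description: |
      Denormalized customer mart, one row per customer.
      Sketch only: column names are hypothetical, not the exact code on the `main` branch.
    model: ref('customers')
    defaults:
      # pre-computed time column on the mart used as the aggregation time dimension
      agg_time_dimension: first_ordered_at
    entities:
      - name: customer
        type: primary
        expr: customer_id
    dimensions:
      - name: first_ordered_at
        type: time
        type_params:
          time_granularity: day
      - name: customer_type
        type: categorical
    measures:
      - name: count_lifetime_orders
        description: Sum of each customer's pre-computed lifetime order count.
        agg: sum
```

Because the mart has already 'frozen' its joins and aggregations into place, the dimensionality you get this way is narrower than with the fully refactored approach above, but it's a quick way to start querying existing marts through MetricFlow while you work through your refactoring priorities.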
diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md new file mode 100644 index 00000000000..fe7438b5800 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md @@ -0,0 +1,79 @@ +--- +title: "More advanced metrics" +description: Getting started with dbt and MetricFlow +hoverSnippet: Learn how to get started with dbt and MetricFlow +--- + +## More advanced metric types + +We're not limited to just passing measures through to our metrics; we can also _combine_ measures to model more advanced metrics. + +- 🍊 **Ratio** metrics are, as the name implies, about **comparing two metrics as a numerator and a denominator** to form a new metric, for instance, the percentage of order items that are food items instead of drinks. +- 🧱 **Derived** metrics are when we want to **write an expression** that calculates a metric **using multiple metrics**. A classic example here is our gross profit calculated by subtracting costs from revenue. +- ➕ **Cumulative** metrics calculate all of a **measure over a given window**, such as the past week, or if no window is supplied, the all-time total of that measure. + +## Ratio metrics + +- 🔢 We need to establish one metric that will be our **numerator**, and one that will be our **denominator**. +- 🥪 Let's calculate the **percentage** of our Jaffle Shop revenue that **comes from food items**. +- 💰 We already have our denominator, revenue, but we'll want to **make a new metric for our numerator** called `food_revenue`. + +```YAML + - name: food_revenue + description: The revenue from food in each order. + label: Food Revenue + type: simple + type_params: + measure: revenue + filter: | + {{ Dimension('order__is_food_order') }} = true +``` + +- 📝 Now we can set up our ratio metric. + +```YAML +- name: food_revenue_pct + description: The % of order revenue from food. + label: Food Revenue % + type: ratio + type_params: + numerator: food_revenue + denominator: revenue +``` + +## Derived metrics + +- 🆙 Now let's really have some fun. One of the most important metrics for any business is not just revenue, but _revenue growth_. Let's use a derived metric to build month-over-month revenue growth. +- ⚙️ A derived metric has a couple of key components: + - 📚 A list of metrics to build on. These can be manipulated and filtered in various ways; here we'll use the `offset_window` property to lag by a month. + - 🧮 An expression that performs a calculation with these metrics. +- With these parts we can assemble complex logic that would otherwise need to be 'frozen' in logical models. + +```YAML +- name: revenue_growth_mom + description: "Percentage growth of revenue compared to 1 month ago. Excludes tax" + type: derived + label: Revenue Growth % M/M + type_params: + expr: (current_revenue - revenue_prev_month) * 100 / revenue_prev_month + metrics: + - name: revenue + alias: current_revenue + - name: revenue + offset_window: 1 month + alias: revenue_prev_month +``` + +## Cumulative metrics + +- ➕ Lastly, let's build a **cumulative metric**. In keeping with our theme of business priorities, let's continue with revenue and build an **all-time revenue metric** for any given time window. +- 🪟 All we need to do is indicate the type is `cumulative` and not supply a `window` in the `type_params`; omitting the window indicates we want a cumulative total over the entire time period our end users select.
+ +```YAML +- name: cumulative_revenue + description: The cumulative revenue for all orders. + label: Cumulative Revenue (All Time) + type: cumulative + type_params: + measure: revenue +``` diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md new file mode 100644 index 00000000000..a1062721177 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md @@ -0,0 +1,34 @@ +--- +title: "Best practices" +description: Getting started with dbt and MetricFlow +hoverSnippet: Learn how to get started with dbt and MetricFlow +--- + +## Putting it all together + +- 📊 We've **created semantic models and metrics** for basic coverage of a key business area. +- 🔁 In doing so we've **refactored a 'static' mart**, giving it a dynamic, flexible new life in the Semantic Layer. +- 🗺️ We encourage you to **explore the `main` branch** of the [example project repo](https://github.com/dbt-labs/jaffle-sl-template) to see even more metrics and semantic models in action within a project fully ported to the Semantic Layer. + +## Best practices + +- ✅ **Prefer normalization** when possible to allow MetricFlow to denormalize dynamically for end users. +- ✅ Use **marts to denormalize** when needed, for instance grouping tables together into richer components, or getting measures on dimensional tables attached to a table with a time spine. +- ✅ When source data is **well normalized**, you can **build semantic models on top of staging models**. +- ✅ **Prefer** computing values in **measures and metrics** when possible as opposed to in fixed marts. +- ❌ **Don't directly refactor the code you have in production**; build in parallel so you can audit the Semantic Layer output and deprecate old marts gracefully. + +## Key commands + +- 🔑 Use `dbt parse && mf validate-configs` to generate a semantic manifest and ensure it works with your data. +- 🔑 Use `mf list dimensions --metrics [metric name]` to check that you're increasing dimensionality as you progress. +- 🔑 Use `mf query [query options]` to preview the output from your metrics as you develop. + +## Next steps + +- 🗺️ Map out a clear plan for your dbt project to **incrementally adopt the Semantic Layer**. +- 🤗 Get involved in the community and ask questions, **help craft best practices**, and share your progress in building a dbt Semantic Layer. + +The dbt Semantic Layer is the biggest paradigm shift thus far in the young practice of analytics engineering. It's ready to provide value right away, but is most impactful if you move your project towards increasing normalization, and allow MetricFlow to do the denormalization for you with maximum dimensionality. + +We will be releasing more resources soon covering implementation of the Semantic Layer in dbt Cloud with various integrated BI tools. This is just the beginning; hopefully this guide has given you a path forward for building your data platform in this new era.
diff --git a/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md b/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md index 1bbb628b73d..d1e78231e57 100644 --- a/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md +++ b/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md @@ -14,9 +14,9 @@ Building a great dbt project is an inherently collaborative endeavor, bringing t Famously, Steve Jobs [wore the same outfit everyday](https://images.squarespace-cdn.com/content/v1/5453c539e4b02ab5398ffc8f/1580381503218-E56FQDNFL1P4OBLQWHWW/ke17ZwdGBToddI8pDm48kJKedFpub2aPqa33K4gNUDwUqsxRUqqbr1mOJYKfIPR7LoDQ9mXPOjoJoqy81S2I8N_N4V1vUb5AoIIIbLZhVYxCRW4BPu10St3TBAUQYVKcxb5ZTIyC_D49_DDQq2Sj8YVGtM7O1i4h5tvKa2lazN4nGUQWMS_WcPM-ztWbVr-c/steve_jobs_outfit.jpg) to reduce decision fatigue. You can think of this guide similarly, as a black turtleneck and New Balance sneakers for your company’s dbt project. A dbt project’s power outfit, or more accurately its structure, is composed not of fabric but of files, folders, naming conventions, and programming patterns. How you label things, group them, split them up, or bring them together — the system you use to organize the [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) encoded in your dbt project — this is your project’s structure. -This guide is just a starting point. You may decide that you prefer Birkenstocks or a purple hoodie for your project over Jobs-ian minimalism. That's fine. What's important is that you think through the reasoning for those changes in your organization, explicitly declare them in a thorough, accessible way for all contributors, and above all *stay consistent*. +This guide is just a starting point. You may decide that you prefer Birkenstocks or a purple hoodie for your project over Jobs-ian minimalism. That's fine. What's important is that you think through the reasoning for those changes in your organization, explicitly declare them in a thorough, accessible way for all contributors, and above all _stay consistent_. -One foundational principle that applies to all dbt projects though, is the need to establish a cohesive arc moving data from *source-conformed* to *business-conformed*. Source-conformed data is shaped by external systems out of our control, while business-conformed data is shaped by the needs, concepts, and definitions we create. No matter what patterns or conventions you define within your project, this process remains the essential purpose of the transformation layer, and dbt as your tool within it. This guide is an update to a seminal analytics engineering [post of the same name](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) by the great Claire Carroll, and while some of the details have changed over time (as anticipated in that post) this fundamental trajectory holds true. Moving forward, this guide will be iteratively updated as new tools expand our viewpoints, new experiences sharpen our vision, and new voices strengthen our perspectives, but always in service of that aim. +One foundational principle that applies to all dbt projects though, is the need to establish a cohesive arc moving data from _source-conformed_ to _business-conformed_. Source-conformed data is shaped by external systems out of our control, while business-conformed data is shaped by the needs, concepts, and definitions we create. 
No matter what patterns or conventions you define within your project, this process remains the essential purpose of the transformation layer, and dbt as your tool within it. This guide is an update to a seminal analytics engineering [post of the same name](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) by the great Claire Carroll, and while some of the details have changed over time (as anticipated in that post) this fundamental trajectory holds true. Moving forward, this guide will be iteratively updated as new tools expand our viewpoints, new experiences sharpen our vision, and new voices strengthen our perspectives, but always in service of that aim. ### Learning goals @@ -24,7 +24,7 @@ This guide has three main goals: - Thoroughly cover our most up-to-date recommendations on how to structure typical dbt projects - Illustrate these recommendations with comprehensive examples -- At each stage, explain *why* we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs +- At each stage, explain _why_ we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs You should walk away from this guide with a deeper mental model of how the components of a dbt project fit together, such that purpose and principles of analytics engineering feel more clear and intuitive. @@ -33,7 +33,7 @@ By approaching our structure intentionally, we’ll gain a better understanding Our hope is that by deepening your sense of the connections between these patterns and the principles they flow from, you'll be able to translate them to fit your specific needs and craft customized documentation for your team to act on. :::info Example project. -This guide walks through our recommendations using a very simple dbt project — similar to the one used for the Getting Started guide and many other demos — from a fictional company called the Jaffle Shop. You can read more about [jaffles](https://en.wiktionary.org/wiki/jaffle) if you want (they *are* a real thing), but that context isn’t important to understand the structure. We encourage you to follow along, try things out, make changes, and take notes on what works or doesn't work for you along the way. +This guide walks through our recommendations using a very simple dbt project — similar to the one used for the Getting Started guide and many other demos — from a fictional company called the Jaffle Shop. You can read more about [jaffles](https://en.wiktionary.org/wiki/jaffle) if you want (they _are_ a real thing), but that context isn’t important to understand the structure. We encourage you to follow along, try things out, make changes, and take notes on what works or doesn't work for you along the way. ::: We'll get a deeper sense of our project as we move through the guide, but for now we just need to know that the Jaffle Shop is a restaurant selling jaffles that has two main data sources: @@ -46,17 +46,17 @@ We'll get a deeper sense of our project as we move through the guide, but for no We'll walk through our topics in the same order that our data would move through transformation: 1. Dig into how we structure the files, folders, and models for our three primary layers in the `models` directory, which build on each other: - 1. **Staging** — creating our atoms, our initial modular building blocks, from source data - 2. 
**Intermediate** — stacking layers of logic with clear and specific purposes to prepare our staging models to join into the entities we want - 3. **Marts** — bringing together our modular pieces into a wide, rich vision of the entities our organization cares about + 1. **Staging** — creating our atoms, our initial modular building blocks, from source data + 2. **Intermediate** — stacking layers of logic with clear and specific purposes to prepare our staging models to join into the entities we want + 3. **Marts** — bringing together our modular pieces into a wide, rich vision of the entities our organization cares about 2. Explore how these layers fit into the rest of the project: - 1. Review the overall structure comprehensively - 2. Expand on YAML configuration in-depth - 3. Discuss how to use the other folders in a dbt project: `tests`, `seeds`, and `analyses` + 1. Review the overall structure comprehensively + 2. Expand on YAML configuration in-depth + 3. Discuss how to use the other folders in a dbt project: `tests`, `seeds`, and `analyses` Below is the complete file tree of the project we’ll be working through. Don’t worry if this looks like a lot of information to take in at once - this is just to give you the full vision of what we’re building towards. We’ll focus in on each of the sections one by one as we break down the project’s structure. -```markdown +```shell jaffle_shop ├── README.md ├── analyses diff --git a/website/docs/guides/best-practices/how-we-structure/2-staging.md b/website/docs/guides/best-practices/how-we-structure/2-staging.md index a14c5c8992b..bcb589508e5 100644 --- a/website/docs/guides/best-practices/how-we-structure/2-staging.md +++ b/website/docs/guides/best-practices/how-we-structure/2-staging.md @@ -14,7 +14,7 @@ We'll use an analogy for working with dbt throughout this guide: thinking modula Let's zoom into the staging directory from our `models` file tree [in the overview](/guides/best-practices/how-we-structure/1-guide-overview) and walk through what's going on here. -```markdown +```shell models/staging ├── jaffle_shop │ ├── _jaffle_shop__docs.md @@ -36,7 +36,7 @@ models/staging - ❌ **Subdirectories based on loader.** Some people attempt to group by how the data is loaded (Fivetran, Stitch, custom syncs), but this is too broad to be useful on a project of any real size. - ❌ **Subdirectories based on business grouping.** Another approach we recommend against is splitting up by business groupings in the staging layer, and creating subdirectories like 'marketing', 'finance', etc. A key goal of any great dbt project should be establishing a single source of truth. By breaking things up too early, we open ourselves up to creating overlap and conflicting definitions (think marketing and financing having different fundamental tables for orders). We want everybody to be building with the same set of atoms, so in our experience, starting our transformations with our staging structure reflecting the source system structures is the best level of grouping for this step. - **File names.** Creating a consistent pattern of file naming is [crucial in dbt](https://docs.getdbt.com/blog/on-the-importance-of-naming). File names must be unique and correspond to the name of the model when selected and created in the warehouse. We recommend putting as much clear information into the file name as possible, including a prefix for the layer the model exists in, important grouping information, and specific information about the entity or transformation in the model. 
- - ✅ `stg_[source]__[entity]s.sql` - the double underscore between source system and entity helps visually distinguish the separate parts in the case of a source name having multiple words. For instance, `google_analytics__campaigns` is always understandable, whereas to somebody unfamiliar `google_analytics_campaigns` could be `analytics_campaigns` from the `google` source system as easily as `campaigns` from the `google_analytics` source system. Think of it like an [oxford comma](https://www.youtube.com/watch?v=P_i1xk07o4g), the extra clarity is very much worth the extra punctuation. + - ✅ `stg_[source]__[entity]s.sql` - the double underscore between source system and entity helps visually distinguish the separate parts in the case of a source name having multiple words. For instance, `google_analytics__campaigns` is always understandable, whereas to somebody unfamiliar `google_analytics_campaigns` could be `analytics_campaigns` from the `google` source system as easily as `campaigns` from the `google_analytics` source system. Think of it like an [oxford comma](https://www.youtube.com/watch?v=P_i1xk07o4g), the extra clarity is very much worth the extra punctuation. - ❌ `stg_[entity].sql` - might be specific enough at first, but will break down in time. Adding the source system into the file name aids in discoverability, and allows understanding where a component model came from even if you aren't looking at the file tree. - ✅ **Plural.** SQL, and particularly SQL in dbt, should read as much like prose as we can achieve. We want to lean into the broad clarity and declarative nature of SQL when possible. As such, unless there’s a single order in your `orders` table, plural is the correct way to describe what is in a table with multiple rows. @@ -77,7 +77,7 @@ renamed as ( -- numerics amount as amount_cents, amount / 100.0 as amount, - + -- booleans case when status = 'successful' then true @@ -102,22 +102,23 @@ select * from renamed - ✅ **Type casting** - ✅ **Basic computations** (e.g. cents to dollars) - ✅ **Categorizing** (using conditional logic to group values into buckets or booleans, such as in the `case when` statements above) - - ❌ **Joins** — the goal of staging models is to clean and prepare individual source conformed concepts for downstream usage. We're creating the most useful version of a source system table, which we can use as a new modular component for our project. In our experience, joins are almost always a bad idea here — they create immediate duplicated computation and confusing relationships that ripple downstream — there are occasionally exceptions though (see [base models](guides/best-practices/how-we-structure/2-staging#staging-other-considerations) below). + - ❌ **Joins** — the goal of staging models is to clean and prepare individual source-conformed concepts for downstream usage. We're creating the most useful version of a source system table, which we can use as a new modular component for our project. In our experience, joins are almost always a bad idea here — they create immediate duplicated computation and confusing relationships that ripple downstream — there are occasionally exceptions though (refer to [base models](#staging-other-considerations) for more info). - ❌ **Aggregations** — aggregations entail grouping, and we're not doing that at this stage. 
Remember - staging models are your place to create the building blocks you’ll use all throughout the rest of your project — if we start changing the grain of our tables by grouping in this layer, we’ll lose access to source data that we’ll likely need at some point. We just want to get our individual concepts cleaned and ready for use, and will handle aggregating values downstream. -- ✅ **Materialized as views.** Looking at a partial view of our `dbt_project.yml` below, we can see that we’ve configured the entire staging directory to be materialized as views. As they’re not intended to be final artifacts themselves, but rather building blocks for later models, staging models should typically be materialized as views for two key reasons: +- ✅ **Materialized as views.** Looking at a partial view of our `dbt_project.yml` below, we can see that we’ve configured the entire staging directory to be materialized as views. As they’re not intended to be final artifacts themselves, but rather building blocks for later models, staging models should typically be materialized as views for two key reasons: + - Any downstream model (discussed more in [marts](/guides/best-practices/how-we-structure/4-marts)) referencing our staging models will always get the freshest data possible from all of the component views it’s pulling together and materializing - It avoids wasting space in the warehouse on models that are not intended to be queried by data consumers, and thus do not need to perform as quickly or efficiently ```yaml # dbt_project.yml - + models: jaffle_shop: staging: +materialized: view ``` -- Staging models are the only place we'll use the [`source` macro](/docs/build/sources), and our staging models should have a 1-to-1 relationship to our source tables. That means for each source system table we’ll have a single staging model referencing it, acting as its entry point — *staging* it — for use downstream. +- Staging models are the only place we'll use the [`source` macro](/docs/build/sources), and our staging models should have a 1-to-1 relationship to our source tables. That means for each source system table we’ll have a single staging model referencing it, acting as its entry point — _staging_ it — for use downstream. :::tip Don’t Repeat Yourself. Staging models help us keep our code DRY. dbt's modular, reusable structure means we can, and should, push any transformations that we’ll always want to use for a given component model as far upstream as possible. This saves us from potentially wasting code, complexity, and compute doing the same transformation more than once. For instance, if we know we always want our monetary values as floats in dollars, but the source system is integers and cents, we want to do the division and type casting as early as possible so that we can reference it rather than redo it repeatedly downstream. @@ -128,94 +129,96 @@ This is a welcome change for many of us who have become used to applying the sam ### Staging: Other considerations - **Base models when joins are necessary to stage concepts.** Sometimes, in order to maintain a clean and DRY staging layer we do need to implement some joins to create a solid concept for our building blocks. In these cases, we recommend creating a sub-directory in the staging directory for the source system in question and building `base` models. 
These have all the same properties that would normally be in the staging layer, they will directly source the raw data and do the non-joining transformations, then in the staging models we’ll join the requisite base models. The most common use cases for building a base layer under a staging folder are: + - ✅ **Joining in separate delete tables**. Sometimes a source system might store deletes in a separate table. Typically we’ll want to make sure we can mark or filter out deleted records for all our component models, so we’ll need to join these delete records up to any of our entities that follow this pattern. This is the example shown below to illustrate. ```sql -- base_jaffle_shop__customers.sql - + with - + source as ( - + select * from {{ source('jaffle_shop','customers') }} - + ), - + customers as ( - + select id as customer_id, first_name, last_name - + from source - + ) - + select * from customers ``` ```sql -- base_jaffle_shop__deleted_customers.sql - + with - + source as ( - + select * from {{ source('jaffle_shop','customer_deletes') }} - + ), - + deleted_customers as ( - + select id as customer_id, deleted as deleted_at - + from source - + ) - + select * from deleted_customers ``` ```sql -- stg_jaffle_shop__customers.sql - + with - + customers as ( - + select * from {{ ref('base_jaffle_shop__customers') }} - + ), - + deleted_customers as ( - + select * from {{ ref('base_jaffle_shop__deleted_customers') }} - + ), - + join_and_mark_deleted_customers as ( - + select customers.*, case when deleted_customers.deleted_at is not null then true else false end as is_deleted - + from customers - + left join deleted_customers on customers.customer_id = deleted_customers.customer_id - + ) - + select * from join_and_mark_deleted_customers ``` - - ✅ **Unioning disparate but symmetrical sources**. A typical example here would be if you operate multiple ecommerce platforms in various territories via a SaaS platform like Shopify. You would have perfectly identical schemas, but all loaded separately into your warehouse. In this case, it’s easier to reason about our orders if *all* of our shops are unioned together, so we’d want to handle the unioning in a base model before we carry on with our usual staging model transformations on the (now complete) set — you can dig into [more detail on this use case here](https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921). -- **[Codegen](https://github.com/dbt-labs/dbt-codegen) to automate staging table generation.** It’s very good practice to learn to write staging models by hand, they’re straightforward and numerous, so they can be an excellent way to absorb the dbt style of writing SQL. Also, we’ll invariably find ourselves needing to add special elements to specific models at times — for instance, in one of the situations above that require base models — so it’s helpful to deeply understand how they work. Once that understanding is established though, because staging models are built largely following the same rote patterns and need to be built 1-to-1 for each source table in a source system, it’s preferable to start automating their creation. For this, we have the [codegen](https://github.com/dbt-labs/dbt-codegen) package. This will let you automatically generate all the source YAML and staging model boilerplate to speed up this step, and we recommend using it in every project. + - ✅ **Unioning disparate but symmetrical sources**. 
A typical example here would be if you operate multiple ecommerce platforms in various territories via a SaaS platform like Shopify. You would have perfectly identical schemas, but all loaded separately into your warehouse. In this case, it’s easier to reason about our orders if _all_ of our shops are unioned together, so we’d want to handle the unioning in a base model before we carry on with our usual staging model transformations on the (now complete) set — you can dig into [more detail on this use case here](https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921). + +- **[Codegen](https://github.com/dbt-labs/dbt-codegen) to automate staging table generation.** It’s very good practice to learn to write staging models by hand, they’re straightforward and numerous, so they can be an excellent way to absorb the dbt style of writing SQL. Also, we’ll invariably find ourselves needing to add special elements to specific models at times — for instance, in one of the situations above that require base models — so it’s helpful to deeply understand how they work. Once that understanding is established though, because staging models are built largely following the same rote patterns and need to be built 1-to-1 for each source table in a source system, it’s preferable to start automating their creation. For this, we have the [codegen](https://github.com/dbt-labs/dbt-codegen) package. This will let you automatically generate all the source YAML and staging model boilerplate to speed up this step, and we recommend using it in every project. - **Utilities folder.** While this is not in the `staging` folder, it’s useful to consider as part of our fundamental building blocks. The `models/utilities` directory is where we can keep any general purpose models that we generate from macros or based on seeds that provide tools to help us do our modeling, rather than data to model itself. The most common use case is a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source) generated with [the dbt utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). :::info Development flow versus DAG order. diff --git a/website/docs/guides/best-practices/how-we-structure/3-intermediate.md b/website/docs/guides/best-practices/how-we-structure/3-intermediate.md index 5e1db61c49f..0cf44d3cccc 100644 --- a/website/docs/guides/best-practices/how-we-structure/3-intermediate.md +++ b/website/docs/guides/best-practices/how-we-structure/3-intermediate.md @@ -12,7 +12,7 @@ Once we’ve got our atoms ready to work with, we’ll set about bringing them t Let’s take a look at the intermediate layer of our project to understand the purpose of this stage more concretely. -```markdown +```shell models/intermediate └── finance ├── _int_finance__models.yml @@ -22,10 +22,10 @@ models/intermediate - **Folders** - ✅ **Subdirectories based on business groupings.** Much like the staging layer, we’ll house this layer of models inside their own `intermediate` subfolder. Unlike the staging layer, here we shift towards being business-conformed, splitting our models up into subdirectories not by their source system, but by their area of business concern. - **File names** - - `✅ int_[entity]s_[verb]s.sql` - the variety of transformations that can happen inside of the intermediate layer makes it harder to dictate strictly how to name them. The best guiding principle is to think about *verbs* (e.g. `pivoted`, `aggregated_to_user`, `joined`, `fanned_out_by_quantity`, `funnel_created`, etc.) 
in the intermediate layer. In our example project, we use an intermediate model to pivot payments out to the order grain, so we name our model `int_payments_pivoted_to_orders`. It’s easy for anybody to quickly understand what’s happening in that model, even if they don’t know [SQL](https://mode.com/sql-tutorial/). That clarity is worth the long file name. It’s important to note that we’ve dropped the double underscores at this layer. In moving towards business-conformed concepts, we no longer need to separate a system and an entity and simply reference the unified entity if possible. In cases where you need intermediate models to operate at the source system level (e.g. `int_shopify__orders_summed`, `int_core__orders_summed` which you would later union), you’d preserve the double underscores. Some people like to separate the entity and verbs with double underscores as well. That’s a matter of preference, but in our experience, there is often an intrinsic connection between entities and verbs in this layer that make that difficult to maintain. + - `✅ int_[entity]s_[verb]s.sql` - the variety of transformations that can happen inside of the intermediate layer makes it harder to dictate strictly how to name them. The best guiding principle is to think about _verbs_ (e.g. `pivoted`, `aggregated_to_user`, `joined`, `fanned_out_by_quantity`, `funnel_created`, etc.) in the intermediate layer. In our example project, we use an intermediate model to pivot payments out to the order grain, so we name our model `int_payments_pivoted_to_orders`. It’s easy for anybody to quickly understand what’s happening in that model, even if they don’t know [SQL](https://mode.com/sql-tutorial/). That clarity is worth the long file name. It’s important to note that we’ve dropped the double underscores at this layer. In moving towards business-conformed concepts, we no longer need to separate a system and an entity and simply reference the unified entity if possible. In cases where you need intermediate models to operate at the source system level (e.g. `int_shopify__orders_summed`, `int_core__orders_summed` which you would later union), you’d preserve the double underscores. Some people like to separate the entity and verbs with double underscores as well. That’s a matter of preference, but in our experience, there is often an intrinsic connection between entities and verbs in this layer that make that difficult to maintain. :::tip Don’t over-optimize too early! -The example project is very simple for illustrative purposes. This level of division in our post-staging layers is probably unnecessary when dealing with these few models. Remember, our goal is a *single* *source of truth.* We don’t want finance and marketing operating on separate `orders` models, we want to use our dbt project as a means to bring those definitions together! As such, don’t split and optimize too early. If you have less than 10 marts models and aren’t having problems developing and using them, feel free to forego subdirectories completely (except in the staging layer, where you should always implement them as you add new source systems to your project) until the project has grown to really need them. Using dbt is always about bringing simplicity to complexity. +The example project is very simple for illustrative purposes. This level of division in our post-staging layers is probably unnecessary when dealing with these few models. 
Remember, our goal is a _single_ _source of truth._ We don’t want finance and marketing operating on separate `orders` models, we want to use our dbt project as a means to bring those definitions together! As such, don’t split and optimize too early. If you have less than 10 marts models and aren’t having problems developing and using them, feel free to forego subdirectories completely (except in the staging layer, where you should always implement them as you add new source systems to your project) until the project has grown to really need them. Using dbt is always about bringing simplicity to complexity. ::: ### Intermediate: Models @@ -36,27 +36,27 @@ Below is the lone intermediate model from our small example project. This repres -- int_payments_pivoted_to_orders.sql {%- set payment_methods = ['bank_transfer','credit_card','coupon','gift_card'] -%} - -with + +with payments as ( select * from {{ ref('stg_stripe__payments') }} ), - + pivot_and_aggregate_payments_to_order_grain as ( - + select - order_id, + order_id, {% for payment_method in payment_methods -%} - + sum( case when payment_method = '{{ payment_method }}' and - status = 'success' - then amount - else 0 + status = 'success' + then amount + else 0 end ) as {{ payment_method }}_amount, @@ -68,7 +68,7 @@ pivot_and_aggregate_payments_to_order_grain as ( group by 1 ) - + select * from pivot_and_aggregate_payments_to_order_grain ``` @@ -77,15 +77,15 @@ select * from pivot_and_aggregate_payments_to_order_grain - ✅ **Materialized as views in a custom schema with special permissions.** A more robust option is to materialize your intermediate models as views in a specific [custom schema](/docs/build/custom-schemas), outside of your main production schema. This gives you added insight into development and easier troubleshooting as the number and complexity of your models grows, while remaining easy to implement and taking up negligible space. :::tip Keep your warehouse tidy! -There are three interfaces to the organizational knowledge graph we’re encoding into dbt: the DAG, the files and folder structure of our codebase, and the output into the warehouse. As such, it’s really important that we consider that output intentionally! Think of the schemas, tables, and views we’re creating in the warehouse as *part of the UX,* in addition to the dashboards, ML, apps, and other use cases you may be targeting for the data. Ensuring that our output is named and grouped well, and that models not intended for broad use are either not materialized or built into special areas with specific permissions is crucial to achieving this. +There are three interfaces to the organizational knowledge graph we’re encoding into dbt: the DAG, the files and folder structure of our codebase, and the output into the warehouse. As such, it’s really important that we consider that output intentionally! Think of the schemas, tables, and views we’re creating in the warehouse as _part of the UX,_ in addition to the dashboards, ML, apps, and other use cases you may be targeting for the data. Ensuring that our output is named and grouped well, and that models not intended for broad use are either not materialized or built into special areas with specific permissions is crucial to achieving this. ::: - Intermediate models’ purposes, as these serve to break up complexity from our marts models, can take as many forms as [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) might require. 
Some of the most common use cases of intermediate models include: - + - ✅ **Structural simplification.** Bringing together a reasonable number (typically 4 to 6) of entities or concepts (staging models, or perhaps other intermediate models) that will be joined with another similarly purposed intermediate model to generate a mart — rather than have 10 joins in our mart, we can join two intermediate models that each house a piece of the complexity, giving us increased readability, flexibility, testing surface area, and insight into our components. - ✅ **Re-graining.** Intermediate models are often used to fan out or collapse models to the right composite grain — if we’re building a mart for `order_items` that requires us to fan out our `orders` based on the `quantity` column, creating a new single row for each item, this would be ideal to do in a specific intermediate model to maintain clarity in our mart and more easily view that our grain is correct before we mix it with other components. - ✅ **Isolating complex operations.** It’s helpful to move any particularly complex or difficult to understand pieces of logic into their own intermediate models. This not only makes them easier to refine and troubleshoot, but simplifies later models that can reference this concept in a more clearly readable way. For example, in the `quantity` fan out example above, we benefit by isolating this complex piece of logic so we can quickly debug and thoroughly test that transformation, and downstream models can reference `order_items` in a way that’s intuitively easy to grasp. :::tip Narrow the DAG, widen the tables. -Until we get to the marts layer and start building our various outputs, we ideally want our DAG to look like an arrowhead pointed right. As we move from source-conformed to business-conformed, we’re also moving from numerous, narrow, isolated concepts to fewer, wider, joined concepts. We’re bringing our components together into wider, richer concepts, and that creates this shape in our DAG. This way when we get to the marts layer we have a robust set of components that can quickly and easily be put into any configuration to answer a variety of questions and serve specific needs. One rule of thumb to ensure you’re following this pattern on an individual model level is allowing multiple *inputs* to a model, but **not** multiple *outputs*. Several arrows going *into* our post-staging models is great and expected, several arrows coming *out* is a red flag. There are absolutely situations where you need to break this rule, but it’s something to be aware of, careful about, and avoid when possible. +Until we get to the marts layer and start building our various outputs, we ideally want our DAG to look like an arrowhead pointed right. As we move from source-conformed to business-conformed, we’re also moving from numerous, narrow, isolated concepts to fewer, wider, joined concepts. We’re bringing our components together into wider, richer concepts, and that creates this shape in our DAG. This way when we get to the marts layer we have a robust set of components that can quickly and easily be put into any configuration to answer a variety of questions and serve specific needs. One rule of thumb to ensure you’re following this pattern on an individual model level is allowing multiple _inputs_ to a model, but **not** multiple _outputs_. Several arrows going _into_ our post-staging models is great and expected, several arrows coming _out_ is a red flag. 
There are absolutely situations where you need to break this rule, but it’s something to be aware of, careful about, and avoid when possible. ::: diff --git a/website/docs/guides/best-practices/how-we-structure/4-marts.md b/website/docs/guides/best-practices/how-we-structure/4-marts.md index 0e22d036e58..e7a0d35c342 100644 --- a/website/docs/guides/best-practices/how-we-structure/4-marts.md +++ b/website/docs/guides/best-practices/how-we-structure/4-marts.md @@ -3,13 +3,17 @@ title: "Marts: Business-defined entities" id: "4-marts" --- -This is the layer where everything comes together and we start to arrange all of our atoms (staging models) and molecules (intermediate models) into full-fledged cells that have identity and purpose. We sometimes like to call this the *entity* *layer* or *concept layer*, to emphasize that all our marts are meant to represent a specific entity or concept at its unique grain. For instance, an order, a customer, a territory, a click event, a payment — each of these would be represented with a distinct mart, and each row would represent a discrete instance of these concepts. Unlike in a traditional Kimball star schema though, in modern data warehousing — where storage is cheap and compute is expensive — we’ll happily borrow and add any and all data from other concepts that are relevant to answering questions about the mart’s core entity. Building the same data in multiple places, as we do with `orders` in our `customers` mart example below, is more efficient in this paradigm than repeatedly rejoining these concepts (this is a basic definition of denormalization in this context). Let’s take a look at how we approach this first layer intended expressly for exposure to end users. +:::info +Our guidance here diverges if you use the dbt Semantic Layer. In a project without the Semantic Layer we recommend you denormalize heavily, per the best practices below. On the other hand, if you're using the Semantic Layer, we want to stay as normalized as possible to allow MetricFlow the most flexibility. Guidance for marts in a Semantic Layer context is on the next page. +::: + +This is the layer where everything comes together and we start to arrange all of our atoms (staging models) and molecules (intermediate models) into full-fledged cells that have identity and purpose. We sometimes like to call this the _entity_ _layer_ or _concept layer_, to emphasize that all our marts are meant to represent a specific entity or concept at its unique grain. For instance, an order, a customer, a territory, a click event, a payment — each of these would be represented with a distinct mart, and each row would represent a discrete instance of these concepts. Unlike in a traditional Kimball star schema though, in modern data warehousing — where storage is cheap and compute is expensive — we’ll happily borrow and add any and all data from other concepts that are relevant to answering questions about the mart’s core entity. Building the same data in multiple places, as we do with `orders` in our `customers` mart example below, is more efficient in this paradigm than repeatedly rejoining these concepts (this is a basic definition of denormalization in this context). Let’s take a look at how we approach this first layer intended expressly for exposure to end users. ### Marts: Files and folders The last layer of our core transformations is below, providing models for both `finance` and `marketing` departments. 
-```markdown +```shell models/marts ├── finance │ ├── _finance__models.yml @@ -24,7 +28,7 @@ models/marts ✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart `customers`, `orders`. Note that for pure marts, there should not be a time dimension (`orders_per_day`) here, that is typically best captured via metrics. -❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as *separate* concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. +❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as _separate_ concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. ### Marts: Models @@ -33,7 +37,7 @@ Finally we’ll take a look at the best practices for models within the marts di ```sql -- orders.sql -with +with orders as ( @@ -68,7 +72,7 @@ select * from orders_and_payments_joined ```sql -- customers.sql -with +with customers as ( @@ -117,21 +121,15 @@ customers_and_customer_orders_joined as ( select * from customers_and_customer_orders_joined ``` -- ✅ **Materialized as tables or incremental models.** Once we reach the marts layer, it’s time to start building not just our logic into the warehouse, but the data itself. This gives end users much faster performance for these later models that are actually designed for their use, and saves us costs recomputing these entire chains of models every time somebody refreshes a dashboard or runs a regression in python. A good general rule of thumb regarding materialization is to always start with a view (as it takes up essentially no storage and always gives you up-to-date results), once that view takes too long to practically *query*, build it into a table, and finally once that table takes too long to *build* and is slowing down your runs, [configure it as an incremental model](https://docs.getdbt.com/docs/build/incremental-models/). As always, start simple and only add complexity as necessary. The models with the most data and compute-intensive transformations should absolutely take advantage of dbt’s excellent incremental materialization options, but rushing to make all your marts models incremental by default will introduce superfluous difficulty. We recommend reading this [classic post from Tristan on the limits of incremental modeling](https://discourse.getdbt.com/t/on-the-limits-of-incrementality/303). +- ✅ **Materialized as tables or incremental models.** Once we reach the marts layer, it’s time to start building not just our logic into the warehouse, but the data itself. 
This gives end users much faster performance for these later models that are actually designed for their use, and saves us costs recomputing these entire chains of models every time somebody refreshes a dashboard or runs a regression in python. A good general rule of thumb regarding materialization is to always start with a view (as it takes up essentially no storage and always gives you up-to-date results), once that view takes too long to practically _query_, build it into a table, and finally once that table takes too long to _build_ and is slowing down your runs, [configure it as an incremental model](https://docs.getdbt.com/docs/build/incremental-models/). As always, start simple and only add complexity as necessary. The models with the most data and compute-intensive transformations should absolutely take advantage of dbt’s excellent incremental materialization options, but rushing to make all your marts models incremental by default will introduce superfluous difficulty. We recommend reading this [classic post from Tristan on the limits of incremental modeling](https://discourse.getdbt.com/t/on-the-limits-of-incrementality/303). - ✅ **Wide and denormalized.** Unlike old school warehousing, in the modern data stack storage is cheap and it’s compute that is expensive and must be prioritized as such, packing these into very wide denormalized concepts that can provide everything somebody needs about a concept as a goal. - ❌ **Too many joins in one mart.** One good rule of thumb when building dbt transformations is to avoid bringing together too many concepts in a single mart. What constitutes ‘too many’ can vary. If you need to bring 8 staging models together with nothing but simple joins, that might be fine. Conversely, if you have 4 concepts you’re weaving together with some complex and computationally heavy window functions, that could be too much. You need to weigh the number of models you’re joining against the complexity of the logic within the mart, and if it’s too much to read through and build a clear mental model of then look to modularize. While this isn’t a hard rule, if you’re bringing together more than 4 or 5 concepts to create your mart, you may benefit from adding some intermediate models for added clarity. Two intermediate models that bring together three concepts each, and a mart that brings together those two intermediate models, will typically result in a much more readable chain of logic than a single mart with six joins. - ✅ **Build on separate marts thoughtfully.** While we strive to preserve a narrowing DAG up to the marts layer, once here things may start to get a little less strict. A common example is passing information between marts at different grains, as we saw above, where we bring our `orders` mart into our `customers` marts to aggregate critical order data into a `customer` grain. Now that we’re really ‘spending’ compute and storage by actually building the data in our outputs, it’s sensible to leverage previously built resources to speed up and save costs on outputs that require similar data, versus recomputing the same views and CTEs from scratch. The right approach here is heavily dependent on your unique DAG, models, and goals — it’s just important to note that using a mart in building another, later mart is okay, but requires careful consideration to avoid wasted resources or circular dependencies. :::tip Marts are entity-grained. 
-The most important aspect of marts is that they contain all of the useful data about a *particular entity* at a granular level. That doesn’t mean we don’t bring in lots of other entities and concepts, like tons of `user` data into our `orders` mart, we do! It just means that individual `orders` remain the core grain of our table. If we start grouping `users` and `orders` along a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source), into something like `user_orders_per_day`, we’re moving past marts into *metrics*. +The most important aspect of marts is that they contain all of the useful data about a _particular entity_ at a granular level. That doesn’t mean we don’t bring in lots of other entities and concepts, like tons of `user` data into our `orders` mart, we do! It just means that individual `orders` remain the core grain of our table. If we start grouping `users` and `orders` along a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source), into something like `user_orders_per_day`, we’re moving past marts into _metrics_. ::: ### Marts: Other considerations - **Troubleshoot via tables.** While stacking views and ephemeral models up until our marts — only building data into the warehouse at the end of a chain when we have the models we really want end users to work with — is ideal in production, it can present some difficulties in development. Particularly, certain errors may seem to be surfacing in our later models that actually stem from much earlier dependencies in our model chain (ancestor models in our DAG that are built before the model throws the errors). If you’re having trouble pinning down where or what a database error is telling you, it can be helpful to temporarily build a specific chain of models as tables so that the warehouse will throw the error where it’s actually occurring. -- **After marts: the activation layer.** In the same way that our staging models are building blocks for our marts, that also offer us direct views into specific source data, our marts are building blocks for our final outputs that also offer direct views into specific ideas. You can use marts directly, but they are equally important as components for building models in the *activation layer* after marts. This is a deep and fast-evolving topic, so we’ll cover this in a separate forthcoming guide that dives into: - - Metrics - - Reverse ETL - - Reporting and dashboards - - Data science and ML - - [Exposures](https://docs.getdbt.com/docs/build/exposures) (how we tie our dbt DAG into all of the above) diff --git a/website/docs/guides/best-practices/how-we-structure/5-semantic-layer-marts.md b/website/docs/guides/best-practices/how-we-structure/5-semantic-layer-marts.md new file mode 100644 index 00000000000..adebc4a63c7 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-structure/5-semantic-layer-marts.md @@ -0,0 +1,48 @@ +--- +title: "Marts for the Semantic Layer" +id: "5-semantic-layer-marts" +--- + +The Semantic Layer alters some fundamental principles of how you organize your project. Using dbt without the Semantic Layer necessitates creating the most useful combinations of your building block components into wide, denormalized marts. On the other hand, the Semantic Layer leverages MetricFlow to denormalize every possible combination of components we've encoded dynamically. As such we're better served to bring more normalized models through from the logical layer into the Semantic Layer to maximize flexibility. 
This section will assume familiarity with the best practices laid out in the [How we build our metrics](https://docs.getdbt.com/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) guide, so check that out first for a more hands-on introduction to the Semantic Layer. + +## Semantic Layer: Files and folders + +- 2️⃣ There are two major factors that alter our recommendations for the Semantic Layer: + - 📝 There is **more YAML** in the form of **semantic models and metrics**. + - ⏫ We may **use a staging model directly** if it forms a complete normalized component, and it will not have a mart at all. +- 💪 This combination means models at **both the staging and marts layer** may participate in the Semantic Layer and use **more powerful, expansive YAML configuration**. +- 🔁 Given this, for projects using the Semantic Layer we recommend a **YAML-file-per-model approach**, as below. + +```shell +models +├── marts +│   ├── customers.sql +│   ├── customers.yml +│   ├── orders.sql +│   └── orders.yml +└── staging + ├── __sources.yml + ├── stg_customers.sql + ├── stg_customers.yml + ├── stg_locations.sql + ├── stg_locations.yml + ├── stg_order_items.sql + ├── stg_order_items.yml + ├── stg_orders.sql + ├── stg_orders.yml + ├── stg_products.sql + ├── stg_products.yml + ├── stg_supplies.sql + └── stg_supplies.yml +``` + +## When to make a mart + +- ❓ If we can go directly to staging models and it's better to serve normalized models to the Semantic Layer, then when, where, and why would we make a mart? + - 🕰️ We have models that have measures but no time dimension to aggregate against. The details of this are laid out in the [Semantic Layer guide](https://docs.getdbt.com/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) but in short, we need a time dimension to aggregate against in MetricFlow. Dimensional tables that + - 🧱 We want to **materialize** our model in various ways. + - 👯 We want to **version** our model. + - 🛒 We have various related models that make more sense as **one wider component**. + - 1️⃣ We have similar models across multiple data sources that make more sense **unioned together**. + - ⌚ We have models in our project we **need to time to refactor** but want to serve up to the Semantic Layer quickly. +- 🌍 Any of the above and more are great reasons to build a mart. Analytics engineering is about **creativity and problem solving**, so these are not prescriptive rules, **there are many reasons to build marts** in any project. The most important takeaway is that you don't **_have to_** if you're using the Semantic Layer. diff --git a/website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md b/website/docs/guides/best-practices/how-we-structure/6-the-rest-of-the-project.md similarity index 93% rename from website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md rename to website/docs/guides/best-practices/how-we-structure/6-the-rest-of-the-project.md index 2a6c7399adb..4082f92b932 100644 --- a/website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md +++ b/website/docs/guides/best-practices/how-we-structure/6-the-rest-of-the-project.md @@ -1,6 +1,6 @@ --- title: "The rest of the project" -id: "5-the-rest-of-the-project" +id: "6-the-rest-of-the-project" description: The rest of the project. displayText: The rest of the project. hoverSnippet: The rest of the project. @@ -10,7 +10,7 @@ hoverSnippet: The rest of the project. 
So far we’ve focused on the `models` folder, the primary directory of our dbt project. Next, we’ll zoom out and look at how the rest of our project files and folders fit in with this structure, starting with how we approach YAML configuration files. -```markdown +```shell models ├── intermediate │ └── finance @@ -51,7 +51,7 @@ When structuring your YAML configuration files in a dbt project, you want to bal - YAML files don’t need unique names in the way that SQL model files do, but including the directory (instead of simply `_sources.yml` in each folder), means you can fuzzy find the right file more quickly. - We’ve recommended several different naming conventions over the years, most recently calling these `schema.yml` files. We’ve simplified to recommend that these simply be labelled based on the YAML dictionary that they contain. - If you utilize [doc blocks](https://docs.getdbt.com/docs/collaborate/documentation#using-docs-blocks) in your project, we recommend following the same pattern, and creating a `_[directory]__docs.md` markdown file per directory containing all your doc blocks for that folder of models. -- ❌ **Config per project.** Some people put *all* of their source and model YAML into one file. While you can technically do this, and while it certainly simplifies knowing what file the config you’re looking for will be in (as there is only one file), it makes it much harder to find specific configurations within that file. We recommend balancing those two concerns. +- ❌ **Config per project.** Some people put _all_ of their source and model YAML into one file. While you can technically do this, and while it certainly simplifies knowing what file the config you’re looking for will be in (as there is only one file), it makes it much harder to find specific configurations within that file. We recommend balancing those two concerns. - ⚠️ **Config per model.** On the other end of the spectrum, some people prefer to create one YAML file per model. This presents less of an issue than a single monolith file, as you can quickly search for files, know exactly where specific configurations exist, spot models without configs (and thus without tests) by looking at the file tree, and various other advantages. In our opinion, the extra files, tabs, and windows this requires creating, copying from, pasting to, closing, opening, and managing creates a somewhat slower development experience that outweighs the benefits. Defining config per directory is the most balanced approach for most projects, but if you have compelling reasons to use config per model, there are definitely some great projects that follow this paradigm. - ✅ **Cascade configs.** Leverage your `dbt_project.yml` to set default configurations at the directory level. Use the well-organized folder structure we’ve created thus far to define the baseline schemas and materializations, and use dbt’s cascading scope priority to define variations to this. For example, as below, define your marts to be materialized as tables by default, define separate schemas for our separate subfolders, and any models that need to use incremental materialization can be defined at the model level. @@ -73,12 +73,12 @@ models: ``` :::tip Define your defaults. -One of the many benefits this consistent approach to project structure confers to us is this ability to cascade default behavior. 
Carefully organizing our folders and defining configuration at that level whenever possible frees us from configuring things like schema and materialization in every single model (not very DRY!) — we only need to configure exceptions to our general rules. Tagging is another area this principle comes into play. Many people new to dbt will rely on tags rather than a rigorous folder structure, and quickly find themselves in a place where every model *requires* a tag. This creates unnecessary complexity. We want to lean on our folders as our primary selectors and grouping mechanism, and use tags to define groups that are *exceptions.* A folder-based selection like **`dbt build --select marts.marketing` is much simpler than trying to tag every marketing-related model, hoping all developers remember to add that tag for new models, and using `dbt build --select tag:marketing`. +One of the many benefits this consistent approach to project structure confers to us is this ability to cascade default behavior. Carefully organizing our folders and defining configuration at that level whenever possible frees us from configuring things like schema and materialization in every single model (not very DRY!) — we only need to configure exceptions to our general rules. Tagging is another area this principle comes into play. Many people new to dbt will rely on tags rather than a rigorous folder structure, and quickly find themselves in a place where every model _requires_ a tag. This creates unnecessary complexity. We want to lean on our folders as our primary selectors and grouping mechanism, and use tags to define groups that are _exceptions._ A folder-based selection like \*\*`dbt build --select marts.marketing` is much simpler than trying to tag every marketing-related model, hoping all developers remember to add that tag for new models, and using `dbt build --select tag:marketing`. ::: ### How we use the other folders -```yaml +```shell jaffle_shop ├── analyses ├── seeds @@ -88,7 +88,7 @@ jaffle_shop │ └── cents_to_dollars.sql ├── snapshots └── tests - └── assert_positive_value_for_total_amount.sql +└── assert_positive_value_for_total_amount.sql ``` We’ve focused heavily thus far on the primary area of action in our dbt project, the `models` folder. As you’ve probably observed though, there are several other folders in our project. While these are, by design, very flexible to your needs, we’ll discuss the most common use cases for these other folders to help get you started. @@ -111,6 +111,6 @@ One important, growing consideration in the analytics engineering ecosystem is h ## Final considerations -Overall, consistency is more important than any of these specific conventions. As your project grows and your experience with dbt deepens, you will undoubtedly find aspects of the above structure you want to change. While we recommend this approach for the majority of projects, every organization is unique! The only dogmatic advice we’ll put forward here is that when you find aspects of the above structure you wish to change, think intently about your reasoning and document for your team *how* and *why* you are deviating from these conventions. To that end, we highly encourage you to fork this guide and add it to your project’s README, wiki, or docs so you can quickly create and customize those artifacts. +Overall, consistency is more important than any of these specific conventions. 
As your project grows and your experience with dbt deepens, you will undoubtedly find aspects of the above structure you want to change. While we recommend this approach for the majority of projects, every organization is unique! The only dogmatic advice we’ll put forward here is that when you find aspects of the above structure you wish to change, think intently about your reasoning and document for your team _how_ and _why_ you are deviating from these conventions. To that end, we highly encourage you to fork this guide and add it to your project’s README, wiki, or docs so you can quickly create and customize those artifacts. Finally, we emphasize that this guide is a living document! It will certainly change and grow as dbt and dbt Labs evolve. We invite you to join in — discuss, comment, and contribute regarding suggested changes or new elements to cover. diff --git a/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md b/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md index 1ea9c064d74..8c61e63b888 100644 --- a/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md +++ b/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md @@ -6,7 +6,10 @@ id: 2-how-we-style-our-sql ## Basics - ☁️ Use [SQLFluff](https://sqlfluff.com/) to maintain these style rules automatically. - - Reference this [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) for the rules we use. + - Customize `.sqlfluff` configuration files to your needs. + - Refer to our [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) for the rules we use in our own projects. + + - Exclude files and directories by using a standard `.sqlfluffignore` file. Learn more about the syntax in the [.sqlfluffignore syntax docs](https://docs.sqlfluff.com/en/stable/configuration.html#id2). - 👻 Use Jinja comments (`{# #}`) for comments that should not be included in the compiled SQL. - ⏭️ Use trailing commas. - 4️⃣ Indents should be four spaces. @@ -22,7 +25,7 @@ id: 2-how-we-style-our-sql - 🔙 Fields should be stated before aggregates and window functions. - 🤏🏻 Aggregations should be executed as early as possible (on the smallest data set possible) before joining to another table to improve performance. -- 🔢 Ordering and grouping by a number (eg. group by 1, 2) is preferred over listing the column names (see [this classic rant](https://blog.getdbt.com/write-better-sql-a-defense-of-group-by-1/) for why). Note that if you are grouping by more than a few columns, it may be worth revisiting your model design. +- 🔢 Ordering and grouping by a number (eg. group by 1, 2) is preferred over listing the column names (see [this classic rant](https://www.getdbt.com/blog/write-better-sql-a-defense-of-group-by-1) for why). Note that if you are grouping by more than a few columns, it may be worth revisiting your model design. ## Joins diff --git a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md b/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md index 22f8e36190a..a6402e46870 100644 --- a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md +++ b/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md @@ -10,3 +10,98 @@ Now that you've seen how we style our dbt projects, it's time to build your own. 
## Pre-commit hooks Lastly, to ensure your style guide's automated rules are being followed without additional mental overhead to your team, you can use [pre-commit hooks](https://pre-commit.com/) to automatically check your code for style violations (and often fix them automagically) before it's committed. This is a great way to make sure your style guide is followed by all contributors. We recommend implementing this once you've settled on and published your style guide, and your codebase is conforming to it. This will ensure that all future commits follow the style guide. You can find an excellent set of open source pre-commit hooks for dbt from the community [here in the dbt-checkpoint project](https://github.com/dbt-checkpoint/dbt-checkpoint). + +## Style guide template + +```markdown +# dbt Example Style Guide + +## SQL Style + +- Use lowercase keywords. +- Use trailing commas. + +## Model Organization + +Our models (typically) fit into two main categories: + +- Staging — Contains models that clean and standardize data. +- Marts — Contains models which combine or heavily transform data. + +Things to note: + +- There are different types of models that typically exist in each of the above categories. See [Model Layers](#model-layers) for more information. +- Read [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) for an example and more details around organization. + +## Model Layers + +- Only models in `staging` should select from [sources](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources). +- Models not in the `staging` folder should select from [refs](https://docs.getdbt.com/reference/dbt-jinja-functions/ref). + +## Model File Naming and Coding + +- All objects should be plural. + Example: `stg_stripe__invoices.sql` vs. `stg_stripe__invoice.sql` + +- All models should use the naming convention `<type>_<source>__<additional_context>`. See [this article](https://docs.getdbt.com/blog/stakeholder-friendly-model-names) for more information. + + - Models in the **staging** folder should use the source's name as the `<source>` and the entity name as the `additional_context`. + + Examples: + + - seed_snowflake_spend.csv + - base_stripe__invoices.sql + - stg_stripe__customers.sql + - stg_salesforce__customers.sql + - int_customers__unioned.sql + - fct_orders.sql + +- Schema, table, and column names should be in `snake_case`. + +- Limit the use of abbreviations that are related to domain knowledge. An onboarding employee will understand `current_order_status` better than `current_os`. + +- Use names based on the _business_ rather than the source terminology. + +- Each model should have a primary key to identify the unique row and should be named `<object>_id`. For example, `account_id`. This makes it easier to know what `id` is referenced in downstream joined models. + +- For `base` or `staging` models, columns should be ordered in categories, where identifiers are first and date/time fields are at the end. +- Date/time columns should be named according to these conventions: + + - Timestamps: `<event>_at` + Format: UTC + Example: `created_at` + + - Dates: `<event>_date` + Format: Date + Example: `created_date` + +- Booleans should be prefixed with `is_` or `has_`. + Example: `is_active_customer` and `has_admin_access` + +- Price/revenue fields should be in decimal currency (for example, `19.99` for $19.99; many app databases store prices as integers in cents). If a non-decimal currency is used, indicate this with suffixes. For example, `price_in_cents`.
+ +- Avoid using reserved words (such as [these](https://docs.snowflake.com/en/sql-reference/reserved-keywords.html) for Snowflake) as column names. + +- Consistency is key! Use the same field names across models where possible. For example, a key to the `customers` table should be named `customer_id` rather than `user_id`. + +## Model Configurations + +- Model configurations at the [folder level](https://docs.getdbt.com/reference/model-configs#configuring-directories-of-models-in-dbt_projectyml) should be considered (and if applicable, applied) first. +- More specific configurations should be applied at the model level [using one of these methods](https://docs.getdbt.com/reference/model-configs#apply-configurations-to-one-model-only). +- Models within the `marts` folder should be materialized as `table` or `incremental`. + - By default, `marts` should be materialized as `table` within `dbt_project.yml`. + - If switching to `incremental`, this should be specified in the model's configuration. + +## Testing + +- At a minimum, `unique` and `not_null` tests should be applied to the expected primary key of each model. + +## CTEs + +For more information about why we use so many CTEs, read [this glossary entry](https://docs.getdbt.com/terms/cte). + +- Where performance permits, CTEs should perform a single, logical unit of work. +- CTE names should be as verbose as needed to convey what they do. +- CTEs with confusing or notable logic should be commented with SQL comments, as you would with any complex function, and the comment should be placed above the CTE. +- CTEs duplicated across models should be pulled out and created as their own models. +``` diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md b/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md index 2f6c04bd35d..54f4443b600 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md +++ b/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md @@ -53,7 +53,7 @@ def model(dbt, session): :::info -🐍 **Not all adapters support python yet**, check the [docs here to be sure](docs/build/python-models#specific-data-platforms) before spending time writing python models. +🐍 **Not all adapters support python yet**, check the [docs here to be sure](/docs/build/python-models#specific-data-platforms) before spending time writing python models. ::: - Configuring a model to materialize as a `table` is simple, and the same as a `view` for both SQL and python models. diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md b/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md index c1a4cb3eb0e..603cbc8cda1 100644 --- a/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md +++ b/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md @@ -115,7 +115,7 @@ So we’re going to use an **if statement** to apply our cutoff filter **only wh Thankfully, we don’t have to dig into the guts of dbt to sort out each of these conditions individually. -- ⚙️  dbt provides us with a **macro [`is_incremental`](docs/build/incremental-models#understanding-the-is_incremental-macro)** that checks all of these conditions for this exact use case.
+- ⚙️  dbt provides us with a **macro [`is_incremental`](/docs/build/incremental-models#understanding-the-is_incremental-macro)** that checks all of these conditions for this exact use case. - 🔀  By **wrapping our cutoff logic** in this macro, it will only get applied when the macro returns true for all of the above conditions. Let’s take a look at all these pieces together: diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md index f8335dfcbc4..80b994aefb0 100644 --- a/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md @@ -8,6 +8,7 @@ If you've already [built](3-building-a-new-adapter), and [tested](4-testing-a-ne ## Making your adapter available Many community members maintain their adapter plugins under open source licenses. If you're interested in doing this, we recommend: + - Hosting on a public git provider (for example, GitHub or GitLab) - Publishing to [PyPI](https://pypi.org/) - Adding to the list of ["Supported Data Platforms"](/docs/supported-data-platforms#community-supported) (more info below) @@ -35,17 +36,12 @@ We ask our adapter maintainers to use the [docs.getdbt.com repo](https://github. To simplify things, assume the reader of this documentation already knows how both dbt and your data platform work. There's already great material for how to learn dbt and the data platform out there. The documentation we're asking you to add should be what a user who is already proficient in both dbt and your data platform would need to know in order to use both. Effectively that boils down to two things: how to connect, and how to configure. - ## Topics and Pages to Cover - The following subjects need to be addressed across three pages of this docs site to have your data platform be listed on our documentation. After the corresponding pull request is merged, we ask that you link to these pages from your adapter repo's `README` as well as from your product documentation. To contribute, all you will have to do is make the changes listed in the table below. - - - | How To... | File to change within `/website/docs/` | Action | Info to Include | |----------------------|--------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | Connect | `/docs/core/connect-data-platform/{MY-DATA-PLATFORM}-setup.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. | @@ -55,7 +51,6 @@ For example, say I want to document my new adapter: `dbt-ders`. For the "Connect" page, I will make a new Markdown file, `ders-setup.md`, and add it to the `/website/docs/core/connect-data-platform/` directory.
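For instance, the "Connect" page for this hypothetical `dbt-ders` adapter would typically open with a `~/.dbt/profiles.yml` example along the lines of the sketch below. The connection fields shown (`host`, `user`, `password`, `schema`) are illustrative assumptions; a real adapter should document whichever parameters it actually accepts.

```yaml
# Illustrative sketch only: dbt-ders and its connection fields are hypothetical.
ders:
  target: dev
  outputs:
    dev:
      type: ders                                  # the adapter's registered profile type
      host: ders.example.com                      # assumed connection parameter
      user: "{{ env_var('DERS_USER') }}"          # read credentials from environment variables
      password: "{{ env_var('DERS_PASSWORD') }}"
      schema: analytics                           # the schema (or equivalent namespace) dbt builds into
      threads: 4
```

A reader should be able to copy a block like this, substitute their own values, and confirm the connection works with `dbt debug`.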
- ## Example PRs to add new adapter documentation Below are some recent pull requests made by partners to document their data platform's adapter: diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter.md new file mode 100644 index 00000000000..9783ec66460 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter.md @@ -0,0 +1,79 @@ +--- +title: "Building a Trusted Adapter" +id: "8-building-a-trusted-adapter" +--- + +The Trusted adapter program exists to allow adapter maintainers to demonstrate to the dbt community that their adapter is trusted to be used in production. + +## What does it mean to be trusted + +By opting in, you agree to the guidelines below, and we take you at your word. dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met. + +### Feature Completeness + +To be considered for the Trusted Adapter program, the adapter must cover the essential functionality of dbt Core given below, with best effort given to support the entire feature set. + +Essential functionality includes (but is not limited to) the following features: + +- table, view, and seed materializations +- dbt tests + +The adapter should have the required documentation for connecting and configuring the adapter. The dbt docs site should be the single source of truth for this information. These docs should be kept up-to-date. + +See [Documenting a new adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) for more information. + +### Release Cadence + +Keeping an adapter up-to-date with dbt Core is an integral part of being a trusted adapter. Therefore, we ask that adapter maintainers: + +- Release new minor versions of the adapter with all tests passing within four weeks of dbt Core's release cut. +- Release new major versions of the adapter with all tests passing within eight weeks of dbt Core's release cut. + +### Community Responsiveness + +On a best-effort basis, we ask for active participation and engagement with the dbt Community across the following forums: + +- Being responsive to feedback and supporting user enablement in dbt Community’s Slack workspace +- Responding with comments to issues raised in the public dbt adapter code repository +- Merging in code contributions from community members as deemed appropriate + +### Security Practices + +Trusted adapters will not do any of the following: + +- Write access credentials for, or data from, the underlying data platform to logs or files.
+- Make API calls other than those expressly required for using dbt features (adapters may not add additional logging) +- Obfuscate code and/or functionality so as to avoid detection + +Additionally, to avoid supply-chain attacks: + +- Use an automated service to keep Python dependencies up-to-date (such as Dependabot or similar) +- Publish directly to PyPI from the dbt adapter code repository by using a trusted CI/CD process (such as GitHub Actions) +- Restrict admin access to both the respective code (GitHub) and package (PyPI) repositories +- Identify and mitigate security vulnerabilities by using a static code analysis tool (such as Snyk) as part of a CI/CD process + +### Other considerations + +The adapter repository is: + +- open-source licensed, +- published to PyPI, and +- automatically tested against dbt Labs' provided adapter test suite + +## How to get an adapter verified? + +Open an issue on the [docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com) using the "Add adapter to Trusted list" template. In addition to contact information, it will ask you to confirm that you agree to the following. + +1. My adapter meets the guidelines given above +2. I will make a reasonable best effort to ensure this continues to be so +3. checkbox: I acknowledge that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the above guidelines not be met. + +The approval workflow is as follows: + +1. Create and populate the template-created issue +2. dbt Labs will respond as quickly as possible (at most four weeks, though likely faster) +3. If approved, dbt Labs will create and merge a pull request to formally add the adapter to the list. + +## How to get help with my trusted adapter? + +Ask your question in the #adapter-ecosystem channel of the community Slack. diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md b/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md index 5da8cc6616b..a3b4be5a051 100644 --- a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md +++ b/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md @@ -35,11 +35,11 @@ Each dbt Cloud project can have multiple deployment environments, but only one d With your deployment environment set up, it's time to create a production job to run in your *prod* environment. -To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/dbt-cloud-job). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently. +To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/deploy-jobs). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently. Leveraging dbt Cloud's job scheduler allows data teams to own the entire transformation workflow. You don't need to learn and maintain additional tools for orchestration or rely on another team to schedule code written by your team. This end-to-end ownership simplifies the deployment process and accelerates the delivery of new data products.
-Let’s [create a job](/docs/deploy/dbt-cloud-job#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog. +Let’s [create a job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog. 1. Create a new job by clicking **Deploy** in the header, click **Jobs** and then **Create job**. 2. **Name** the job “Daily refresh”. @@ -58,7 +58,7 @@ Let’s [create a job](/docs/deploy/dbt-cloud-job#create-and-schedule-jobs) in d - dbt build is more efficient than issuing separate commands for dbt run and dbt test separately because it will run then test each model before continuing. - We are excluding source data because we already tested it in step 2. - The fail-fast flag will make dbt exit immediately if a single resource fails to build. If other models are in-progress when the first model fails, then dbt will terminate the connections for these still-running models. -5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/job-triggers). You can enter specific days and timing or create a custom cron schedule. +5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/deploy-jobs#schedule-days). You can enter specific days and timing or create a custom cron schedule. - If you want your dbt Cloud job scheduled by another orchestrator, like Databricks Workflows, see the [Advanced Considerations](#advanced-considerations) section below. This is just one example of an all-or-nothing command list designed to minimize wasted computing. The [job command list](/docs/deploy/job-commands) and [selectors](/reference/node-selection/syntax) provide a lot of flexibility on how your DAG will execute. You may want to design yours to continue running certain models if others fail. You may want to set up multiple jobs to refresh models at different frequencies. See our [Job Creation Best Practices discourse](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) for more job design suggestions. @@ -85,7 +85,7 @@ Your CI job will ensure that the models build properly and pass any tests applie - A service principal called *dbt_test_sp* - A new dbt Cloud environment called *test* that defaults to the *test* catalog and uses the *dbt_test_sp* token in the deployment credentials -We recommend setting up a dbt Cloud Slim CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. To create a Slim CI job, refer to [Set up Slim CI jobs](/docs/deploy/slim-ci-jobs) for details. +We recommend setting up a dbt Cloud CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. To create a CI job, refer to [Set up CI jobs](/docs/deploy/ci-jobs) for details. With dbt tests and SlimCI, you can feel confident that your production data will be timely and accurate even while delivering at high velocity. @@ -93,7 +93,7 @@ With dbt tests and SlimCI, you can feel confident that your production data will Keeping a close eye on your dbt Cloud jobs is crucial for maintaining a robust and efficient data pipeline. By monitoring job performance and quickly identifying potential issues, you can ensure that your data transformations run smoothly. 
dbt Cloud provides three entry points to monitor the health of your project: run history, deployment monitor, and status tiles. -The [run history](/docs/deploy/dbt-cloud-job) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu. +The [run history](/docs/deploy/run-visibility#run-history) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu. The deployment monitor in dbt Cloud offers a higher-level view of your run history, enabling you to gauge the health of your data pipeline over an extended period of time. This feature includes information on run durations and success rates, allowing you to identify trends in job performance, such as increasing run times or more frequent failures. The deployment monitor also highlights jobs in progress, queued, and recent failures. To access the deployment monitor click on the dbt logo in the top left corner of the dbt Cloud UI. @@ -121,7 +121,6 @@ The five key steps for troubleshooting dbt Cloud issues are: 2. Inspect the problematic file and look for an immediate fix. 3. Isolate the problem by running one model at a time in the IDE or undoing the code that caused the issue. 4. Check for problems in compiled files and logs. -5. Seek help from the [dbt Cloud support team](/docs/dbt-support) if needed. Consult the [Debugging errors documentation](/guides/best-practices/debugging-errors) for a comprehensive list of error types and diagnostic methods. diff --git a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md b/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md index f2fffd43994..68037bfd0cd 100644 --- a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md +++ b/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md @@ -1,660 +1,152 @@ --- -title: "dbt Semantic Layer integration" +title: "dbt Semantic Layer integration best practices" id: "sl-partner-integration-guide" description: Learn about partner integration guidelines, roadmap, and connectivity. --- -# dbt Semantic Layer partner integration -:::info Coming soon -The dbt Semantic Layer is undergoing some sophisticated changes, enabling more complex metric definitions and efficient querying. As part of these changes, the dbt_metrics package will be deprecated and replaced with MetricFlow. For more info, check out the [The dbt Semantic Layer: what's next?](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/) and [dbt_metrics deprecation](https://docs.getdbt.com/blog/deprecating-dbt-metrics) blog. +import NewChanges from '/snippets/_new-sl-changes.md'; + + + +To fit your tool within the world of the Semantic Layer, dbt Labs offers some best practice recommendations for how to expose metrics and allow users to interact with them seamlessly. + +:::note +This is an evolving guide that is meant to provide recommendations based on our experience. If you have any feedback, we'd love to hear it! 
::: -This guide is for dbt Semantic Layer integration partners and explains integration guidelines, and connectivity.
      -To become a formal partner, integrate with the API, or have questions/feedback — **[contact us](mailto:semantic-layer@dbtlabs.com)** for more info. +## Requirements -The dbt Semantic Layer allows users to dynamically generate and query datasets in downstream tools based on their dbt governed assets, such as metrics, models, and entities. It helps organizations manage complexities such as data, tools, and teams to make more efficient and trustworthy decisions. +To build a dbt Semantic Layer integration: -The rapid growth of different tools in the modern data stack has helped data professionals address the diverse needs of different teams. The downside of this growth is the fragmentation of business logic across teams, tools, and workloads. +- We offer a [JDBC](/docs/dbt-cloud-apis/sl-jdbc) API (and will soon offer a GraphQL API). Refer to the dedicated [dbt Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) for more technical integration details. -To solve this, the dbt Semantic Layer provides a platform where users can confidently leverage their data from within their tools. dbt Cloud's change management capabilities ensure that any user modifications made to core business constructs, like metrics or entities, are distributed into all the tools connected to the data platform. +- Familiarize yourself with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and [MetricFlow](/docs/build/about-metricflow)'s key concepts. There are two main objects: -The dbt Semantic Layer can be used for a variety of tools and applications of data. Here are some common use cases + - [Semantic models](/docs/build/semantic-models) — Nodes in your semantic graph, connected via entities as edges. MetricFlow takes semantic models defined in YAML configuration files as inputs and creates a semantic graph that you can use to query metrics. + - [Metrics](/docs/build/metrics-overview) — Can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo). -* Business intelligence (BI), reporting, and analytics, -* Data quality and monitoring, -* Governance and privacy, -* Data discovery and cataloging, -* Machine learning and data science. +### Connection parameters -:::info Share your use case +The dbt Semantic Layer APIs authenticate with `environmentId`, `SERVICE_TOKEN`, and `host`. -If you'd like to share other use cases for the dbt Semantic Layer, contact the [dbt Labs team](mailto:semantic-layer@dbtlabs.com). +We recommend you provide users with separate input fields with these components for authentication (dbt Cloud will surface these parameters for the user). -::: +## Best practices on exposing metrics -## Product overview +Best practices for exposing metrics are summarized into five themes: -The dbt Semantic Layer product architecture includes four primary components: +- [Governance](#governance-and-traceability) — Recommendations on how to establish guardrails for governed data work. +- [Discoverability](#discoverability) — Recommendations on how to make user-friendly data interactions. +- [Organization](#organization) — Organize metrics and dimensions for all audiences. +- [Query flexibility](#query-flexibility) — Allow users to query either one metric alone without dimensions or multiple metrics with dimensions. 
+- [Context and interpretation](#context-and-interpretation) — Contextualize metrics for better analysis; expose definitions, metadata, lineage, and freshness. -| Components | Information | Developer plans | Team plans | Enterprise plans | License | -| --- | --- | :---: | :---: | :---: | --- | -| **[dbt Project](/docs/build/metrics)** | Define models and metrics in dbt Core. | ✅ | ✅ | ✅ | Open source in dbt Core | -| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt Core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | -| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja-like query models and metrics, use macros), compiles the query into pure SQL, executes the query in the data platform, and returns the data. | ✅

      _* Available during Public Preview only_ | ✅ | ✅ | Proprietary in dbt Cloud | -| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, and metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary in dbt Cloud | - -Review the following current architecture to understand how the components work together: - - +### Governance and traceability +When working with more governed data, it's essential to establish clear guardrails. Here are some recommendations: -## Integration guidelines +- **Aggregations control** — Users shouldn't generally be allowed to modify aggregations unless they perform post-processing calculations on Semantic Layer data (such as year-over-year analysis). -In collaboration with dbt Labs, partners and users can build dbt Semantic Layer integrations that can import model metadata and metric definitions, query metrics, use macros, and more. +- **Time series alignment and using metric_time** — Make sure users view metrics across the correct time series. When displaying metric graphs, using a non-default time aggregation dimension might lead to misleading interpretations. While users can still group by other time dimensions, they should be careful not to create trend lines with incorrect time axes.

      When looking at one or multiple metrics, users should use `metric_time` as the main time dimension to guarantee they are looking at the right time series for the metric(s).

      As such, when building an application, we recommend exposing `metric_time` as a separate, "special" time dimension on its own. This dimension will always align with all metrics and be common across them. Other time dimensions can still be viewed and grouped by, but a clear delineation between the `metric_time` dimension and other time dimensions helps people avoid confusion about how metrics should be plotted.

      Also, when a user requests a time granularity change for the main time series, the query that your application runs should use `metric_time` as this will always give you the correct slice. Related to this, we also strongly recommend that you have a way to expose what dimension `metric_time` actually maps to for users who may not be familiar. Our APIs allow you to fetch the actual underlying time dimensions that makeup metric_time (such as `transaction_date`) so you can expose them to your users. -For more details, refer to the [Integration roadmap](#integration) and [Integration best practices](#best-practices) guidance. +- **Units consistency** — If units are supported, it's vital to avoid plotting data incorrectly with different units. Ensuring consistency in unit representation will prevent confusion and misinterpretation of the data. -**Integration roadmap ** +- **Traceability of metric and dimension changes** — When users change names of metrics and dimensions for reports, it's crucial to have a traceability mechanism in place to link back to the original source metric name. -Integration partners generally build and approach their roadmap in the following stages: -| Feature | Info | Availability | -|----------|-------|:------------:| -| **Model metadata** | Import/sync model metadata (descriptions, dimensions, test, freshness, and more) via the [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api). | ✅ | -| **Metric definitions** | Import/sync metric definitions (metric calculation, dimensions, description, and more) via the [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api). | ✅ | -| **dbt Semantic Layer as a data source** | Connect to the dbt Semantic Layer as a data source (for example, the Snowflake Proxy Server). Users can execute dbt-SQL to query metrics or models and use macros.* | ✅ | -| **Query metrics** | Query the imported metrics via a metric-centric UI (for example, a user can select a metric, time grain, and dimensions of interest). | ✅ | -| **Entity definitions** | Import/sync entity definitions (descriptions, dimensions, data types, relationships, metrics, and more) and query entities via the dbt Semantic Layer. | _*Coming soon | -| **dbt Semantic Layer Connector** | A dedicated connector with the ability to query any data platform supported in dbt Cloud. (Will replace (3).) | _*Coming soon | +### Discoverability -_*The coming soon features are expected to launch in 2023. +- Consider treating [metrics](/docs/build/metrics-overview) as first-class objects rather than measures. Metrics offer a higher-level and more contextual way to interact with data, reducing the burden on end-users to manually aggregate data. -**Integration best practices ** - -To build a successful and seamless dbt Semantic Layer integration, it should express the following: - -- **Consistent**: Have a consistent user experience (UX) incorporated into existing core user workflows. -- **Trustworthy**: Treat dbt assets (metrics, models, and entities) as first-class objects and indicate that their definitions and resulting datasets come from dbt Cloud. -- **Efficient**: Provide a clear advantage over the current approach to setting up metrics and analyses, and finding dimensions/datasets in the tool. -- **Accessible**: Include a self-serve component so a data consumer can ask questions via the user interface (UI), if applicable. 
- - -## Use the Discovery API +- Easy metric interactions: Provide users with an intuitive approach to: + * Search for Metrics — Users should be able to easily search and find relevant metrics. Metrics can serve as the starting point to lead users into exploring dimensions. + * Search for Dimensions — Users should be able to query metrics with associated dimensions, allowing them to gain deeper insights into the data. + * Filter by Dimension Values — Expose and enable users to filter metrics based on dimension values, encouraging data analysis and exploration. + * Filter additional metadata — Allow users to filter metrics based on other available metadata, such as metric type and default time granularity. -This section will explain how to connect to and query the [Discovery API](/docs/dbt-cloud-apis/discovery-api) for model and metric definitions. - -To use the dbt Semantic Layer, you must meet the [prerequisites](/docs/use-dbt-semantic-layer/dbt-semantic-layer#prerequisites). - -
      - Discovery API authorization -
      -
      Refer to our Authorization documentation to learn how to authorize requests to the Discovery API.



      - - Metrics-specific queries work identical to existing Discovery API queries. This means existing integrations that query model metadata will work perfectly in the context of metrics. -
      -
      -
      - -
      - Query the Discovery API -
      -
      Test out the Discovery API by using the GraphQL sandbox and use this Python client as a starting point to develop. -
      -
      -
      -

      - - - -### Query models for a project - -You can query model definitions or details about a specific model for a project from a given job. - - - - - - - -This is an example of querying all models that utilize the schema`analytics` from a given job. - -``` -{ - models(jobId: 181329, schema: "analytics") { - name - status - compileCompletedAt - database - dbtVersion - runGeneratedAt - } -} -``` - - - - -``` -{ - "data": { - "models": [ - { - "name": "customers", - "status": "success", - "compileCompletedAt": "2022-12-15T06:37:24.186Z", - "database": "analytics", - "dbtVersion": "1.3.1", - "runGeneratedAt": "2022-12-15T06:37:25.187Z" - }, - { - "name": "stg_customers", - "status": "success", - "compileCompletedAt": "2022-12-15T06:37:22.509Z", - "database": "analytics", - "dbtVersion": "1.3.1", - "runGeneratedAt": "2022-12-15T06:37:25.187Z" - }, - { - "name": "stg_orders", - "status": "success", - "compileCompletedAt": "2022-12-15T06:37:22.509Z", - "database": "analytics", - "dbtVersion": "1.3.1", - "runGeneratedAt": "2022-12-15T06:37:25.187Z" - } - ] - } -} -``` - - - - -This is an example of querying details about a specific model, `model.jaffle_shop.customers`, from a given job. - -``` -{ - model(jobId: 181329, uniqueId: "model.jaffle_shop.customers") { - parentsModels { - runId - uniqueId - executionTime - } - } -} -{ - "data": { - "model": { - "parentsModels": [ - { - "runId": 105297555, - "uniqueId": "model.jaffle_shop.stg_customers", - "executionTime": 1.676571846008301 - }, - { - "runId": 105297555, - "uniqueId": "model.jaffle_shop.stg_orders", - "executionTime": 1.631831407546997 - } - ] - } - } -} -``` - - - - - - -### Query metrics for a project - -Query metrics definitions or details for a project from a given job and refer to the following resources: - -- [Metrics query](/docs/dbt-cloud-apis/discovery-schema-metrics) — Information on how to query the full list of metrics defined in a user’s project with the dbt Cloud Discovery API. -- [dbt Metrics docs](https://docs.getdbt.com/docs/build/metrics#available-properties) — Information on the available metric properties. -- [GraphQL sandbox](https://studio.apollographql.com/sandbox/explorer?endpoint=https%3A%2F%2Fmetadata.cloud.getdbt.com%2Fgraphql) — Access to test the dbt Cloud Discovery API testing environment. - - - - - - -This is an example listing metrics from a given job: - -``` -{ - metrics(jobId: 123) { - name - label - description - model - dependsOn - calculation_method - expression - timestamp - timeGrains - dimensions - window - filters - tags - meta - } -} -``` - - - - -The `metric` query supports all metric properties listed in **Listing metrics**. -The following abbreviated example is querying details about the metric `new_customers` from job `123`: - -This is an example of querying details about a specific metric `new_customers` from a given job `123`. 
- -``` -{ - metric(jobId: 123) { - label - calculation_method - timestamp - timeGrains - dimensions - } -} -``` - - - - - -``` -{ - "data": { - "metrics": [ - { - "uniqueId": "metric.claim_to_fame.total_claim_charges", - "name": "total_claim_charges", - "tags": [], - "label": "Total Claim Charges", - "calculation_method": "sum", - "expression": "total_charge_amount", - "timestamp": "created_at", - "timeGrains":[ - "day", - "week", - "month" - ], - "meta": {}, - "resourceType": "metric", - "model": { - "name": "fct_billed_patient_claims" - } - }, - { - "uniqueId": "metric.claim_to_fame.total_billed_diagnoses", - "name": "total_billed_diagnoses", - "tags": [], - "label": "Total Billed Diagnoses", - "calculation_method": "count_distinct", - "expression": "diagnosis_id", - "timestamp": "created_at", - "timeGrains":[ - "week", - "month", - "year" - ], - "meta": {}, - "resourceType": "metric", - "model": { - "name": "fct_billed_patient_claims" - }, - } - ] - } -} -``` - - - - - -``` -metrics: - - name: total_claim_charges - label: Total Claim Charges - model: ref('fct_billed_patient_claims') - calculation_method: sum - expression: total_charge_amount - timestamp: created_at - time_grains: [day, week, month, all_time] - - - - name: total_billed_diagnoses - label: Total Billed Diagnoses - model: ref('fct_billed_patient_claims') - calculation_method: count_distinct - expression: diagnosis_id - timestamp: created_at - time_grains: [day, week, month] -``` - - - - - - - - -## Query the dbt Semantic Layer - -This section explains how to connect to or query the dbt Semantic Layer Proxy Server to return model data, metric data, and so on. - -When you configure the dbt Semantic Layer, dbt Cloud provides a Proxy Server endpoint that users can connect to as though it's a Snowflake-hosted endpoint. Once the queries are submitted, dbt Cloud will: - -1. Compile dbt-sql queries into valid Snowflake SQL, -2. Execute the compiled SQL against the Snowflake data platform, -3. Return the results to the client. - -Replace the hostname in your existing data platform connection with the relevant dbt Cloud Proxy Server URL (for example, `abc123.proxy.cloud.getdbt.com`). All queries you submit through the endpoint will be compiled en route to the data platform.* - -*_Note: This approach will change with the new Semantic Layer connection in mid-2023, which will be able to query all data platforms supported in dbt Cloud through dedicated JDBC/ODBC drivers, and eventually an API._ - - - - +- Suggested Metrics: Ideally, the system should intelligently suggest relevant metrics to users based on their team's activities. This approach encourages user exposure, facilitates learning, and supports collaboration among team members. - +By implementing these recommendations, the data interaction process becomes more user-friendly, empowering users to gain valuable insights without the need for extensive data manipulation. -Users can compile and execute metric queries using macros defined in the [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics). 
This package: - -- Generates the SQL required to accurately calculate the metric definition, -- Supplies helper macros for derived calculations (like month over month, year to date, and so on) time series operations - - -``` -select * -from {{ metrics.calculate( - metric_list=[metric('customers'), metric(‘revenue’)], - grain='week', - dimensions=['plan', 'country'], - secondary_calculations=[ - metrics.period_to_date(aggregate="sum", period="year"), - metrics.rolling(aggregate="average", interval=4, alias="avg_past_4wks") - ], - start_date='2020-01-01', - end_date="date_trunc('day', getdate())" -) }} -``` +### Organization - +We recommend organizing metrics and dimensions in ways that a non-technical user can understand the data model, without needing much context: - +- **Organizing Dimensions** — To help non-technical users understand the data model better, we recommend organizing dimensions based on the entity they originated from. For example, consider dimensions like `user__country` and `product__category`.

      You can create groups by extracting `user` and `product` and then nesting the respective dimensions under each group. This way, dimensions align with the entity or semantic model they belong to, which makes them more user-friendly and accessible. -Model queries allow users to query models and use macros from their dbt project. +- **Organizing Metrics** — The goal is to organize metrics into a hierarchy in our configurations, instead of presenting them in a long list.

      This hierarchy helps you organize metrics based on specific criteria, such as business unit or team. By providing this structured organization, users can find and navigate metrics more efficiently, enhancing their overall data analysis experience. -``` -select cents_to_dollars('amount_cents') as amount_dollars -from {{ ref('orders') }} -``` -
      -
      +### Query flexibility -### Entities - +Allow users to query either one metric alone without dimensions or multiple metrics with dimensions. -dbt Labs will introduce a new node type, **[entity](https://github.com/dbt-labs/dbt-core/issues/6379)**, when dbt Core version 1.5 launches. It introduces a new and efficient way to define metrics by reusing logic (for example, `time_grains`). +- Allow toggling between metrics/dimensions seamlessly. -Entities are semantic objects made up of curated dimensions from models with more metadata defined. Over time, users can standardize metric and entity definitions with packages to speed up development. +- Be clear on exposing what dimensions are queryable with what metrics and hide things that don’t apply, and vice versa. -For integrations, entities will provide information like: +- Only expose time granularities (monthly, daily, yearly) that match the available metrics. + * For example, if a dbt model and its resulting semantic model have a monthly granularity, make sure querying data with a 'daily' granularity isn't available to the user. Our APIs have functionality that will help you surface the correct granularities -- a way to organize metrics based on the entity they reference, and -- a new consumable and dynamically generated dataset (versus finding a table in the data platform). +- We recommend that time granularity is treated as a general time dimension-specific concept and that it can be applied to more than just the primary aggregation (or `metric_time`). Consider a situation where a user wants to look at `sales` over time by `customer signup month`; in this situation, having the ability to apply granularities to both time dimensions is crucial. Our APIs include information to fetch the granularities for the primary (metric_time) dimensions, as well as all time dimensions. You can treat each time dimension and granularity selection independently in your application. Note: Initially, as a starting point, it makes sense to only support `metric_time` or the primary time dimension, but we recommend expanding that as your solution evolves. -This information will be available alongside the Discovery API, and entities can be directly queried through the dbt Semantic Layer. +- You should allow users to filter on date ranges and expose a calendar and nice presets for filtering these. + * For example, last 30 days, last week, and so on. - +### Context and interpretation -:::caution 🚧 +For better analysis, it's best to have the context of the metrics close to where the analysis is happening. We recommend the following: -Entities are a work in progress — expect continuous changes and improvements. To stay up-to-date, refer to the [entity discussions](https://github.com/dbt-labs/dbt-core/issues/6379) page. +- Expose business definitions of the metrics as well as logical definitions. -::: +- Expose additional metadata from the Semantic layer (measures, type parameters). - - - - - - -Define entities in your dbt project. - -``` -entities: ## The top-level path of the new node - - name: [Required] ## The name of the entity - model: [Required] ## The name of the model that the entity is dependent on - description: [Optional] ## The description of the entity - - dimensions: [Optional] ## The list of dimensions & properties associated with the entity. - - include: [Optional] * - - exclude: [Optional] - - name: [Required] ## The name of the dimension - column_name: [Optional] ## The name of the column in the model if not 1:1. 
Serves as mapping - data_type: [Optional] ## The data type of the dimension - description: [Optional] ## Description of the dimension - default_timestamp: [Optional] ## Setting datetime dimension as default for metrics - time_grains: [Optional] ## Acceptable time grains for the datetime dimension - primary_key: [Optional] ## Whether this dimension is part of the primary key -``` - - - - -Query entities via the Discovery API. - -``` -"entity.project_name.entity_name": { - "unique_id": "entity.project_name.entity_name", - "package_name": "project_name", - "original_file_path": "models/metric_definitions/ratio_metric.yml", - "name": "entity_name", - "model": "ref('model_name')", - "description": "some description", - "dimensions": { - "dimension_name": { - "name": "dimension_name", - "column_name": "column_name", - "default_timestamp": "true", - "time_grains": "[day, week, month, year]" - "primary_key": true, - "data_type": null, - "description": "TBD", - "meta": {}, - } - }, - "resource_type": "entity", - "meta": {}, - "tags": [], - "config": { - "enabled": true, - }, - "depends_on": { - "macros": [], - "nodes": [ - "model.project_name.model_name", - ] - }, - "docs": { - "show": true, - "node_color": null - }, - "refs": [ - [ - "model_name", - ] - ], - "created_at": 1669653016.522599 - }, - ``` - - - - -How to define new [metrics](/docs/build/metrics) in your dbt project. The metric definition and metadata response will change accordingly once entities are introduced, notably with metrics referencing entities instead of models and inheriting entity dimensions. - - ``` - metrics: - ## Always required - - name: [Required] ## The name of the metric - label: [Required] ## The human-readable name of the metric - calculation_method: [Required] ## The calculation/aggregation used for the metric - expression: [Required] ## The SQL expression being aggregated/calculated - entity: [Required] ## The entity being used as the source of the metric - - ## Always optional - description: [Optional] ## Any description about the metric - timestamp: [Optional] ## The name of the timestamp field to use - time_grains: [Optional] ## The list of time grains that are permitted - filters: [Optional] ## The filters of the metric - window: [Optional] ## The ability to make a metric cumulative over a time period - config: [Optional] ## Additional information for configuring the output - - ## Either or dimensions: - include: [Optional] ## The list of dimensions to be included. 
Either * or list - exclude: [Optional] ## The list of dimensions to be excluded from the inherited list - ``` - - - - - -``` -"metric.project_name.metric_name": { - "fqn": [ - "project_name", - "folder_name", - "metric_name" - ], - "unique_id": "metric.project_name.metric_name", - "package_name": "project_name", - "root_path": "file_path", - "path": "file_path", - "original_file_path": "file_path", - "name": "metric_name", - "description": "description", - "entity": "entity_name", - "label": "Human readable version", - "calculation_method": "the calc method", - "timestamp": "the timestamp field", - "time_grains": [ - "day", - "week" - ], - "expression": "a field name or sql expression", - "dimensions": [ - { - "entity_name": [ - "had_discount", - "order_country" - ] - } - ], - "window": null, - "resource_type": "metric", - "filters": [], - "meta": {}, - "tags": [], - "config": { - "enabled": true - }, - "unrendered_config": {}, - "sources": [], - "depends_on": { - "macros": [], - "nodes": [ - "entity.projet_name.entity_name", - ] - }, - "entities": [ - [ - "entity_name" - ] - ], - "metrics": ["used for derived metrics"], - "created_at": 1669653027.290001 - }, - ``` - - - - -Query an entity using dbt-SQL. Eventually, users will be able to query entities and dynamically generate datasets using a macro (like with metrics), without having to find specific tables or columns. - -``` -select * -from {{ entities.calculate( - entity_list=[...], [Required, one to start] - dimensions: [...], [Optional, default is all] - metrics: [...], [Optional, default is all at finest grain] - filters: ... - )}} - ``` - - - -### dbt Semantic Layer Connector - -In order to support more data platforms and enhance the user experience, users will be able to connect to a [dbt Cloud-supported data platform](/docs/cloud/connect-data-platform/about-connections) with the dbt Semantic Layer. - -Integration partners need to install the [Arrow FlightSQL](https://arrow.apache.org/docs/format/FlightSql.html) JDBC/ODBC driver, which will authenticate with dbt Cloud and the data platform that it queries. - - - - - -### dbt Semantic Layer API - -dbt Cloud will provide a web API that supports: - -- Compiling dbt-SQL queries to return their compiled SQL. -- Executing dbt-SQL queries and returning the queried results from the data platform. - -The API will be a viable integration point with the dbt Semantic Layer. It will be authorized by a [dbt Cloud service token](/docs/dbt-cloud-apis/service-tokens) and eventually support the invocation of dbt commands (e.g., `dbt run`, `dbt test`, etc.) in the future. - - -## Contact us - -### For dbt Semantic Layer support - -For partner and customer support, please email the [Support team](mailto:support@getdbt.com). Please ensure the message includes: - -- "Semantic Layer" -- The name of the partner software -- The dbt Cloud account ID of the customer, if you are a partner making the inquiry - -### For product and partnerships - -If you'd like to become a formal partner, have product feedback/questions, or are interested in integrating, email the [Product and Partnership team](mailto:semantic-layer@dbtlabs.com). - - +- Use the [Discovery API](/docs/dbt-cloud-apis/discovery-api) to enhance the metric and build confidence in its accuracy: + * Check if the metric is fresh and when it was last updated. + * Include lineage information to understand the metric's origin. -## Related docs +- Allow for creating other metadata that’s useful for the metric. 
We can provide some of this information in our configuration (Display name, Default Granularity for View, Default Time range), but there may be other metadata that your tool wants to provide to make the metric richer. -- [dbt Semantic Layer docs](https://docs.getdbt.com/docs/use-dbt-semantic-layer/dbt-semantic-layer) to learn about the product. -- [dbt Metrics docs](https://docs.getdbt.com/docs/building-a-dbt-project/metrics) for more information about its components. -- [dbt Semantic Layer intro blog](https://www.getdbt.com/blog/dbt-semantic-layer/) and [launch blog](https://www.getdbt.com/blog/frontiers-of-the-dbt-semantic-layer/) to learn more about the product vision and purpose. -- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations. +## Example stages of an integration + +These are recommendations on how to evolve a Semantic Layer integration and not a strict runbook. + +**Stage 1 - The basic** +* Supporting and using [JDBC](/docs/dbt-cloud-apis/sl-jdbc) or [GraphQL](/docs/dbt-cloud-apis/sl-graphql) is the first step. Refer to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) for more technical details. + +**Stage 2 - More discoverability and basic querying** +* Support listing metrics defined in the project +* Listing available dimensions based on one or many metrics +* Querying defined metric values on their own or grouping by available dimensions +* Display metadata from [Discovery API](/docs/dbt-cloud-apis/discovery-api) and other context +**Stage 3 - More querying flexibility and better user experience (UX)** +* More advanced filtering + * Time filters with good presets/calendar UX + * Filtering metrics on a pre-populated set of dimension values +* Make dimension values more user-friendly by organizing them effectively +* Intelligent filtering of metrics based on available dimensions and vice versa +**Stage 4 - More custom user interface (UI) / Collaboration** +* A place where users can see all the relevant information about a given metric +* Organize metrics by hierarchy and more advanced search features (such as filter on the type of metric or other metadata) +* Use and expose more metadata +* Querying dimensions without metrics and other more advanced querying functionality +* Suggest metrics to users based on teams/identity, and so on. + +### A note on transparency and using compile + +For transparency and additional context, we recommend you have an easy way for the user to obtain the SQL that MetricFlow generates. Depending on what API you are using, you can do this by using our compile parameter. This is incredibly powerful because we want to be very transparent to the user about what we're doing and do not want to be a black box. This would be mostly beneficial to a technical user. + + +### A note on where filters + +In the cases where our APIs support either a string or a filter list for the `where` clause, we always recommend that your application utilizes the filter list in order to gain maximum pushdown benefits. The `where` string may be more intuitive for users writing queries during testing, but it will not have the performance benefits of the filter list in a production environment. + +## Related docs + +- [Use the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) to learn about the product. +- [Build your metrics](/docs/build/build-metrics-intro) for more info about MetricFlow and its components. 
+- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations. diff --git a/website/docs/guides/legacy/best-practices.md b/website/docs/guides/legacy/best-practices.md index 0aad86dd2bc..1fbcbc72cc1 100644 --- a/website/docs/guides/legacy/best-practices.md +++ b/website/docs/guides/legacy/best-practices.md @@ -108,12 +108,10 @@ We often: When developing, it often makes sense to only run the model you are actively working on and any downstream models. You can choose which models to run by using the [model selection syntax](/reference/node-selection/syntax). ### Run only modified models to test changes ("slim CI") -To merge code changes with confidence, you want to know that those changes will not cause breakages elsewhere in your project. For that reason, we recommend running models and tests in a sandboxed environment, separated from your production data, as an automatic check in your git workflow. (If you use GitHub and dbt Cloud, read about [how to set up CI jobs](/docs/deploy/slim-ci-jobs). +To merge code changes with confidence, you want to know that those changes will not cause breakages elsewhere in your project. For that reason, we recommend running models and tests in a sandboxed environment, separated from your production data, as an automatic check in your git workflow. (If you use GitHub and dbt Cloud, read about [how to set up CI jobs](/docs/deploy/ci-jobs).) At the same time, it costs time (and money) to run and test all the models in your project. This inefficiency feels especially painful if your PR only proposes changes to a handful of models. -New in v0.18.0 - By comparing to artifacts from a previous production run, dbt can determine which models are modified and build them on top of their unmodified parents. @@ -122,8 +120,6 @@ dbt run -s state:modified+ --defer --state path/to/prod/artifacts dbt test -s state:modified+ --defer --state path/to/prod/artifacts ``` -New in v1.0.0 - By comparing to artifacts from a previous production run, dbt can determine model and test result statuses. - `result:fail` @@ -159,13 +155,6 @@ dbt test --select result:fail --exclude --defer --state path/to/p > Note: If you're using the `--state target/` flag, `result:error` and `result:fail` flags can only be selected concurrently (in the same command) if using the `dbt build` command. `dbt test` will overwrite the `run_results.json` from `dbt run` in a previous command invocation. - - -Only supported by v1.1 or newer. - - - - Only supported by v1.1 or newer. @@ -184,8 +173,6 @@ dbt source freshness # must be run again to compare current to previous state dbt build --select source_status:fresher+ --state path/to/prod/artifacts ``` - - To learn more, read the docs on [state](/reference/node-selection/syntax#about-node-selection). ## Pro-tips for dbt Projects diff --git a/website/docs/guides/legacy/debugging-schema-names.md b/website/docs/guides/legacy/debugging-schema-names.md index 12daacb1f2d..dee2bc57293 100644 --- a/website/docs/guides/legacy/debugging-schema-names.md +++ b/website/docs/guides/legacy/debugging-schema-names.md @@ -16,7 +16,7 @@ You can also follow along via this video: Do a file search to check if you have a macro named `generate_schema_name` in the `macros` directory of your project.
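+One quick way to do this from the command line is shown below; the exact command is only an illustration, and a file search in your editor works just as well:
+
+```bash
+# search the macros directory for a custom generate_schema_name definition
+grep -rn "generate_schema_name" macros/
+```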
#### I do not have a macro named `generate_schema_name` in my project -This means that you are using dbt's default implementation of the macro, as defined [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L17-L30) +This means that you are using dbt's default implementation of the macro, as defined [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47C1-L60) ```sql {% macro generate_schema_name(custom_schema_name, node) -%} @@ -44,8 +44,7 @@ If your `generate_schema_name` macro looks like so: {{ generate_schema_name_for_env(custom_schema_name, node) }} {%- endmacro %} ``` -Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/include/global_project/macros/etc/get_custom_schema.sql#L43-L56). - +Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47-L60). ```sql {% macro generate_schema_name_for_env(custom_schema_name, node) -%} diff --git a/website/docs/guides/migration/sl-migration.md b/website/docs/guides/migration/sl-migration.md new file mode 100644 index 00000000000..c9def4537a3 --- /dev/null +++ b/website/docs/guides/migration/sl-migration.md @@ -0,0 +1,119 @@ +--- +title: "Legacy dbt Semantic Layer migration guide" +sidebar_label: "Legacy dbt Semantic Layer migration" +description: "Learn how to migrate from the legacy dbt Semantic Layer to the latest one." +tags: [Semantic Layer] +--- + +The legacy Semantic Layer will be deprecated in H2 2023. Additionally, the `dbt_metrics` package will not be supported in dbt v1.6 and later. If you are using `dbt_metrics`, you'll need to upgrade your configurations before upgrading to v1.6. This guide is for people who have the legacy dbt Semantic Layer setup and would like to migrate to the new dbt Semantic Layer. The estimated migration time is two weeks. + + +## Step 1: Migrate metric configs to the new spec + +The metrics specification in dbt Core changed in v1.6 to support the integration of MetricFlow. It's strongly recommended that you refer to [Build your metrics](/docs/build/build-metrics-intro) before getting started so that you understand the core concepts of the Semantic Layer. + +dbt Labs recommends completing these steps in a local dev environment instead of the IDE: + +1. Create new Semantic Model configs as YAML files in your dbt project.* +1. Upgrade the metrics configs in your project to the new spec (a minimal example of the new spec is sketched at the end of step 3 below).* +1. Delete your old metrics files or remove the `.yml` file extension so they're ignored at parse time. Remove the `dbt_metrics` package from your project. Remove any macros that reference `dbt_metrics`, like `metrics.calculate()`. Make sure that any packages you’re using don't have references to the old metrics spec. +1. Install the CLI with `pip install "dbt-metricflow[your_adapter_name]"`. For example: + + ```bash + pip install "dbt-metricflow[snowflake]" + ``` + **Note** - The MetricFlow CLI is not available in the IDE at this time. Support is coming soon. + +1. Run `dbt parse`.
This parses your project and creates a `semantic_manifest.json` file in your target directory. MetricFlow needs this file to query metrics. If you make changes to your configs, you will need to parse your project again. +1. Run `mf list metrics` to view the metrics in your project. +1. Test querying a metric by running `mf query --metrics <metric_name> --group-by <dimension_name>`. For example: + ```bash + mf query --metrics revenue --group-by metric_time + ``` +1. Run `mf validate-configs` to run semantic and warehouse validations. This ensures your configs are valid and the underlying objects exist in your warehouse. +1. Push these changes to a new branch in your repo. + +**To make this process easier, dbt Labs provides a [custom migration tool](https://github.com/dbt-labs/dbt-converter) that automates these steps for you. You can find installation instructions in the [README](https://github.com/dbt-labs/dbt-converter/blob/master/README.md). Derived metrics aren’t supported in the migration tool, and will have to be migrated manually.* + +## Step 2: Audit metric values after the migration + +You might need to audit metric values during the migration to ensure that the historical values of key business metrics are the same. + +1. In the CLI, query the metric(s) and dimensions you want to test and include the `--explain` option. For example: + ```bash + mf query --metrics orders,revenue --group-by metric_time__month,customer_type --explain + ``` +1. Use the SQL that MetricFlow generates to create a temporary model in your project, like `tmp_orders_revenue_audit.sql`. You will use this temporary model to compare against your legacy metrics. +1. If you haven’t already done so, create a model using `metrics.calculate()` for the metrics you want to compare against. For example: + + ```sql + select * + from {{ metrics.calculate( + [metric('orders'), + metric('revenue')], + grain='week', + dimensions=['metric_time', 'customer_type'], + ) }} + ``` + +1. Run the [dbt-audit-helper](https://github.com/dbt-labs/dbt-audit-helper) package on both models to compare the metric values. + +## Step 3: Setup the Semantic Layer in a new environment + +This step is only relevant to users who want the legacy and new semantic layer to run in parallel for a short time. This will let you recreate content in downstream tools like Hex and Mode with minimal downtime. If you do not need to recreate assets in these tools, skip to step 5. + +1. Create a new deployment environment in dbt Cloud and set the dbt version to 1.6 or higher. +2. Choose `Only run on a custom branch` and point to the branch that has the updated metric definitions. +3. Set the deployment schema to a temporary migration schema, such as `tmp_sl_migration`. Optionally, you can create a new database for the migration. +4. Create a job to parse your project, such as `dbt parse`, and run it. Make sure this job succeeds; there needs to be a successful job in your environment in order to set up the semantic layer. +5. In Account Settings > Projects > Project details, click `Configure the Semantic Layer`. Under **Environment**, select the deployment environment you created in the previous step. Save your configuration. +6. In the Project details page, click `Generate service token` and grant it `Semantic Layer Only` and `Metadata Only` permissions. Save this token securely - you will need it to connect to the semantic layer. + +At this point, both the new semantic layer and the old semantic layer will be running. The new semantic layer will be pointing at your migration branch with the updated metrics definitions.
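+Before updating downstream tools, it can help to see what the new spec looks like end to end. The following is a minimal, illustrative sketch of the kind of semantic model and metric configs you create in step 1; the model, column, and entity names are placeholders, and the full specification lives in [Build your metrics](/docs/build/build-metrics-intro).
+
+```yaml
+semantic_models:
+  - name: orders                # placeholder name
+    model: ref('fct_orders')    # placeholder model
+    defaults:
+      agg_time_dimension: ordered_at
+    entities:
+      - name: order_id
+        type: primary
+    dimensions:
+      - name: ordered_at        # becomes metric_time for metrics built here
+        type: time
+        type_params:
+          time_granularity: day
+      - name: customer_type
+        type: categorical
+    measures:
+      - name: order_total       # assumes a column of the same name
+        agg: sum
+
+metrics:
+  - name: revenue
+    label: Revenue
+    type: simple
+    type_params:
+      measure: order_total
+```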
+ +## Step 4: Update connection in downstream integrations + +Now that your Semantic Layer is set up, you will need to update any downstream integrations that used the legacy Semantic Layer. + +### Migration guide for Hex + +To learn more about integrating with Hex, check out their [documentation](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration#dbt-semantic-layer-integration) for more info. Additionally, refer to [dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. + +1. Set up a new connection for the Semantic Layer for your account. Something to note is that your old connection will still work. The following Loom video guides you in setting up your Semantic Layer with Hex: + + + +2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer. + +3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. + + * **Note** — You will need to update your connection to your production environment once you merge your changes to main. Currently, this connection will be pointing at the semantic layer migration environment + +### Migration guide for Mode + +1. Set up a new connection for the semantic layer for your account. Follow [Mode's docs to setup your connection](https://mode.com/help/articles/supported-databases/#dbt-semantic-layer). + +2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer. + +3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. + +## Step 5: Merge your metrics migration branch to main, and upgrade your production environment to 1.6. + +1. Upgrade your production environment to 1.6 or higher. + * **Note** — The old metrics definitions are no longer valid so your dbt jobs will not pass. + +2. Merge your updated metrics definitions to main. **At this point the legacy semantic layer will no longer work.** + +If you created a new environment in [Step 3](#step-3-setup-the-semantic-layer-in-a-new-environment): + +3. Update your Environment in Account Settings > Project Details > Edit Semantic Layer Configuration to point to your production environment + +4. Delete your migration environment. Be sure to update your connection details in any downstream tools to account for the environment change. 
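+For reference when re-creating assets in Hex, Mode, or other downstream tools: queries to the new Semantic Layer go through the JDBC API rather than the legacy `metrics.calculate()` macro. The following is a rough sketch of the query shapes, with placeholder metric and dimension names; refer to the [JDBC API docs](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) for the exact syntax.
+
+```sql
+-- list metrics, and the dimensions available for one of them
+select * from {{ semantic_layer.metrics() }}
+select * from {{ semantic_layer.dimensions(metrics=['revenue']) }}
+
+-- query a metric grouped by the primary time dimension
+select * from {{ semantic_layer.query(metrics=['revenue'], group_by=['metric_time']) }}
+```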
+ +## Related docs + +- [MetricFlow quickstart guide](/docs/build/sl-getting-started) +- [Example dbt project](https://github.com/dbt-labs/jaffle-sl-template) +- [dbt metrics converter](https://github.com/dbt-labs/dbt-converter) +- [Why we're deprecating the dbt_metrics package](/blog/deprecating-dbt-metrics) blog post +- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) diff --git a/website/docs/guides/migration/tools/refactoring-legacy-sql.md b/website/docs/guides/migration/tools/refactoring-legacy-sql.md index 9dd66abb495..d9acfea6dab 100644 --- a/website/docs/guides/migration/tools/refactoring-legacy-sql.md +++ b/website/docs/guides/migration/tools/refactoring-legacy-sql.md @@ -59,7 +59,7 @@ This allows you to call the same table in multiple places with `{{ src('my_sourc We start here for several reasons: #### Source freshness reporting -Using sources unlocks the ability to run [source freshness reporting](docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale. +Using sources unlocks the ability to run [source freshness reporting](/docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale. #### Easy dependency tracing If you're migrating multiple stored procedures into dbt, with sources you can see which queries depend on the same raw tables. diff --git a/website/docs/guides/migration/versions/00-upgrading-to-v1.7.md b/website/docs/guides/migration/versions/00-upgrading-to-v1.7.md new file mode 100644 index 00000000000..036c734dfb1 --- /dev/null +++ b/website/docs/guides/migration/versions/00-upgrading-to-v1.7.md @@ -0,0 +1,24 @@ +--- +title: "Upgrading to v1.7 (beta)" +description: New features and changes in dbt Core v1.7 +--- + +## Resources + +- [Changelog](https://github.com/dbt-labs/dbt-core/blob/8aaed0e29f9560bc53d9d3e88325a9597318e375/CHANGELOG.md) +- [CLI Installation guide](/docs/core/installation) +- [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) +- [Release schedule](https://github.com/dbt-labs/dbt-core/issues/7481) + +## What to know before upgrading + +dbt Labs is committed to providing backward compatibility for all versions 1.x, with the exception of any changes explicitly mentioned below. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). + +### Behavior changes + +**COMING SOON** + +### Quick hits + +**COMING SOON** + diff --git a/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md b/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md index cb1e9af603d..bdb47bbf2ea 100644 --- a/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md +++ b/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md @@ -1,15 +1,12 @@ --- -title: "Upgrading to v1.6 (prerelease)" +title: "Upgrading to v1.6 (latest)" description: New features and changes in dbt Core v1.6 --- -:::warning Prerelease - -dbt Core v1.6 is available as a release candidate. [Final release is planned for July 31.](https://github.com/dbt-labs/dbt-core/issues/7990) - -Test it out, and [let us know](https://github.com/dbt-labs/dbt-core/issues/new/choose) if you run into any issues! - -::: +dbt Core v1.6 has three significant areas of focus: +1. Next milestone of [multi-project deployments](https://github.com/dbt-labs/dbt-core/discussions/6725): improvements to contracts, groups/access, versions; and building blocks for cross-project `ref` +1. 
Semantic layer re-launch: dbt Core and [MetricFlow](https://docs.getdbt.com/docs/build/about-metricflow) integration +1. Mechanisms to support mature deployment at scale (`dbt clone` and `dbt retry`) ## Resources @@ -24,13 +21,22 @@ dbt Labs is committed to providing backward compatibility for all versions 1.x, ### Behavior changes +:::info Action required if your project defines `metrics` + +The [spec for metrics](https://github.com/dbt-labs/dbt-core/discussions/7456) has changed and now uses [MetricFlow](/docs/build/about-metricflow). + +::: + +If your dbt project defines metrics, you must migrate to dbt v1.6 because the YAML spec has moved from dbt_metrics to MetricFlow. Any tests you have won't compile on v1.5 or older. + - dbt Core v1.6 does not support Python 3.7, which reached End Of Life on June 23. Support Python versions are 3.8, 3.9, 3.10, and 3.11. -- As part of the Semantic layer re-launch (in beta), the spec for `metrics` has changed significantly. Migration guide coming soon: https://github.com/dbt-labs/docs.getdbt.com/pull/3705 -- Manifest schema version is now v10, reflecting [TODO] changes +- As part of the [dbt Semantic layer](/docs/use-dbt-semantic-layer/dbt-sl) re-launch (in beta), the spec for `metrics` has changed significantly. Refer to the [migration guide](/guides/migration/sl-migration) for more info on how to migrate to the re-launched dbt Semantic Layer. +- The manifest schema version is now v10. +- dbt Labs is ending support for Homebrew installation of dbt-core and adapters. See [the discussion](https://github.com/dbt-labs/dbt-core/discussions/8277) for more details. ### For consumers of dbt artifacts (metadata) -The [manifest](/reference/artifacts/manifest-json) schema version has updated to `v10`. Specific changes: +The [manifest](/reference/artifacts/manifest-json) schema version has been updated to `v10`. Specific changes: - Addition of `semantic_models` and changes to `metrics` attributes - Addition of `deprecation_date` as a model property - Addition of `on_configuration_change` as default node configuration (to support materialized views) @@ -43,14 +49,19 @@ For more detailed information and to ask questions, please read and comment on t ## New and changed documentation +### MetricFlow + +- [**Build your metrics**](/docs/build/build-metrics-intro) with MetricFlow, a key component of the dbt Semantic Layer. You can define your metrics and build semantic models with MetricFlow, available on the command line (CLI) for dbt Core v1.6 beta or higher. + ### Materialized views Supported on: - [Postgres](/reference/resource-configs/postgres-configs#materialized-view) - [Redshift](/reference/resource-configs/redshift-configs#materialized-view) -- Snowflake (docs forthcoming) +- [Snowflake](/reference/resource-configs/snowflake-configs#dynamic-tables) +- Databricks (docs forthcoming) -Support for BigQuery and Databricks forthcoming. +Support for BigQuery coming soon. ### New commands for mature deployment @@ -79,3 +90,4 @@ More consistency and flexibility around packages. 
Resources defined in a package - [`dbt debug --connection`](/reference/commands/debug) to test just the data platform connection specified in a profile - [`dbt docs generate --empty-catalog`](/reference/commands/cmd-docs) to skip catalog population while generating docs - [`--defer-state`](/reference/node-selection/defer) enables more-granular control + diff --git a/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md b/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md index 811b57e6a33..0c7fc7ebcad 100644 --- a/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md +++ b/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md @@ -1,5 +1,5 @@ --- -title: "Upgrading to v1.5 (latest)" +title: "Upgrading to v1.5" description: New features and changes in dbt Core v1.5 --- @@ -56,7 +56,57 @@ models: tests: [] # todo! add tests later config: ... ``` -Some options that could previously be specified before a sub-command can now only be specified afterward. For example, `dbt --profiles-dir . run` isn't valid anymore, and instead, you need to use `dbt run --profiles-dir .` + +Some options that could previously be specified _after_ a subcommand can now only be specified _before_. This includes both forms of paired options, such as `--write-json` and `--no-write-json`. The affected options are: + +
      +List of affected options + +```bash +--cache-selected-only | --no-cache-selected-only +--debug, -d | --no-debug +--deprecated-print | --deprecated-no-print +--enable-legacy-logger | --no-enable-legacy-logger +--fail-fast, -x | --no-fail-fast +--log-cache-events | --no-log-cache-events +--log-format +--log-format-file +--log-level +--log-level-file +--log-path +--macro-debugging | --no-macro-debugging +--partial-parse | --no-partial-parse +--partial-parse-file-path +--populate-cache | --no-populate-cache +--print | --no-print +--printer-width +--quiet, -q | --no-quiet +--record-timing-info, -r +--send-anonymous-usage-stats | --no-send-anonymous-usage-stats +--single-threaded | --no-single-threaded +--static-parser | --no-static-parser +--use-colors | --no-use-colors +--use-colors-file | --no-use-colors-file +--use-experimental-parser | --no-use-experimental-parser +--version, -V, -v +--version-check | --no-version-check +--warn-error +--warn-error-options +--write-json | --no-write-json + +``` + +
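+For example, flags from this list are now placed before the subcommand:
+
+```bash
+# v1.5 and later: these global flags precede the subcommand
+dbt --fail-fast --warn-error run
+dbt --quiet test
+```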
+ + +Additionally, some options that could previously be specified _before_ a subcommand can now only be specified _after_. Any option _not_ in the above list must appear _after_ the subcommand from v1.5 and later. For example, `--profiles-dir`. + + +The built-in [collect_freshness](https://github.com/dbt-labs/dbt-core/blob/1.5.latest/core/dbt/include/global_project/macros/adapters/freshness.sql) macro now returns the entire `response` object, instead of just the `table` result. If you're using a custom override for `collect_freshness`, make sure you're also returning the `response` object; otherwise, some of your dbt commands will never finish. For example: + +```sql +{{ return(load_result('collect_freshness')) }} +``` Finally: The [built-in `generate_alias_name` macro](https://github.com/dbt-labs/dbt-core/blob/1.5.latest/core/dbt/include/global_project/macros/get_custom_name/get_custom_alias.sql) now includes logic to handle versioned models. If your project has reimplemented the `generate_alias_name` macro with custom logic, and you want to start using [model versions](/docs/collaborate/govern/model-versions), you will need to update the logic in your macro. Note that, while this is **not** a prerequisite for upgrading to v1.5—only for using the new feature—we recommend that you do this during your upgrade, whether you're planning to use model versions tomorrow or far in the future. diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md index a377554c317..d453106eead 100644 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md +++ b/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md @@ -15,17 +15,17 @@ In some cases, [Airflow](https://airflow.apache.org/) may be the preferred orche ### Airflow + dbt Core -There are so many great examples from Gitlab through their open source data engineering work. Example: [here](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py). This is especially appropriate if you are well-versed in Kubernetes, CI/CD, and docker task management when building your airflow pipelines. If this is you and your team, you’re in good hands reading through more details: [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/) +There are [so many great examples](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py) from GitLab through their open source data engineering work. This is especially appropriate if you are well-versed in Kubernetes, CI/CD, and docker task management when building your airflow pipelines. If this is you and your team, you’re in good hands reading through more details [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/).
### Airflow + dbt Cloud API w/Custom Scripts -This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available: [here](https://registry.astronomer.io/providers/dbt-cloud?type=Sensors&utm_campaign=Monthly%20Product%20Updates&utm_medium=email&_hsmi=208603877&utm_content=208603877&utm_source=hs_email) +This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available [here](https://registry.astronomer.io/providers/dbt-cloud?type=Sensors&utm_campaign=Monthly%20Product%20Updates&utm_medium=email&_hsmi=208603877&utm_content=208603877&utm_source=hs_email). There are many different permutations of this over time: -- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an airflow DAG based on custom python API utilities [here](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py) +- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an airflow DAG based on [custom python API utilities](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py) - [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want -- [Other ways to run dbt in airflow](/docs/deploy/deployments#airflow): Official dbt Docs on how teams are running dbt in airflow +- For more options, check out the [official dbt Docs](/docs/deploy/deployments#airflow) on the various ways teams are running dbt in airflow ## This guide's process diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md b/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md index 048fe637de0..a66259c6c49 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md @@ -1,10 +1,8 @@ --- -title: Customizing CI/CD +title: Customizing CI/CD with Custom Pipelines id: 1-cicd-background --- -# Creating Custom CI/CD Pipelines - One of the core tenets of dbt is that analytic code should be version controlled. This provides a ton of benefit to your organization in terms of collaboration, code consistency, stability, and the ability to roll back to a prior version. There’s an additional benefit that is provided with your code hosting platform that is often overlooked or underutilized. Some of you may have experience using dbt Cloud’s [webhook functionality](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) to run a job when a PR is created. This is a fantastic capability, and meets most use cases for testing your code before merging to production. However, there are circumstances when an organization needs additional functionality, like running workflows on every commit (linting), or running workflows after a merge is complete. In this article, we will show you how to setup custom pipelines to lint your project and trigger a dbt Cloud job via the API. A note on parlance in this article since each code hosting platform uses different terms for similar concepts. The terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch. 
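As a preview of the API call that the pipelines in the following pages automate, here's a minimal sketch of triggering a dbt Cloud job run with cURL. The account ID, job ID, and token are placeholders you'd replace with your own values, and the example assumes the multi-tenant `cloud.getdbt.com` host:

```bash
# Placeholder values — replace with your own dbt Cloud account ID, job ID, and API token
ACCOUNT_ID=12345
JOB_ID=67890
DBT_CLOUD_API_TOKEN="***"

# Trigger a run of the job through the dbt Cloud Administrative API (v2)
curl --request POST \
  --url "https://cloud.getdbt.com/api/v2/accounts/${ACCOUNT_ID}/jobs/${JOB_ID}/run/" \
  --header "Authorization: Token ${DBT_CLOUD_API_TOKEN}" \
  --header "Content-Type: application/json" \
  --data '{"cause": "Triggered by custom CI/CD pipeline"}'
```

The jobs on the following pages wrap a call like this in a pipeline step so it runs automatically on merge or PR creation.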
diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md b/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md index d618f9eec64..d22d1d14284 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md @@ -1,11 +1,11 @@ --- -title: Run a dbt Cloud job on merge +title: Run a dbt Cloud job on merge id: 3-dbt-cloud-job-on-merge --- This job will take a bit more to setup, but is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts presented here can be generalized and used in whatever way best suits your use case. -The setup below shows how to call the dbt Cloud API to run a job every time there is a push to your main branch (The branch where pull requests are typically merged. Commonly referred to as the main, primary, or master branch, but can be named differently). +The setup below shows how to call the dbt Cloud API to run a job every time there's a push to your main branch (The branch where pull requests are typically merged. Commonly referred to as the main, primary, or master branch, but can be named differently). ### 1. Get your dbt Cloud API key diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md b/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md index 8a6f8965b87..b58bab175b3 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md @@ -5,16 +5,16 @@ id: 4-dbt-cloud-job-on-pr :::info Run on PR -If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/slim-ci-jobs). +If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/ci-jobs). This section is only for those projects that connect to their git repository using an SSH key. ::: -If your git provider is not one with a native integration with dbt Cloud, but you still want to take advantage of Slim CI builds, you've come to the right spot! With just a bit of work it's possible to setup a job that will run a dbt Cloud job when a pull request (PR) is created. +If your git provider is not one with a native integration with dbt Cloud, but you still want to take advantage of CI builds, you've come to the right spot! With just a bit of work it's possible to setup a job that will run a dbt Cloud job when a pull request (PR) is created. -The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-3 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)** +The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-5 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)** -### 4. Create a pipeline job that runs when PRs are created +### 6. Create a pipeline job that runs when PRs are created -### 5. Confirm the pipeline runs +### 7. Confirm the pipeline runs Now that you have a new pipeline, it's time to run it and make sure it works. Since this only triggers when a PR is created, you'll need to create a new PR on a branch that contains the code above. 
Once you do that, you should see a pipeline that looks like this: @@ -83,13 +83,13 @@ dbt Cloud job: -### 6. Handle those extra schemas in your database +### 8. Handle those extra schemas in your database As noted above, when the PR job runs it will create a new schema based on the PR. To avoid having your database overwhelmed with PR schemas, consider adding a "cleanup" job to your dbt Cloud account. This job can run on a scheduled basis to cleanup any PR schemas that haven't been updated/used recently. Add this as a macro to your project. It takes 2 arguments that lets you control which schema get dropped: - `age_in_days`: The number of days since the schema was last altered before it should be dropped (default 10 days) - - `databse_to_clean`: The name of the database to remove schemas from + - `database_to_clean`: The name of the database to remove schemas from ```sql {# @@ -128,4 +128,4 @@ Add this as a macro to your project. It takes 2 arguments that lets you control This macro goes into a dbt Cloud job that is run on a schedule. The command will look like this (text below for copy/paste): ![dbt Cloud job showing the run operation command for the cleanup macro](/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png) -`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"` \ No newline at end of file +`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"` diff --git a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md b/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md index 692106655ac..bb1045b3d2f 100644 --- a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md +++ b/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md @@ -9,14 +9,14 @@ hoverSnippet: Learn how to use Databricks workflows to run dbt Cloud jobs Using Databricks workflows to call the dbt Cloud job API can be useful for several reasons: 1. **Integration with other ETL processes** — If you're already running other ETL processes in Databricks, you can use a Databricks workflow to trigger a dbt Cloud job after those processes are done. -2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/dbt-cloud-job). +2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/deploy-jobs). 3. [**Separation of concerns —**](https://en.wikipedia.org/wiki/Separation_of_concerns) Detailed logs for dbt jobs in the dbt Cloud environment can lead to more modularity and efficient debugging. By doing so, it becomes easier to isolate bugs quickly while still being able to see the overall status in Databricks. 4. **Custom job triggering —** Use a Databricks workflow to trigger dbt Cloud jobs based on custom conditions or logic that aren't natively supported by dbt Cloud's scheduling feature. This can give you more flexibility in terms of when and how your dbt Cloud jobs run. 
## Prerequisites - Active [Teams or Enterprise dbt Cloud account](https://www.getdbt.com/pricing/) -- You must have a configured and existing [dbt Cloud job](/docs/deploy/dbt-cloud-job) +- You must have a configured and existing [dbt Cloud deploy job](/docs/deploy/deploy-jobs) - Active Databricks account with access to [Data Science and Engineering workspace](https://docs.databricks.com/workspace-index.html) and [Manage secrets](https://docs.databricks.com/security/secrets/index.html) - [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) - **Note**: You only need to set up your authentication. Once you have set up your Host and Token and are able to run `databricks workspace ls /Users/`, you can proceed with the rest of this guide. diff --git a/website/docs/guides/orchestration/set-up-ci/1-introduction.md b/website/docs/guides/orchestration/set-up-ci/1-introduction.md new file mode 100644 index 00000000000..97df16b4ce1 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/1-introduction.md @@ -0,0 +1,10 @@ +--- +title: "Get started with Continuous Integration tests" +slug: overview +--- + +By validating your code _before_ it goes into production, you don't need to spend your afternoon fielding messages from people whose reports are suddenly broken. + +A solid CI setup is critical to preventing avoidable downtime and broken trust. dbt Cloud uses **sensible defaults** to get you up and running in a performant and cost-effective way in minimal time. + +After that, there's time to get fancy, but let's walk before we run. diff --git a/website/docs/guides/orchestration/set-up-ci/2-quick-setup.md b/website/docs/guides/orchestration/set-up-ci/2-quick-setup.md new file mode 100644 index 00000000000..9b6d46fe2b2 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/2-quick-setup.md @@ -0,0 +1,50 @@ +--- +title: "Baseline: Enable CI in 15 minutes" +slug: in-15-minutes +description: Find issues before they are deployed to production with dbt Cloud's Slim CI. +--- + +In this guide, we're going to add a **CI environment**, where proposed changes can be validated in the context of the entire project without impacting production systems. We will use a single set of deployment credentials (like the Prod environment), but models are built in a separate location to avoid impacting others (like the Dev environment). + +Your git flow will look like this: + + +## Prerequisites + +As part of your initial dbt Cloud setup, you should already have Development and Production environments configured. Let's recap what each does: + +- Your **Development environment** powers the IDE. Each user has individual credentials, and builds into an individual dev schema. Nothing you do here impacts any of your colleagues. +- Your **Production environment** brings the canonical version of your project to life for downstream consumers. There is a single set of deployment credentials, and everything is built into your production schema(s). + +## Step 1: Create a new CI environment + +See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **CI**. Just like your existing Production environment, it will be a Deployment-type environment. + +When setting a Schema in the **Deployment Credentials** area, remember that dbt Cloud will automatically generate a custom schema name for each PR to ensure that they don't interfere with your deployed models. This means you can safely set the same Schema name as your Production job. 
+ +## Step 2: Double-check your Production environment is identified + +Go into your existing Production environment, and ensure that the **Set as Production environment** checkbox is set. It'll make things easier later. + +## Step 3: Create a new job in the CI environment + +Use the **Continuous Integration Job** template, and call the job **CI Check**. + +In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down: + +- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped. +- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs. + +To be able to find modified nodes, dbt needs to have something to compare against. dbt Cloud uses the last successful run of any job in your Production environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). As long as you identified your Production environment in Step 2, you won't need to touch this. If you didn't, pick the right environment from the dropdown. + +## Step 4: Test your process + +That's it! There are other steps you can take to be even more confident in your work, such as [validating your structure follows best practices](/guides/orchestration/set-up-ci/run-dbt-project-evaluator) and [linting your code](/guides/orchestration/set-up-ci/lint-on-push), but this covers the most critical checks. + +To test your new flow, create a new branch in the dbt Cloud IDE then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft). Within a few seconds, you’ll see a new check appear in your git provider. + +## Things to keep in mind + +- If you make a new commit while a CI run based on older code is in progress, it will be automatically canceled and replaced with the fresh code. +- An unlimited number of CI jobs can run at once. If 10 developers all commit code to different PRs at the same time, each person will get their own schema containing their changes. Once each PR is merged, dbt Cloud will drop that schema. +- CI jobs will never block a production run. diff --git a/website/docs/guides/orchestration/set-up-ci/3-run-dbt-project-evaluator.md b/website/docs/guides/orchestration/set-up-ci/3-run-dbt-project-evaluator.md new file mode 100644 index 00000000000..646a9cb42b7 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/3-run-dbt-project-evaluator.md @@ -0,0 +1,46 @@ +--- +title: "Enforce best practices with dbt project evaluator" +slug: run-dbt-project-evaluator +description: dbt Project Evaluator can be run from inside of your existing dbt Cloud CI job to identify common flaws in projects. +--- + +dbt Project Evaluator is a package designed to identify deviations from best practices common to many dbt projects, including modeling, testing, documentation, structure and performance problems. For an introduction to the package, read its [launch blog post](/blog/align-with-dbt-project-evaluator). + +## Step 1: Install the package + +As with all packages, add a reference to `dbt-labs/dbt_project_evaluator` to your `packages.yml` file. 
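A minimal sketch of that entry, assuming `packages.yml` doesn't exist yet (if it does, merge the entry into your existing `packages:` list instead); the version range shown is illustrative, so check the Package Hub for the current release:

```bash
# Create packages.yml with the dbt_project_evaluator package (version range is illustrative)
cat > packages.yml <<'YAML'
packages:
  - package: dbt-labs/dbt_project_evaluator
    version: [">=0.8.0", "<0.9.0"]
YAML

# Pull the package into your project
dbt deps
```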
See the [dbt Package Hub](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest/) for full installation instructions. + +## Step 2: Define test severity with an environment variable + +As noted in the [documentation](https://dbt-labs.github.io/dbt-project-evaluator/latest/ci-check/), tests in the package are set to `warn` severity by default. + +To have these tests fail in CI, create a new environment variable called `DBT_PROJECT_EVALUATOR_SEVERITY`. Set the project-wide default to `warn`, and set it to `error` in the CI environment. + +In your `dbt_project.yml` file, override the severity configuration: + +```yaml +tests: +dbt_project_evaluator: + +severity: "{{ env_var('DBT_PROJECT_EVALUATOR_SEVERITY', 'warn') }}" +``` + +## Step 3: Update your CI commands + +Because these tests should only run after the rest of your project has been built, your existing CI command will need to be updated to exclude the dbt_project_evaluator package. You will then add a second step which builds _only_ the package's models and tests. + +Update your steps to: + +```bash +dbt build --select state:modified+ --exclude package:dbt_project_evaluator +dbt build --select package:dbt_project_evaluator +``` + +## Step 4: Apply any customizations + +Depending on the state of your project when you roll out the evaluator, you may need to skip some tests or allow exceptions for some areas. To do this, refer to the documentation on: + +- [disabling tests](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/customization/) +- [excluding groups of models from a specific test](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/exceptions/) +- [excluding packages or sources/models based on path](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/excluding-packages-and-paths/) + +If you create a seed to exclude groups of models from a specific test, remember to disable the default seed and include `dbt_project_evaluator_exceptions` in your second `dbt build` command above. diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md b/website/docs/guides/orchestration/set-up-ci/4-lint-on-push.md similarity index 55% rename from website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md rename to website/docs/guides/orchestration/set-up-ci/4-lint-on-push.md index dea3b5b5de3..1932ffe1019 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md +++ b/website/docs/guides/orchestration/set-up-ci/4-lint-on-push.md @@ -1,11 +1,12 @@ --- -title: Lint code on push -id: 2-lint-on-push +title: "Run linting checks with SQLFluff" +slug: lint-on-push +description: Enforce your organization's SQL style guide by running SQLFluff in your git workflow whenever new code is pushed. --- -This section shows a very basic example of linting a project every time a commit is pushed to the repo. While it is simple, it shows the power of CI and can be expanded on to meet the needs of your organization. +By [linting](/docs/cloud/dbt-cloud-ide/lint-format#lint) your project during CI, you can ensure that code styling standards are consistently enforced, without spending human time nitpicking comma placement. -The steps below use [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. In the example, it's set to use the `snowflake` dialect, and specifically runs the rules L019, L020, L021, and L022. This is purely for demonstration purposes.
You should update this to reflect your code base's [dialect](https://docs.sqlfluff.com/en/stable/dialects.html) and the [rules](https://docs.sqlfluff.com/en/stable/rules.html) you've established for your repo. +The steps below create an action/pipeline which uses [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. If you don't already have SQLFluff rules defined, check out [our recommended config file](/guides/best-practices/how-we-style/2-how-we-style-our-sql). ### 1. Create a YAML file to define your pipeline @@ -21,8 +22,8 @@ The YAML files defined below are what tell your code hosting platform the steps }> -In order for GitHub to know that you want to run an action, you need to have a few specific folders in your project. Add a new folder named `.github`, and within that folder add a new one named `workflows`. Your final folder structure will look like this: - +GitHub Actions are defined in the `.github/workflows` directory. To define the job for your action, add a new file named `lint_on_push.yml` under the `workflows` folder. Your final folder structure will look like this: + ```sql my_awesome_project ├── .github @@ -30,16 +31,14 @@ my_awesome_project │ │ └── lint_on_push.yml ``` -To define the job for our action, let’s add a new file named `lint_on_push.yml` under the `workflows` folder. This file is how we tell the GitHub runner what to execute when the job is triggered. - -Below I touch on the important pieces for running a dbt Cloud job, but if you want a full run-down of all the components of this YAML file checkout [this GitHub article](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on actions. - **Key pieces:** -- `on:` - this is used to filter when the pipeline is run. In this example we’re running it on every push except for pushes to branches named `main`. For more filters, checkout [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). +- `on:` defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other trigger options, check out [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). - `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job -- `uses:` - remember the virtual servers we coved in the background section? They’re just empty operating systems, so there are two pieces of setup that are needed in order to access the code in your repo, and setup Python correctly on the virtual server. These two actions are called from other repos in GitHub to provide those services. For more information on them, checkout their repos: [actions/checkout](https://github.com/actions/checkout#checkout-v3) and [actions/setup-python](https://github.com/actions/setup-python#setup-python-v3). -- `run:` - this is how we’re telling the GitHub runner to execute the Python script we defined above. +- `uses:` - When the Ubuntu server is created, it is completely empty. [`checkout`](https://github.com/actions/checkout#checkout-v3) and [`setup-python`](https://github.com/actions/setup-python#setup-python-v3) are public GitHub Actions which enable the server to access the code in your repo, and set up Python correctly. +- `run:` - these steps are run at the command line, as though you typed them at a prompt yourself. This will install sqlfluff and lint the project. 
Be sure to set the correct `--dialect` for your project. + +For a full breakdown of the properties in a workflow file, see [Understanding the workflow file](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on GitHub's website. ```yaml name: lint dbt project on push @@ -50,7 +49,7 @@ on: - 'main' jobs: -# this job runs SQLFluff with a specific set of rules + # this job runs SQLFluff with a specific set of rules # note the dialect is set to Snowflake, so make that specific to your setup # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html lint_project: @@ -63,9 +62,9 @@ jobs: with: python-version: "3.9" - name: Install SQLFluff - run: "pip install sqlfluff==0.13.1" + run: "pip install sqlfluff" - name: Lint project - run: "sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022" + run: "sqlfluff lint models --dialect snowflake" ``` @@ -83,7 +82,7 @@ my_awesome_project **Key pieces:** - `image: python:3.9` - this defines the virtual image we’re using to run the job -- `rules:` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event except when the branch is named `main`. Filters are very powerful to run commands on specific events, and you can find a full list in [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules). +- `rules:` - defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other rules, refer to [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules). - `script:` - this is how we’re telling the GitLab runner to execute the Python script we defined above. ```yaml @@ -100,8 +99,8 @@ lint-project: rules: - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main' script: - - pip install sqlfluff==0.13.1 - - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 + - pip install sqlfluff + - sqlfluff lint models --dialect snowflake ``` @@ -118,7 +117,7 @@ my_awesome_project **Key pieces:** - `image: python:3.11.1` - this defines the virtual image we’re using to run the job -- `'**':` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `master`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/) +- `'**':` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `main`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/) - `script:` - this is how we’re telling the Bitbucket runner to execute the Python script we defined above. ```yaml @@ -134,7 +133,7 @@ pipelines: - pip install sqlfluff==0.13.1 - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 - 'master': # override if your default branch doesn't run on a branch named "master" + 'main': # override if your default branch doesn't run on a branch named "main" - step: script: - python --version @@ -145,7 +144,7 @@ pipelines: ### 2. Commit and push your changes to make sure everything works -After you finish creating the YAML files, commit and push your code. 
Doing this will trigger your pipeline for the first time! If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run. +After you finish creating the YAML files, commit and push your code to trigger your pipeline for the first time. If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run. + +## Prerequisites + +- You have the **Development**, **CI**, and **Production** environments, as described in [the Baseline setup](/guides/orchestration/set-up-ci/in-15-minutes). + + +## Step 1: Create a `release` branch in your git repo + +As noted above, this branch will outlive any individual feature, and will be the base of all feature development for a period of time. Your team might choose to create a new branch for each sprint (`qa/sprint-01`, `qa/sprint-02`, etc), tie it to a version of your data product (`qa/1.0`, `qa/1.1`), or just have a single `qa` branch which remains active indefinitely. + +## Step 2: Update your Development environment to use the `qa` branch + +See [Custom branch behavior](/docs/dbt-cloud-environments#custom-branch-behavior). Setting `qa` as your custom branch ensures that the IDE creates new branches and PRs with the correct target, instead of using `main`. + + + +## Step 3: Create a new QA environment + +See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **QA**. Just like your existing Production and CI environments, it will be a Deployment-type environment. + +Set its branch to `qa` as well. + +## Step 4: Create a new job + +Use the **Continuous Integration Job** template, and call the job **QA Check**. + +In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down: + +- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped. +- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs. + +To be able to find modified nodes, dbt needs to have something to compare against. Normally, we use the Production environment as the source of truth, but in this case there will be new code merged into `qa` long before it hits the `main` branch and Production environment. Because of this, we'll want to defer the Release environment to itself. + +### Optional: also add a compile-only job + +dbt Cloud uses the last successful run of any job in that environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). If you have a lot of PRs in flight, the comparison state could switch around regularly. 
+ +Adding a regularly-scheduled job inside of the QA environment whose only command is `dbt compile` can regenerate a more stable manifest for comparison purposes. + +## Step 5: Test your process + +When the Release Manager is ready to cut a new release, they will manually open a PR from `qa` into `main` from their git provider (e.g. GitHub, GitLab, Azure DevOps). dbt Cloud will detect the new PR, at which point the existing check in the CI environment will trigger and run. When using the [baseline configuration](/guides/orchestration/set-up-ci/in-15-minutes), it's possible to kick off the PR creation from inside of the dbt Cloud IDE. Under this paradigm, that button will create PRs targeting your QA branch instead. + +To test your new flow, create a new branch in the dbt Cloud IDE then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft) against your `qa` branch. You'll see the integration tests begin to run. Once they complete, manually create a PR against `main`, and within a few seconds you’ll see the tests run again but this time incorporating all changes from all code that hasn't been merged to main yet. diff --git a/website/docs/guides/orchestration/webhooks/serverless-datadog.md b/website/docs/guides/orchestration/webhooks/serverless-datadog.md index cb03c72c6b5..6bd38869259 100644 --- a/website/docs/guides/orchestration/webhooks/serverless-datadog.md +++ b/website/docs/guides/orchestration/webhooks/serverless-datadog.md @@ -5,7 +5,7 @@ slug: serverless-datadog description: Configure a serverless app to add Datadog logs --- -This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. To do this, when a dbt Cloud job completes it will create a log entry for each node that was run, containing all information about the node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-models). +This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. To do this, when a dbt Cloud job completes it will create a log entry for each node that was run, containing all information about the node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-job-models). In this example, we will use [fly.io](https://fly.io) for hosting/running the service. fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run). @@ -24,7 +24,7 @@ This guide assumes some familiarity with: ### 2. Install `flyctl` and sign up for fly.io -Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: +Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: Switch to the directory containing the repo you cloned in step 1: ```shell @@ -48,11 +48,11 @@ Launching your app publishes it to the web and makes it ready to catch webhook e flyctl launch ``` -You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app. +You will see a message saying that an existing `fly.toml` file was found. 
Type `y` to copy its configuration to your new app. Choose an app name of your choosing, such as `YOUR_COMPANY-dbt-cloud-webhook-datadog`, or leave blank and one will be generated for you. Note that your name can only contain numbers, lowercase letters and dashes. -Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`). +Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`). When asked if you would like to set up Postgresql or Redis databases, type `n` for each. @@ -108,4 +108,4 @@ flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 DD ``` ### 7. Deploy your app -After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**. \ No newline at end of file +After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**. diff --git a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md b/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md index aa95b999d4c..bb3f03ef0c0 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md +++ b/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md @@ -27,7 +27,7 @@ In order to set up the integration, you should have familiarity with: **Note**: To receive the message, add the Zapier app to the team's channel during installation. ### 2. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. diff --git a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md b/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md index 49b01d0db7e..0764c6c7911 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md +++ b/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md @@ -16,7 +16,7 @@ In order to set up the integration, you should have familiarity with: ## Integration steps ### 1. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. 
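If you do validate the webhook (recommended), the check is an HMAC comparison: dbt Cloud sends a hex-encoded HMAC-SHA256 digest of the raw request body in the `Authorization` header, keyed with the secret shown when you created the webhook subscription. A minimal command-line sketch with placeholder values:

```bash
# Placeholders: your webhook secret and the raw request body received from dbt Cloud
WEBHOOK_SECRET="***"
RAW_BODY='{"eventType": "job.run.completed"}'   # illustrative payload only

# Compute the expected signature and compare it to the Authorization header on the request
EXPECTED=$(printf '%s' "$RAW_BODY" | openssl dgst -sha256 -hmac "$WEBHOOK_SECRET" | awk '{print $NF}')
echo "$EXPECTED"
```

This mirrors the validation performed in the code steps later in these guides.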
diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md b/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md index 99680c432b3..f682baae8e2 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md +++ b/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md @@ -22,7 +22,7 @@ In order to set up the integration, you should have familiarity with: ## Integration steps ### 1. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md b/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md index 8751528565c..52a9ae63523 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md +++ b/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md @@ -25,7 +25,7 @@ To set up the integration, you need to be familiar with: To authenticate with the Tableau API, obtain a [Personal Access Token](https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm) from your Tableau Server/Cloud instance. In addition, make sure your Tableau workbook uses data sources that allow refresh access, which is usually set when publishing. ### 2. Create a new Zap in Zapier -To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead. +To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead. Press **Continue**, then copy the webhook URL. diff --git a/website/docs/guides/orchestration/webhooks/zapier-slack.md b/website/docs/guides/orchestration/webhooks/zapier-slack.md index d3b0473502b..c9046ee9943 100644 --- a/website/docs/guides/orchestration/webhooks/zapier-slack.md +++ b/website/docs/guides/orchestration/webhooks/zapier-slack.md @@ -25,7 +25,7 @@ In order to set up the integration, you should have familiarity with: ## Integration steps ### 1. Create a new Zap in Zapier -Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) 
then you can choose **Catch Hook** instead. Click **Continue**, then copy the webhook URL. diff --git a/website/docs/quickstarts/bigquery-qs.md b/website/docs/quickstarts/bigquery-qs.md index 84e3b3ae545..7f7f9aa7655 100644 --- a/website/docs/quickstarts/bigquery-qs.md +++ b/website/docs/quickstarts/bigquery-qs.md @@ -33,8 +33,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen ### Related content - Learn more with [dbt Courses](https://courses.getdbt.com/collections) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) diff --git a/website/docs/quickstarts/databricks-qs.md b/website/docs/quickstarts/databricks-qs.md index 1222ef2a7d5..08334862517 100644 --- a/website/docs/quickstarts/databricks-qs.md +++ b/website/docs/quickstarts/databricks-qs.md @@ -30,8 +30,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen ### Related content - Learn more with [dbt Courses](https://courses.getdbt.com/collections) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) diff --git a/website/docs/quickstarts/manual-install-qs.md b/website/docs/quickstarts/manual-install-qs.md index ea3c6c7ec84..05336178ff6 100644 --- a/website/docs/quickstarts/manual-install-qs.md +++ b/website/docs/quickstarts/manual-install-qs.md @@ -18,11 +18,11 @@ When you use dbt Core to work with dbt, you will be editing files locally using * Complete [Setting up (in BigQuery)](/quickstarts/bigquery?step=2) and [Loading data (BigQuery)](/quickstarts/bigquery?step=3). * [Create a GitHub account](https://github.com/join) if you don't already have one. -## Create a starter project +### Create a starter project After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models. -### Create a repository +## Create a repository The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join). @@ -32,7 +32,7 @@ The following steps use [GitHub](https://github.com/) as the Git provider for th 4. Click **Create repository**. 5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes). -### Create a project +## Create a project Learn how to use a series of commands using the command line of the Terminal to create your project. dbt Core includes an `init` command that helps scaffold a dbt project. @@ -40,56 +40,56 @@ To create your dbt project: 1. Make sure you have dbt Core installed and check the version using the `dbt --version` command: - ```terminal - dbt --version - ``` +```shell +dbt --version +``` 2. Initiate the `jaffle_shop` project using the `init` command: - ```terminal - dbt init jaffle_shop - ``` +```shell +dbt init jaffle_shop +``` 3. Navigate into your project's directory: - ```terminal - cd jaffle_shop - ``` +```shell +cd jaffle_shop +``` 4. 
Use `pwd` to confirm that you are in the right spot: - ```terminal - $ pwd - > Users/BBaggins/dbt-tutorial/jaffle_shop - ``` +```shell +$ pwd +> Users/BBaggins/dbt-tutorial/jaffle_shop +``` 5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command. -
      - -
      +
      + +
      6. Update the following values in the `dbt_project.yml` file: - + - ```yaml - name: jaffle_shop # Change from the default, `my_new_project` +```yaml +name: jaffle_shop # Change from the default, `my_new_project` - ... +... - profile: jaffle_shop # Change from the default profile name, `default` +profile: jaffle_shop # Change from the default profile name, `default` - ... +... - models: - jaffle_shop: # Change from `my_new_project` to match the previous value for `name:` - ... - ``` +models: + jaffle_shop: # Change from `my_new_project` to match the previous value for `name:` + ... +``` - + -### Connect to BigQuery +## Connect to BigQuery When developing locally, dbt connects to your using a [profile](/docs/core/connect-data-platform/connection-profiles), which is a YAML file with all the connection details to your warehouse. @@ -97,38 +97,38 @@ When developing locally, dbt connects to your using 2. Move your BigQuery keyfile into this directory. 3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted. - - - ```yaml - jaffle_shop: # this needs to match the profile in your dbt_project.yml file - target: dev - outputs: - dev: - type: bigquery - method: service-account - keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile - project: grand-highway-265418 # Replace this with your project id - dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo - threads: 1 - timeout_seconds: 300 - location: US - priority: interactive - ``` - - + + +```yaml +jaffle_shop: # this needs to match the profile in your dbt_project.yml file + target: dev + outputs: + dev: + type: bigquery + method: service-account + keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile + project: grand-highway-265418 # Replace this with your project id + dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo + threads: 1 + timeout_seconds: 300 + location: US + priority: interactive +``` + + 4. Run the `debug` command from your project to confirm that you can successfully connect: - ```terminal - $ dbt debug - > Connection test: OK connection ok - ``` +```shell +$ dbt debug +> Connection test: OK connection ok +``` -
      - -
      +
      + +
      -#### FAQs +### FAQs @@ -136,69 +136,72 @@ When developing locally, dbt connects to your using -### Perform your first dbt run +## Perform your first dbt run Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order. 1. Enter the `run` command to build example models: - ```terminal - dbt run - ``` +```shell +dbt run +``` You should have an output that looks like this: +
      -### Commit your changes +## Commit your changes Commit your changes so that the repository contains the latest code. 1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository). - ```terminal - git init - git branch -M main - git add . - git commit -m "Create a dbt project" - git remote add origin https://github.com/USERNAME/dbt-tutorial.git - git push -u origin main - ``` +```shell +git init +git branch -M main +git add . +git commit -m "Create a dbt project" +git remote add origin https://github.com/USERNAME/dbt-tutorial.git +git push -u origin main +``` 2. Return to your GitHub repository to verify your new files have been added. -## Build your first models +### Build your first models -Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! You will take a sample query and turn it into a model in your dbt project. +Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! +In the next steps, you will take a sample query and turn it into a model in your dbt project. -### Checkout a new git branch +## Checkout a new git branch Check out a new git branch to work on new code: 1. Create a new branch by using the `checkout` command and passing the `-b` flag: - ```terminal - $ git checkout -b add-customers-model - > Switched to a new branch `add-customer-model` - ``` +```shell +$ git checkout -b add-customers-model +> Switched to a new branch `add-customer-model` +``` + +## Build your first model -### Build your first model 1. Open your project in your favorite code editor. 2. Create a new SQL file in the `models` directory, named `models/customers.sql`. 3. Paste the following query into the `models/customers.sql` file. - + 4. From the command line, enter `dbt run`. -
      - -
      +
      + +
      When you return to the BigQuery console, you can `select` from this model. -#### FAQs +### FAQs @@ -206,210 +209,210 @@ When you return to the BigQuery console, you can `select` from this model. -### Change the way your model is materialized +## Change the way your model is materialized -### Delete the example models +## Delete the example models -### Build models on top of other models +## Build models on top of other models 1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. 2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. - + -
      +
      - + - ```sql - select - id as customer_id, - first_name, - last_name +```sql +select + id as customer_id, + first_name, + last_name - from `dbt-tutorial`.jaffle_shop.customers - ``` +from `dbt-tutorial`.jaffle_shop.customers +``` - + - + - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status - from `dbt-tutorial`.jaffle_shop.orders - ``` +from `dbt-tutorial`.jaffle_shop.orders +``` - + -
      +
      -
      +
      - + - ```sql - select - id as customer_id, - first_name, - last_name +```sql +select + id as customer_id, + first_name, + last_name - from jaffle_shop_customers - ``` +from jaffle_shop_customers +``` - + - + - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status - from jaffle_shop_orders - ``` +from jaffle_shop_orders +``` - + -
      +
      -
      +
      - + - ```sql - select - id as customer_id, - first_name, - last_name +```sql +select + id as customer_id, + first_name, + last_name - from jaffle_shop.customers - ``` +from jaffle_shop.customers +``` - + - + - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status - from jaffle_shop.orders - ``` +from jaffle_shop.orders +``` - + -
      +
      -
      +
      - + - ```sql - select - id as customer_id, - first_name, - last_name +```sql +select + id as customer_id, + first_name, + last_name - from raw.jaffle_shop.customers - ``` +from raw.jaffle_shop.customers +``` - + - + - ```sql - select - id as order_id, - user_id as customer_id, - order_date, - status +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status - from raw.jaffle_shop.orders - ``` +from raw.jaffle_shop.orders +``` - + -
      +
      -
      +
      3. Edit the SQL in your `models/customers.sql` file as follows: - + + +```sql +with customers as ( - ```sql - with customers as ( + select * from {{ ref('stg_customers') }} - select * from {{ ref('stg_customers') }} +), - ), +orders as ( - orders as ( + select * from {{ ref('stg_orders') }} - select * from {{ ref('stg_orders') }} +), - ), +customer_orders as ( - customer_orders as ( + select + customer_id, - select - customer_id, + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders - min(order_date) as first_order_date, - max(order_date) as most_recent_order_date, - count(order_id) as number_of_orders + from orders - from orders + group by 1 - group by 1 +), - ), +final as ( - final as ( + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders - select - customers.customer_id, - customers.first_name, - customers.last_name, - customer_orders.first_order_date, - customer_orders.most_recent_order_date, - coalesce(customer_orders.number_of_orders, 0) as number_of_orders + from customers - from customers + left join customer_orders using (customer_id) - left join customer_orders using (customer_id) +) - ) +select * from final - select * from final - - ``` +``` - + 4. Execute `dbt run`. - This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. +This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. -#### FAQs {#faq-2} +### FAQs {#faq-2} @@ -424,13 +427,11 @@ You can also explore: * The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL. * The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the select statements that are running and the python logging happening when dbt runs. -## Test and document your project - -### Add tests to your models +## Add tests to your models -### Document your models +## Document your models @@ -446,7 +447,7 @@ You can also explore: -### Commit updated changes +## Commit updated changes You need to commit the changes you made to the project so that the repository has your latest code. @@ -457,4 +458,10 @@ You need to commit the changes you made to the project so that the repository ha ## Schedule a job -We recommend using dbt Cloud to schedule a job. For more information about using dbt Core to schedule a job, see [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post or [deployments](/docs/deploy/deployments). +We recommend using dbt Cloud as the easiest and most reliable way to [deploy jobs](/docs/deploy/deployments) and automate your dbt project in production. + +For more info on how to get started, refer to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs). 
+ + + +For more information about using dbt Core to schedule a job, refer [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post. diff --git a/website/docs/quickstarts/redshift-qs.md b/website/docs/quickstarts/redshift-qs.md index fc7e178f163..67f66d6e275 100644 --- a/website/docs/quickstarts/redshift-qs.md +++ b/website/docs/quickstarts/redshift-qs.md @@ -31,8 +31,8 @@ You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamen ### Related content - Learn more with [dbt Courses](https://courses.getdbt.com/collections) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) diff --git a/website/docs/quickstarts/snowflake-qs.md b/website/docs/quickstarts/snowflake-qs.md index 6d03586e611..33e253e8c15 100644 --- a/website/docs/quickstarts/snowflake-qs.md +++ b/website/docs/quickstarts/snowflake-qs.md @@ -35,8 +35,8 @@ You can also watch the [YouTube video on dbt and Snowflake](https://www.youtube. - Learn more with [dbt Courses](https://courses.getdbt.com/collections) - [How we configure Snowflake](https://blog.getdbt.com/how-we-configure-snowflake/) -- [dbt Cloud CI job](/docs/deploy/continuous-integration) -- [Job triggers](/docs/deploy/job-triggers) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) - [Job notifications](/docs/deploy/job-notifications) - [Source freshness](/docs/deploy/source-freshness) @@ -138,7 +138,7 @@ There are two ways to connect dbt Cloud to Snowflake. The first option is Partne -Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials. +Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials. 1. In the Snowflake UI, click on the home icon in the upper left corner. In the left sidebar, select **Admin**. Then, select **Partner Connect**. Find the dbt tile by scrolling or by searching for dbt in the search bar. Click the tile to connect to dbt. diff --git a/website/docs/quickstarts/starburst-galaxy-qs.md b/website/docs/quickstarts/starburst-galaxy-qs.md index d9bd3b98a43..33228710509 100644 --- a/website/docs/quickstarts/starburst-galaxy-qs.md +++ b/website/docs/quickstarts/starburst-galaxy-qs.md @@ -68,7 +68,7 @@ Using Starburst Galaxy, you can create tables and also transform them with dbt. ``` ## Connect Starburst Galaxy to the Amazon S3 bucket {#connect-to-s3-bucket} -If your Starburst Galaxy instance is not already connected to your S3 bucket, you need to create a cluster, create a catalog that connects to the S3 bucket, associate the new catalog (your data source) to your new cluster, and configure privilege settings. +If your Starburst Galaxy instance is not already connected to your S3 bucket, you need to create a cluster, configure a catalog that allows Starburst Galaxy to connect to the S3 bucket, add the catalog to your new cluster, and configure privilege settings. 
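Once the catalog is attached to a running cluster, you query it with Starburst's three-part (`catalog.schema.table`) naming — for example (a sketch; the catalog and schema names here are placeholders for the ones you create in the following steps):

```sql
-- placeholder names; substitute the catalog and schema you configure below
select *
from my_s3_catalog.jaffle_shop.customers
limit 10;
```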
In addition to Amazon S3, Starburst Galaxy supports many other data sources. To learn more about them, you can refer to the [Catalogs overview](https://docs.starburst.io/starburst-galaxy/catalogs/index.html) in the Starburst Galaxy docs. @@ -79,11 +79,11 @@ In addition to Amazon S3, Starburst Galaxy supports many other data sources. To When done, click **Create cluster**. -1. Create a catalog. Click **Catalogs** on the left sidebar of the Starburst Galaxy UI, then click **Create catalog** in the main body of the page. -2. On the **Create a data source** page, select the Amazon S3 tile. -3. In the **Name and description** section of the **Amazon S3** page, fill out the fields. -4. In the **Authentication to S3** section of the **Amazon S3** page, select the [AWS (S3) authentication mechanism](#prerequisites) you chose to connect with. -5. In the **Metastore configuration** section, set these options: +3. Create a catalog. Click **Catalogs** on the left sidebar of the Starburst Galaxy UI, then click **Create catalog** in the main body of the page. +4. On the **Create a data source** page, select the Amazon S3 tile. +5. In the **Name and description** section of the **Amazon S3** page, fill out the fields. +6. In the **Authentication to S3** section of the **Amazon S3** page, select the [AWS (S3) authentication mechanism](#prerequisites) you chose to connect with. +7. In the **Metastore configuration** section, set these options: - **Default S3 bucket name** — Enter the name of your S3 bucket you want to access. - **Default directory name** — Enter the folder name of where the Jaffle Shop data lives in the S3 bucket. This is the same folder name you used in [Load data to an Amazon S3 bucket](#load-data-to-s3). - **Allow creating external tables** — Enable this option. @@ -93,19 +93,19 @@ In addition to Amazon S3, Starburst Galaxy supports many other data sources. To -7. Click **Test connection**. This verifies that Starburst Galaxy can access your S3 bucket. -8. Click **Connect catalog** if the connection test passes. +8. Click **Test connection**. This verifies that Starburst Galaxy can access your S3 bucket. +9. Click **Connect catalog** if the connection test passes. -9. On the **Set permissions** page, click **Skip**. You can add permissions later if you want. -10. On the **Add to cluster** page, choose the cluster you want to add the data source to from the dropdown and click **Add to cluster**. -11. Add the location privilege for your S3 bucket to your role in Starburst Galaxy. Click **Access control > Roles and privileges** on the left sidebar of the Starburst Galaxy UI. Then, in the **Roles** table, click the role name **accountadmin**. +10. On the **Set permissions** page, click **Skip**. You can add permissions later if you want. +11. On the **Add to cluster** page, choose the cluster you want to add the catalog to from the dropdown and click **Add to cluster**. +12. Add the location privilege for your S3 bucket to your role in Starburst Galaxy. Click **Access control > Roles and privileges** on the left sidebar of the Starburst Galaxy UI. Then, in the **Roles** table, click the role name **accountadmin**. If you're using an existing Starburst Galaxy cluster and don't have access to the accountadmin role, then select a role that you do have access to. To learn more about access control, refer to [Access control](https://docs.starburst.io/starburst-galaxy/security/access-control.html) in the Starburst Galaxy docs. -1. 
On the **Roles** page, click the **Privileges** tab and click **Add privilege**. -2. On the **Add privilege** page, set these options: +13. On the **Roles** page, click the **Privileges** tab and click **Add privilege**. +14. On the **Add privilege** page, set these options: - **What would you like to modify privileges for?** — Choose **Location**. - **Enter a storage location provide** — Enter the storage location of _your S3 bucket_ and the folder of where the Jaffle Shop data lives. Make sure to include the `/*` at the end of the location. - **Create SQL** — Enable the option. @@ -115,7 +115,7 @@ In addition to Amazon S3, Starburst Galaxy supports many other data sources. To ## Create tables with Starburst Galaxy -To query the Jaffle Shop data with Starburst Galaxy, you need to create tables using the Jaffle Shop data that you [loaded to your S3 bucket](#load-data-to-s3). You can do this (and run any SQL statement) from the [query editor](https://docs.starburst.io/starburst-galaxy/query/index.html). +To query the Jaffle Shop data with Starburst Galaxy, you need to create tables using the Jaffle Shop data that you [loaded to your S3 bucket](#load-data-to-s3). You can do this (and run any SQL statement) from the [query editor](https://docs.starburst.io/starburst-galaxy/query/query-editor.html). 1. Click **Query > Query editor** on the left sidebar of the Starburst Galaxy UI. The main body of the page is now the query editor. 2. Configure the query editor so it queries your S3 bucket. In the upper right corner of the query editor, select your cluster in the first gray box and select your catalog in the second gray box: diff --git a/website/docs/reference/analysis-properties.md b/website/docs/reference/analysis-properties.md index 008da70f9db..fbc7b05538f 100644 --- a/website/docs/reference/analysis-properties.md +++ b/website/docs/reference/analysis-properties.md @@ -28,10 +28,3 @@ analyses: ``` - - - - -* `v0.16.0`: The ability to declare analysis properties was introduced. - - diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md index b20c1548d99..859fde7c908 100644 --- a/website/docs/reference/artifacts/dbt-artifacts.md +++ b/website/docs/reference/artifacts/dbt-artifacts.md @@ -3,12 +3,15 @@ title: "About dbt artifacts" sidebar_label: "About dbt artifacts" --- -With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power: +With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`semantic_manifest.json`, `manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power: + - [documentation](/docs/collaborate/documentation) - [state](/reference/node-selection/syntax#about-node-selection) - [visualizing source freshness](/docs/build/sources#snapshotting-source-data-freshness) They could also be used to: + +- gain insights into your [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) - calculate project-level test coverage - perform longitudinal analysis of run timing - identify historical changes in structure @@ -19,6 +22,7 @@ dbt has produced artifacts since the release of dbt-docs in v0.11.0. Starting in ## When are artifacts produced? 
Most dbt commands (and corresponding RPC methods) produce artifacts: +- [semantic manifest](/docs/dbt-cloud-apis/sl-manifest): Lives in the `/target` directory of your dbt project and stores various artifacts (such as compiled models and tests) generated during the execution of your project. - [manifest](/reference/artifacts/manifest-json): produced by commands that read and understand your project - [run results](/reference/artifacts/run-results-json): produced by commands that run, compile, or catalog nodes in your DAG - [catalog](catalog-json): produced by `docs generate` @@ -26,8 +30,6 @@ Most dbt commands (and corresponding RPC methods) produce artifacts: ## Common metadata -New in v0.19.0 - All artifacts produced by dbt include a `metadata` dictionary with these properties: - `dbt_version`: Version of dbt that produced this artifact. diff --git a/website/docs/reference/artifacts/manifest-json.md b/website/docs/reference/artifacts/manifest-json.md index 3a916ed6d4c..5e8dcedd2d5 100644 --- a/website/docs/reference/artifacts/manifest-json.md +++ b/website/docs/reference/artifacts/manifest-json.md @@ -53,12 +53,4 @@ You can refer to [dbt JSON Schema](https://schemas.getdbt.com/) for info on desc **Note**: The `manifest.json` version number is related to (but not _equal_ to) your dbt version, so you _must_ use the correct `manifest.json` version for your dbt version. To find the correct `manifest.json` version, select the dbt version on the top navigation (such as `v1.5`). -Use the following table to understand how the versioning pattern works and match the Manifest version with the dbt version: - -| dbt version | Manifest version | -| ----------- | ---------------- | -| `v1.5` | [Manifest v9](https://schemas.getdbt.com/dbt/manifest/v9/index.html) -| `v1.4` | [Manifest v8](https://schemas.getdbt.com/dbt/manifest/v8/index.html) -| `v1.3` | [Manifest v7](https://schemas.getdbt.com/dbt/manifest/v7/index.html) -| `v1.2` | [Manifest v6](https://schemas.getdbt.com/dbt/manifest/v6/index.html) -| `v1.1` | [Manifest v5](https://schemas.getdbt.com/dbt/manifest/v5/index.html) +Refer to the table at the beginning of [this page](/reference/artifacts/manifest-json) to understand how the Manifest version matches the dbt version. diff --git a/website/docs/reference/commands/clean.md b/website/docs/reference/commands/clean.md index 0185b701740..23a3f6080ce 100644 --- a/website/docs/reference/commands/clean.md +++ b/website/docs/reference/commands/clean.md @@ -4,12 +4,6 @@ sidebar_label: "clean" id: "clean" --- - - -- **v1.0.0:** `dbt_modules` has been replaced by `dbt_packages` by default for the [clean-target](/reference/project-configs/clean-targets) for packages. - - - `dbt clean` is a utility function that deletes all folders specified in the [`clean-targets`](/reference/project-configs/clean-targets) list specified in `dbt_project.yml`. You can use this to delete the `dbt_packages` and `target` directories. To avoid complex permissions issues and potentially deleting crucial aspects of the remote file system without access to fix them, this command does not work when interfacing with the RPC server that powers the dbt Cloud IDE. Instead, when working in dbt Cloud, the `dbt deps` command cleans before it installs packages automatically. The `target` folder can be manually deleted from the sidebar file tree if needed. 
diff --git a/website/docs/reference/commands/clone.md b/website/docs/reference/commands/clone.md index 32c8a89be04..a3c8bb236c7 100644 --- a/website/docs/reference/commands/clone.md +++ b/website/docs/reference/commands/clone.md @@ -13,7 +13,7 @@ The `dbt clone` command clones selected nodes from the [specified state](/refere The `clone` command is useful for: - blue/green continuous deployment (on data warehouses that support zero-copy cloning tables) - cloning current production state into development schema(s) -- handling incremental models in Slim CI dbt Cloud jobs (on data warehouses that support zero-copy cloning tables) +- handling incremental models in dbt Cloud CI jobs (on data warehouses that support zero-copy cloning tables) - testing code changes on downstream dependencies in your BI tool ```bash diff --git a/website/docs/reference/commands/compile.md b/website/docs/reference/commands/compile.md index 97d989a140b..ed403d2af32 100644 --- a/website/docs/reference/commands/compile.md +++ b/website/docs/reference/commands/compile.md @@ -67,8 +67,8 @@ select * from renamed The command accesses the data platform to cache-related metadata, and to run introspective queries. Use the flags: -- `--no-populate-cache` to disable the initial cache population. If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query. -- `--no-introspect` to disable introspective queries. dbt will raise an error if a model's definition requires running one. +- `--no-populate-cache` to disable the initial cache population. If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query. This is a `dbt` flag, which means you need to add `dbt` as a prefix. For example: `dbt --no-populate-cache`. +- `--no-introspect` to disable [introspective queries](/faqs/warehouse/db-connection-dbt-compile#introspective-queries). dbt will raise an error if a model's definition requires running one. This is a `dbt compile` flag, which means you need to add `dbt compile` as a prefix. For example:`dbt compile --no-introspect`. ### FAQs diff --git a/website/docs/reference/commands/init.md b/website/docs/reference/commands/init.md index 468bee5ff60..873647814ec 100644 --- a/website/docs/reference/commands/init.md +++ b/website/docs/reference/commands/init.md @@ -29,35 +29,6 @@ If you've just cloned or downloaded an existing dbt project, `dbt init` can stil - **Existing project:** If you're the maintainer of an existing project, and you want to help new users get connected to your database quickly and easily, you can include your own custom `profile_template.yml` in the root of your project, alongside `dbt_project.yml`. For common connection attributes, set the values in `fixed`; leave user-specific attributes in `prompts`, but with custom hints and defaults as you'd like. 
- - - - -```yml -fixed: - account: abc123 - authenticator: externalbrowser - database: analytics - role: transformer - type: snowflake - warehouse: transforming -prompts: - user: - type: string - hint: yourname@jaffleshop.com - schema: - type: string - hint: usually dbt_ - threads: - hint: "your favorite number, 1-10" - type: int - default: 8 -``` - - - - - diff --git a/website/docs/reference/commands/retry.md b/website/docs/reference/commands/retry.md index 0c010ede2c1..d494a46cf1f 100644 --- a/website/docs/reference/commands/retry.md +++ b/website/docs/reference/commands/retry.md @@ -4,6 +4,14 @@ sidebar_label: "retry" id: "retry" --- +:::info Support in dbt Cloud + +`dbt retry` is supported in the dbt Cloud IDE. + +Native support for restarting scheduled runs from point of failure is currently in development & coming soon. + +::: + `dbt retry` re-executes the last `dbt` command from the node point of failure. If the previously executed `dbt` command was successful, `retry` will finish as `no operation`. Retry works with the following commands: @@ -20,3 +28,80 @@ Retry works with the following commands: `dbt retry` reuses the [selectors](/reference/node-selection/yaml-selectors) from the previously executed command. + +Example results of executing `dbt retry` after a successful `dbt run`: + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Nothing to do. Try checking your model configs and model specification args +``` + +Example of when `dbt run` encounters a syntax error in a model: + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 5 START sql view model main.stg_customers ................................. [RUN] +2 of 5 START sql view model main.stg_orders .................................... [RUN] +3 of 5 START sql view model main.stg_payments .................................. [RUN] +1 of 5 OK created sql view model main.stg_customers ............................ [OK in 0.06s] +2 of 5 OK created sql view model main.stg_orders ............................... [OK in 0.06s] +3 of 5 OK created sql view model main.stg_payments ............................. [OK in 0.07s] +4 of 5 START sql table model main.customers .................................... [RUN] +5 of 5 START sql table model main.orders ....................................... [RUN] +4 of 5 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s] +5 of 5 OK created sql table model main.orders .................................. [OK in 0.04s] + +Finished running 3 view models, 2 table models in 0 hours 0 minutes and 0.15 seconds (0.15s). + +Completed with 1 error and 0 warnings: + +Runtime Error in model customers (models/customers.sql) + Parser Error: syntax error at or near "selct" + +Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5 +``` + + +Example of a subsequent failed `dbt retry` run without fixing the error(s): + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 1 START sql table model main.customers .................................... 
[RUN] +1 of 1 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s] + +Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5 +``` + +Example of a successful `dbt retry` run after fixing error(s): + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 1 START sql table model main.customers .................................... [RUN] +1 of 1 OK created sql table model main.customers ............................... [OK in 0.05s] + +Finished running 1 table model in 0 hours 0 minutes and 0.09 seconds (0.09s). + +Completed successfully + +Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 +``` + +In each scenario `dbt retry` picks up from the error rather than running all of the upstream dependencies again. diff --git a/website/docs/reference/commands/rpc.md b/website/docs/reference/commands/rpc.md index a98799356ee..809eadee639 100644 --- a/website/docs/reference/commands/rpc.md +++ b/website/docs/reference/commands/rpc.md @@ -5,22 +5,18 @@ id: "rpc" description: "Remote Procedure Call (rpc) dbt server compiles and runs queries, and provides methods that enable you to list and terminate running processes. " --- - +:::caution The dbt-rpc plugin is deprecated - - **v0.14**: The `dbt rpc` command was introduced to dbt Core - - **v1.0**: We now distribute and package the Remote Procedure Call (rpc) server functionality separately from `dbt-core`. You can find the code in a dedicated [`dbt-rpc` repository](https://github.com/dbt-labs/dbt-rpc). - +dbt Labs actively maintained `dbt-rpc` for compatibility with dbt-core versions up to v1.5. Starting with dbt-core v1.6 (released in July 2023), `dbt-rpc` is no longer supported for ongoing compatibility. -### Overview +In the meantime, dbt Labs will be performing critical maintenance only for `dbt-rpc`, until the last compatible version of dbt-core has reached the [end of official support](/docs/dbt-versions/core#latest-releases). At that point, dbt Labs will archive this repository to be read-only. -You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context. +::: -:::caution Deprecation -**The dbt-rpc plugin will be fully deprecated by the second half of 2023.** +### Overview -dbt Labs is actively maintaining `dbt-rpc` up to dbt v1.4. Starting in v1.5, we intend to break `dbt-rpc` compatibility in favor of [the new dbt Server](https://github.com/dbt-labs/dbt-server). dbt Labs will perform critical maintenance only on `dbt-rpc`, until the last compatible version of dbt has reached the end of official support (thus 12 months after release of v1.4; [see Core version policies](/docs/dbt-versions/core)). -::: +You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. 
The server will compile the project into memory, then accept requests to operate against that project's dbt context. :::caution Running on Windows We do not recommend running the rpc server on Windows because of reliability issues. A Docker container may provide a useful workaround, if required. diff --git a/website/docs/reference/commands/run.md b/website/docs/reference/commands/run.md index fbc1a513cb1..557d0d71338 100644 --- a/website/docs/reference/commands/run.md +++ b/website/docs/reference/commands/run.md @@ -71,32 +71,12 @@ For more information on running parents or children of specific models, see the ## Treat warnings as errors - - -- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0 - - - -See [global configs](/reference/global-configs/failing-fast) +See [global configs](/reference/global-configs/warnings) ## Failing fast - - -- The `--fail-fast` flag is new in dbt v0.17.0 -- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0 - - - See [global configs](/reference/global-configs/failing-fast) ## Enable or Disable Colorized Logs - - -- The `--use-colors` and `--no-use-colors` flags are new in dbt v0.18.0 -- Moved to [global configs](/reference/global-configs/about-global-configs) in v1.0 - - - See [global configs](/reference/global-configs/print-output#print-color) diff --git a/website/docs/reference/commands/seed.md b/website/docs/reference/commands/seed.md index 272a2a7f2a9..8a410706842 100644 --- a/website/docs/reference/commands/seed.md +++ b/website/docs/reference/commands/seed.md @@ -4,20 +4,11 @@ sidebar_label: "seed" id: "seed" --- - - -- **v1.0.0:** The default config for this command will now be `seed-paths` instead of `data-paths`. - - - - The `dbt seed` command will load `csv` files located in the `seed-paths` directory of your dbt project into your . ### Selecting seeds to run - Added in v0.16.0 - Specific seeds can be run using the `--select` flag to `dbt seed`. Example: ``` diff --git a/website/docs/reference/dbt-classes.md b/website/docs/reference/dbt-classes.md index 18569fce3b0..13f9263e545 100644 --- a/website/docs/reference/dbt-classes.md +++ b/website/docs/reference/dbt-classes.md @@ -10,6 +10,7 @@ These classes are often useful when building advanced dbt models and macros. The `Relation` object is used to interpolate schema and names into SQL code with appropriate quoting. This object should _always_ be used instead of interpolating values with `{{ schema }}.{{ table }}` directly. Quoting of the Relation object can be configured using the [`quoting` config](/reference/project-configs/quoting). + ### Creating relations A `Relation` can be created by calling the `create` class method on the `Relation` class. @@ -32,6 +33,7 @@ class Relation: ### Using relations +In addition to `api.Relation.create`, dbt returns a Relation when you use [`ref`](/reference/dbt-jinja-functions/ref), [`source`](/reference/dbt-jinja-functions/source) or [`this`](/reference/dbt-jinja-functions/this). ```jinja2 @@ -84,6 +86,7 @@ col = Column('name', 'varchar', 255) col.is_string() # True col.is_numeric() # False col.is_number() # False +col.is_integer() # False col.is_float() # False col.string_type() # character varying(255) col.numeric_type('numeric', 12, 4) # numeric(12,4) @@ -101,15 +104,10 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4) ### Instance methods - - - The `is_number` and `is_float` instance methods were added dbt v0.16.0 - - - - **is_string()**: Returns True if the column is a String type (eg. 
text, varchar), else False - **is_numeric()**: Returns True if the column is a fixed-precision Numeric type (eg. `numeric`), else False - **is_number()**: Returns True if the column is a number-y type (eg. `numeric`, `int`, `float`, or similar), else False +- **is_integer()**: Returns True if the column is an integer (eg. `int`, `bigint`, `serial` or similar), else False - **is_float()**: Returns True if the column is a float type (eg. `float`, `float64`, or similar), else False - **string_size()**: Returns the width of the column if it is a string type, else, an exception is raised @@ -134,6 +132,9 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4) -- Return true if the column is a number {{ string_column.is_number() }} +-- Return true if the column is an integer +{{ string_column.is_integer() }} + -- Return true if the column is a float {{ string_column.is_float() }} @@ -149,6 +150,9 @@ col.numeric_type('numeric', 12, 4) # numeric(12,4) -- Return true if the column is a number {{ numeric_column.is_number() }} +-- Return true if the column is an integer +{{ numeric_column.is_integer() }} + -- Return true if the column is a float {{ numeric_column.is_float() }} @@ -184,12 +188,6 @@ will be expanded to: ## Result objects - - -* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification. - - - The execution of a resource in dbt generates a `Result` object. This object contains information about the executed node, timing, status, and metadata returned by the adapter. At the end of an invocation, dbt records these objects in [`run_results.json`](/reference/artifacts/run-results-json). - `node`: Full object representation of the dbt resource (model, seed, snapshot, test) executed, including its `unique_id` @@ -197,7 +195,6 @@ The execution of a resource in dbt generates a `Result` object. This object cont - `thread_id`: Which thread executed this node? E.g. `Thread-1` - `execution_time`: Total time spent executing this node, measured in seconds. - `timing`: Array that breaks down execution time into steps (often `compile` + `execute`) -- `adapter_response`: Dictionary of metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc. - `message`: How dbt will report this result on the CLI, based on information returned from the database import RowsAffected from '/snippets/_run-result.md'; diff --git a/website/docs/reference/dbt-commands.md b/website/docs/reference/dbt-commands.md index 5b37f13a3fb..862829ef809 100644 --- a/website/docs/reference/dbt-commands.md +++ b/website/docs/reference/dbt-commands.md @@ -3,16 +3,45 @@ title: "dbt Command reference" --- dbt is typically run one of two ways: + * In [dbt Cloud](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) * On the [command line interface](/docs/core/about-the-cli) (CLI) -The following sections outline the commands supported by dbt and their relevant flags. Note that some commands are only supported when using the CLI. - -For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax). +The following sections outline the commands supported by dbt and their relevant flags. For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax). ### Available commands -Select the tabs that are relevant to the your development workflow. 
For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**. + + +Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [CLI](/docs/core/about-the-cli). Use the `dbt` prefix. For example, to run the `test` command, type `dbt test`. + +| Command | Description | Version | +| ------- | ----------- | ------- | +| [build](/reference/commands/build) | Build and test all selected resources (models, seeds, snapshots, tests) | All [supported versions](/docs/dbt-versions/core) | +| [clean](/reference/commands/clean) | Deletes artifacts present in the dbt project | All [supported versions](/docs/dbt-versions/core) | +| [clone](/reference/commands/clone) | Clone selected models from the specified state | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [compile](/reference/commands/compile) | Compiles (but does not run) the models in a project | All [supported versions](/docs/dbt-versions/core) | +| [debug](/reference/commands/debug) | Debugs dbt connections and projects | All [supported versions](/docs/dbt-versions/core) | +| [deps](/reference/commands/deps) | Downloads dependencies for a project | All [supported versions](/docs/dbt-versions/core) | +| [docs](/reference/commands/cmd-docs) | Generates documentation for a project | All [supported versions](/docs/dbt-versions/core) | +| [list](/reference/commands/list) | Lists resources defined in a dbt project | All [supported versions](/docs/dbt-versions/core) | +| [parse](/reference/commands/parse) | Parses a project and writes detailed timing info | All [supported versions](/docs/dbt-versions/core) | +| [retry](/reference/commands/retry) | Retry the last run `dbt` command from the point of failure | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [run](/reference/commands/run) | Runs the models in a project | All [supported versions](/docs/dbt-versions/core) | +| [run-operation](/reference/commands/run-operation) | Invoke a macro, including running arbitrary maintenance SQL against
      the database | All [supported versions](/docs/dbt-versions/core) | +| [seed](/reference/commands/seed) | Loads CSV files into the database | All [supported versions](/docs/dbt-versions/core) | +| [show](/reference/commands/show) | Preview table rows post-transformation | All [supported versions](/docs/dbt-versions/core) | +| [snapshot](/reference/commands/snapshot) | Executes "snapshot" jobs defined in a project | All [supported versions](/docs/dbt-versions/core) | +| [source](/reference/commands/source) | Provides tools for working with source data (including validating that
      sources are "fresh") | All [supported versions](/docs/dbt-versions/core) | +| [test](/reference/commands/test) | Executes tests defined in a project | All [supported versions](/docs/dbt-versions/core) | +| [init](/reference/commands/init) | Initializes a new dbt project (CLI only) | All [supported versions](/docs/dbt-versions/core) | + + +
      + + + +Select the tabs that are relevant to your development workflow. For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**. @@ -24,7 +53,7 @@ Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/ - [compile](/reference/commands/compile): compiles (but does not run) the models in a project - [deps](/reference/commands/deps): downloads dependencies for a project - [docs](/reference/commands/cmd-docs) : generates documentation for a project -- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or higher) +- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or later) - [run](/reference/commands/run): runs the models in a project - [run-operation](/reference/commands/run-operation): invoke a macro, including running arbitrary maintenance SQL against the database - [seed](/reference/commands/seed): loads CSV files into the database @@ -62,27 +91,4 @@ Use the following dbt commands in the [CLI](/docs/core/about-the-cli) and use th - - - + diff --git a/website/docs/reference/dbt-jinja-functions/as_bool.md b/website/docs/reference/dbt-jinja-functions/as_bool.md index e0700032212..d4c2bbf1743 100644 --- a/website/docs/reference/dbt-jinja-functions/as_bool.md +++ b/website/docs/reference/dbt-jinja-functions/as_bool.md @@ -24,10 +24,3 @@ models: ```
      - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_bool` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_native.md b/website/docs/reference/dbt-jinja-functions/as_native.md index fca25249dca..1de9ad45bf9 100644 --- a/website/docs/reference/dbt-jinja-functions/as_native.md +++ b/website/docs/reference/dbt-jinja-functions/as_native.md @@ -16,10 +16,3 @@ and [`as_number`](/reference/dbt-jinja-functions/as_number) instead. Unlike `as_bool` and `as_number`, `as_native` will return a rendered value regardless of the input type. Ensure that your inputs match expectations. ::: - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_native` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_number.md b/website/docs/reference/dbt-jinja-functions/as_number.md index 057d7ec8d20..29b35094880 100644 --- a/website/docs/reference/dbt-jinja-functions/as_number.md +++ b/website/docs/reference/dbt-jinja-functions/as_number.md @@ -25,10 +25,3 @@ my_profile: ```
      - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_number` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_text.md b/website/docs/reference/dbt-jinja-functions/as_text.md index 5e19e5bc9bc..6b26cfa327d 100644 --- a/website/docs/reference/dbt-jinja-functions/as_text.md +++ b/website/docs/reference/dbt-jinja-functions/as_text.md @@ -56,12 +56,3 @@ models: ``` - - - -* `v0.17.0`: Native rendering is enabled by default. The `as_text` filter was -introduced. -* `v0.17.1`: Native rendering is disabled by default. The `as_text` filter works -as before, with no functional effect. - - diff --git a/website/docs/reference/dbt-jinja-functions/builtins.md b/website/docs/reference/dbt-jinja-functions/builtins.md index 40848705dc4..edc5f34ffda 100644 --- a/website/docs/reference/dbt-jinja-functions/builtins.md +++ b/website/docs/reference/dbt-jinja-functions/builtins.md @@ -1,10 +1,11 @@ --- -title: "About builtins Jinja function" +title: "About builtins Jinja variable" sidebar_label: "builtins" id: "builtins" -description: "Read this guide to understand the builtins Jinja function in dbt." +description: "Read this guide to understand the builtins Jinja variable in dbt." --- + The `builtins` variable exists to provide references to builtin dbt context methods. This allows macros to be created with names that _mask_ dbt builtin context methods, while still making those methods accessible in the dbt compilation context. The `builtins` variable is a dictionary containing the following keys: @@ -15,9 +16,51 @@ The `builtins` variable is a dictionary containing the following keys: ## Usage -The following macro overrides the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overriden to `dev`. +:::important + +Using the `builtins` variable in this way is an advanced development workflow. Users should be ready to maintain and update these overrides when upgrading in the future. +::: + + + +From dbt v1.5 and higher, use the following macro to extract user-provided arguments, including version, and call the builtins.ref() function with either a single modelname argument or both packagename and modelname arguments, based on the number of positional arguments in varargs: + +

      + ``` +{% macro ref() %} +-- extract user-provided positional and keyword arguments + {% set version = kwargs.get('version') %} + {% set packagename = none %} + {%- if (varargs | length) == 1 -%} + {% set modelname = varargs[0] %} +{%- else -%} + {% set packagename = varargs[0] %} + {% set modelname = varargs[1] %} +{% endif %} +-- call builtins.ref based on provided positional arguments +{% set rel = None %} +{% if packagename is not none %} + {% set rel = return(builtins.ref(packagename, modelname, version=version)) %} +{% else %} + {% set rel = return(builtins.ref(modelname, version=version)) %} +{% endif %} + +-- finally, override the database name with "dev" +{% set newrel = rel.replace_path(database="dev") %} +{% do return(newrel) %} + +{% endmacro %} +``` +
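With an override like this in place, ordinary `ref()` calls in models resolve through the custom macro instead of the builtin. A brief sketch (the model and schema names are hypothetical):

```sql
-- models/my_model.sql (hypothetical)
select * from {{ ref('some_model') }}

-- compiles with the database swapped to "dev", for example:
-- select * from dev.analytics.some_model
```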
      + + + +From dbt v1.4 and lower, use the following macro to override the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overriden to `dev`: + +``` + {% macro ref(model_name) %} {% set rel = builtins.ref(model_name) %} @@ -26,6 +69,7 @@ The following macro overrides the `ref` method available in the model compilatio {% endmacro %} ``` + The ref macro can also be used to control which elements of the model path are rendered when run, for example the following macro overrides the `ref` method to render only the schema and object identifier, but not the database reference i.e. `my_schema.my_model` rather than `my_database.my_schema.my_model`. This is especially useful when using snowflake as a warehouse, if you intend to change the name of the database post-build and wish the references to remain accurate. diff --git a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md index e0701e5d091..0d377d29cef 100644 --- a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md +++ b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md @@ -1,22 +1,23 @@ --- -title: " About dbt_project.yml context variables" +title: " About dbt_project.yml context" sidebar_label: "dbt_project.yml context" id: "dbt-project-yml-context" -description: "The context variables and methods are available when configuring resources in the dbt_project.yml file." +description: "The context methods and variables available when configuring resources in the dbt_project.yml file." --- -The following context variables and methods are available when configuring +The following context methods and variables are available when configuring resources in the `dbt_project.yml` file. This applies to the `models:`, `seeds:`, and `snapshots:` keys in the `dbt_project.yml` file. +**Available context methods:** +- [env_var](/reference/dbt-jinja-functions/env_var) +- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) + **Available context variables:** - [target](/reference/dbt-jinja-functions/target) -- [env_var](/reference/dbt-jinja-functions/env_var) -- [vars](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) - [builtins](/reference/dbt-jinja-functions/builtins) - [dbt_version](/reference/dbt-jinja-functions/dbt_version) - ### Example configuration diff --git a/website/docs/reference/dbt-jinja-functions/dispatch.md b/website/docs/reference/dbt-jinja-functions/dispatch.md index a165ae59eb0..5dff787219f 100644 --- a/website/docs/reference/dbt-jinja-functions/dispatch.md +++ b/website/docs/reference/dbt-jinja-functions/dispatch.md @@ -5,12 +5,6 @@ id: "dispatch" description: "dbt extends functionality across data platforms using multiple dispatch." --- - - -- **v1.0.0:** The 'packages' argument is fully deprecated. Use `macro_namespace` and project-level `dispatch` config instead. - - - dbt can extend functionality across [Supported Data Platforms](/docs/supported-data-platforms) through a system of [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch). Because SQL syntax, data types, and / support vary across adapters, dbt can define and call generic functional macros, and then "dispatch" that macro to the appropriate implementation for the current adapter. 
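For example, a generic macro can route to adapter-specific implementations like this (a minimal sketch — the `concat` macro and the adapter implementations shown are illustrative, not part of this page):

```sql
{# generic entry point: dispatch resolves to an adapter-specific implementation #}
{% macro concat(fields) -%}
    {{ return(adapter.dispatch('concat')(fields)) }}
{%- endmacro %}

{# fallback used when no adapter-specific implementation exists #}
{% macro default__concat(fields) -%}
    concat({{ fields | join(', ') }})
{%- endmacro %}

{# picked automatically when the active adapter is Redshift #}
{% macro redshift__concat(fields) -%}
    {{ fields | join(' || ') }}
{%- endmacro %}
```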
## Syntax diff --git a/website/docs/reference/dbt-jinja-functions/env_var.md b/website/docs/reference/dbt-jinja-functions/env_var.md index a5e9df82415..f4cc05cec0f 100644 --- a/website/docs/reference/dbt-jinja-functions/env_var.md +++ b/website/docs/reference/dbt-jinja-functions/env_var.md @@ -58,12 +58,6 @@ models: ### Secrets - - - **v1.0.0:** Restricted use of secret env vars to `profiles.yml` and `packages.yml` - - - For certain configurations, you can use "secret" env vars. Any env var named with the prefix `DBT_ENV_SECRET_` will be: - Available for use in `profiles.yml` + `packages.yml`, via the same `env_var()` function - Disallowed everywhere else, including `dbt_project.yml` and model SQL, to prevent accidentally writing these secret values to the or metadata artifacts @@ -82,12 +76,6 @@ host: "www.{{ env_var('DBT_ENV_SECRET_HOST_DOMAIN') }}.com/{{ env_var('DBT_ENV_S ### Custom metadata - - - - **v0.19.0:** Introduced `DBT_ENV_CUSTOM_ENV_` prefix and artifact `metadata.env` - - - Any env var named with the prefix `DBT_ENV_CUSTOM_ENV_` will be included in two places, with its prefix-stripped name as the key: - [dbt artifacts](/reference/artifacts/dbt-artifacts#common-metadata): `metadata` -> `env` - [events and structured logs](/reference/events-logging#info-fields): `info` -> `extra` diff --git a/website/docs/reference/dbt-jinja-functions/graph.md b/website/docs/reference/dbt-jinja-functions/graph.md index 3b3b4d1cb88..491b7836f45 100644 --- a/website/docs/reference/dbt-jinja-functions/graph.md +++ b/website/docs/reference/dbt-jinja-functions/graph.md @@ -99,7 +99,7 @@ representations of those nodes. A simplified example might look like: }, "exposures": { "exposure.my_project.traffic_dashboard": { - "unique_id": "source.my_project.traffic_dashboard", + "unique_id": "exposure.my_project.traffic_dashboard", "type": "dashboard", "maturity": "high", "path": "models/path/to/schema.yml", diff --git a/website/docs/reference/dbt-jinja-functions/log.md b/website/docs/reference/dbt-jinja-functions/log.md index ec4533ea621..30e68f8c21d 100644 --- a/website/docs/reference/dbt-jinja-functions/log.md +++ b/website/docs/reference/dbt-jinja-functions/log.md @@ -12,7 +12,34 @@ __Args__: Logs a line to either the log file or stdout. -([Source on GitHub](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/context/base.py#L432)) +
      + Code source Refer to the `dbt-core` source on GitHub, or use the following code as a reference:

      + +```python + def log(msg: str, info: bool = False) -> str: + """Logs a line to either the log file or stdout. + + :param msg: The message to log + :param info: If `False`, write to the log file. If `True`, write to + both the log file and stdout. + + > macros/my_log_macro.sql + + {% macro some_macro(arg1, arg2) %} + {{ log("Running some_macro: " ~ arg1 ~ ", " ~ arg2) }} + {% endmacro %}" + """ + if info: + fire_event(JinjaLogInfo(msg=msg, node_info=get_node_info())) + else: + fire_event(JinjaLogDebug(msg=msg, node_info=get_node_info())) + return "" +``` + + + +
      ```sql diff --git a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md index ff0f7c1ef33..32cd8ca10ff 100644 --- a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md +++ b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md @@ -100,12 +100,6 @@ on-run-end: ## Results - - -* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification. - - - The `results` variable contains a list of [Result objects](/reference/dbt-classes#result-objects) with one element per resource that executed in the dbt job. The Result object provides access within the Jinja on-run-end context to the information that will populate the [run results JSON artifact](/reference/artifacts/run-results-json). Example usage: diff --git a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md index 037a129476e..2a6390c3d12 100644 --- a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md +++ b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md @@ -1,16 +1,16 @@ --- -title: "About profiles.yml context variable" +title: "About profiles.yml context" sidebar_label: "profiles.yml context" id: "profiles-yml-context" -description: "Use these context variables to configure resources in `profiles.yml` file." +description: "Use these context methods to configure resources in `profiles.yml` file." --- -The following context variables and methods are available when configuring +The following context methods are available when configuring resources in the `profiles.yml` file. -**Available context variables:** +**Available context methods:** - [env_var](/reference/dbt-jinja-functions/env_var) -- [vars](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) +- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) ### Example usage diff --git a/website/docs/reference/dbt-jinja-functions/project_name.md b/website/docs/reference/dbt-jinja-functions/project_name.md index 38717aa16c3..7f76c5a4800 100644 --- a/website/docs/reference/dbt-jinja-functions/project_name.md +++ b/website/docs/reference/dbt-jinja-functions/project_name.md @@ -5,8 +5,6 @@ id: "project_name" description: "Read this guide to understand the project_name Jinja function in dbt." --- -New in 0.16.0 - The `project_name` context variable returns the `name` for the root-level project which is being run by dbt. This variable can be used to defer execution to a root-level project macro if one exists. diff --git a/website/docs/reference/dbt-jinja-functions/ref.md b/website/docs/reference/dbt-jinja-functions/ref.md index c500bb934ab..fda5992e234 100644 --- a/website/docs/reference/dbt-jinja-functions/ref.md +++ b/website/docs/reference/dbt-jinja-functions/ref.md @@ -29,11 +29,8 @@ from {{ref('model_a')}} `ref()` is, under the hood, actually doing two important things. First, it is interpolating the schema into your model file to allow you to change your deployment schema via configuration. Second, it is using these references between models to automatically build the dependency graph. This will enable dbt to deploy models in the correct order when using `dbt run`. 
-:::info New in 0.9.0 - -The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes at the [{{ this }}](/reference/dbt-jinja-functions/this) variable. - -::: +The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes as the [{{ this }} variable](/reference/dbt-jinja-functions/this). + - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`. ## Advanced ref usage @@ -73,7 +70,7 @@ select * from {{ ref('model_name') }} ### Two-argument variant -There is also a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity. +You can also use a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity. When using two arguments with projects (not packages), you also need to set [cross project dependencies](/docs/collaborate/govern/project-dependencies). ```sql select * from {{ ref('project_or_package', 'model_name') }} diff --git a/website/docs/reference/dbt-jinja-functions/selected_resources.md b/website/docs/reference/dbt-jinja-functions/selected_resources.md index 80c4250b8d5..a927ec317ae 100644 --- a/website/docs/reference/dbt-jinja-functions/selected_resources.md +++ b/website/docs/reference/dbt-jinja-functions/selected_resources.md @@ -30,6 +30,8 @@ For a given run it will look like: ["model.my_project.model1", "model.my_project.model2", "snapshot.my_project.my_snapshot"] ``` +Each value corresponds to a key in the `nodes` object within the [graph](/reference/dbt-jinja-functions/graph) context variable. + It can be used in macros in a `pre-hook`, `post-hook`, `on-run-start` or `on-run-end` to evaluate what nodes are selected and trigger different logic whether a particular node is selected or not. diff --git a/website/docs/reference/dbt-jinja-functions/source.md b/website/docs/reference/dbt-jinja-functions/source.md index 2d73e79f09c..59317a79e3d 100644 --- a/website/docs/reference/dbt-jinja-functions/source.md +++ b/website/docs/reference/dbt-jinja-functions/source.md @@ -16,6 +16,7 @@ This function: - Creates dependencies between a source and the current model, which is useful for documentation and model selection - Compiles to the full object name in the database + ## Related guides - [Using sources](/docs/build/sources) diff --git a/website/docs/reference/dbt-jinja-functions/statement-blocks.md b/website/docs/reference/dbt-jinja-functions/statement-blocks.md index 1ad4f099aa3..2829ad3fe14 100644 --- a/website/docs/reference/dbt-jinja-functions/statement-blocks.md +++ b/website/docs/reference/dbt-jinja-functions/statement-blocks.md @@ -41,12 +41,6 @@ Once the statement block has executed, the result set is accessible via the `loa - `data`: Pythonic representation of data returned by query (arrays, tuples, dictionaries). - `table`: [Agate](https://agate.readthedocs.io/page/api/table.html) table representation of data returned by query. - - -* `v0.19.0`: The `response` structured object replaced a `status` string that contained similar information. 
- - - For the above statement, that could look like: diff --git a/website/docs/reference/dbt-jinja-functions/this.md b/website/docs/reference/dbt-jinja-functions/this.md index 9065c660cb0..f9f2961b08f 100644 --- a/website/docs/reference/dbt-jinja-functions/this.md +++ b/website/docs/reference/dbt-jinja-functions/this.md @@ -3,13 +3,18 @@ title: "about this" sidebar_label: "this" id: "this" description: "Represents the current model in the database." +keywords: + - relation, relation object, this function, this jinja, this.database, this.schema, this.identifier +meta: + label: 'this' --- `this` is the database representation of the current model. It is useful when: - Defining a `where` statement within [incremental models](/docs/build/incremental-models) - Using [pre or post hooks](/reference/resource-configs/pre-hook-post-hook) -`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected. +`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected. + - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`. `this` can be thought of as equivalent to `ref('')`, and is a neat way to avoid circular dependencies. @@ -17,24 +22,6 @@ description: "Represents the current model in the database." - - -### Grant permissions on a model in a post-hook - - - -```yaml -models: - project-name: - +post-hook: - - "grant select on {{ this }} to db_reader" -``` - - - - - - ### Configuring incremental models @@ -54,3 +41,7 @@ from raw_app_data.events ``` + + + + \ No newline at end of file diff --git a/website/docs/reference/dbt_project.yml.md b/website/docs/reference/dbt_project.yml.md index 59541a81256..c706b57a73b 100644 --- a/website/docs/reference/dbt_project.yml.md +++ b/website/docs/reference/dbt_project.yml.md @@ -1,8 +1,3 @@ - - -- **v1.0.0:** The default config name for `data-paths` is now [`seed-paths`](/reference/project-configs/seed-paths), `source-paths` is now [`model-paths`](/reference/project-configs/model-paths) and `modules-path` is now [`packages-install-path`](/reference/project-configs/packages-install-path). - - Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate on your project. diff --git a/website/docs/reference/global-configs/cache.md b/website/docs/reference/global-configs/cache.md index db4eabd14b7..a605e1e70f3 100644 --- a/website/docs/reference/global-configs/cache.md +++ b/website/docs/reference/global-configs/cache.md @@ -17,7 +17,7 @@ There are two ways to optionally modify this behavior: For example, to quickly compile a model that requires no database metadata or introspective queries: ```text -dbt --skip-populate-cache compile --select my_model_name +dbt --no-populate-cache compile --select my_model_name ``` @@ -31,7 +31,7 @@ dbt --cache-selected-only run --select salesforce
      - + ### Cache database objects for selected resource @@ -63,4 +63,4 @@ config: - \ No newline at end of file + diff --git a/website/docs/reference/global-configs/logs.md b/website/docs/reference/global-configs/logs.md index f5f1b3f814b..8c819193fc6 100644 --- a/website/docs/reference/global-configs/logs.md +++ b/website/docs/reference/global-configs/logs.md @@ -14,6 +14,9 @@ The `LOG_FORMAT` config specifies how dbt's logs should be formatted. If the val dbt --log-format json run {"code": "A001", "data": {"v": "=1.0.0"}, "invocation_id": "1193e449-4b7a-4eb1-8e8e-047a8b3b7973", "level": "info", "log_version": 1, "msg": "Running with dbt=1.0.0", "node_info": {}, "pid": 35098, "thread_name": "MainThread", "ts": "2021-12-03T10:46:59.928217Z", "type": "log_line"} ``` + + + To set the `LOG_FORMAT_FILE` type output for the file without impacting the console log format, use the `log-format-file` flag. @@ -37,8 +40,6 @@ See [structured logging](/reference/events-logging#structured-logging) for more ::: - - ### Log Level @@ -124,7 +125,16 @@ dbt --quiet run ### Color -You can set the color preferences for the file logs only using the `--use-colors-file / --no-use-colors-file` flags. +You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags. + + + +```yaml +config: + use_colors_file: False +``` + + ```text dbt --use-colors-file run diff --git a/website/docs/reference/global-configs/print-output.md b/website/docs/reference/global-configs/print-output.md index 83280677229..112b92b546f 100644 --- a/website/docs/reference/global-configs/print-output.md +++ b/website/docs/reference/global-configs/print-output.md @@ -74,13 +74,24 @@ config: use_colors: False ``` + + ```text dbt --use-colors run dbt --no-use-colors run ``` -You can set the color preferences for the file logs only using the `--use-colors-file / --no-use-colors-file` flags. +You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags. + + + +```yaml +config: + use_colors_file: False +``` + + ```text dbt --use-colors-file run @@ -88,5 +99,3 @@ dbt --no-use-colors-file run ``` - - \ No newline at end of file diff --git a/website/docs/reference/global-configs/usage-stats.md b/website/docs/reference/global-configs/usage-stats.md index ea02fe0bb59..1f9492f4a43 100644 --- a/website/docs/reference/global-configs/usage-stats.md +++ b/website/docs/reference/global-configs/usage-stats.md @@ -8,14 +8,14 @@ We want to build the best version of dbt possible, and a crucial part of that is Usage statistics are fired when dbt is invoked and when models are run. These events contain basic platform information (OS + python version) and metadata such as whether the invocation succeeded, how long it took, an anonymized hash key representing the raw model content, and number of nodes that were run. You can see all the event definitions in [`tracking.py`](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/tracking.py). -By default this is turned on – you can opt out of event tracking at any time by adding the following to your `profiles.yml` file: +By default, this is enabled. dbt Core users can opt out of event tracking at any time by adding the following to your `profiles.yml` file: ```yaml config: send_anonymous_usage_stats: False ``` -You can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. 
For more information, see [Environment variables](/docs/build/environment-variables). +dbt Core users can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables). `DO_NOT_TRACK=1` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=False` `DO_NOT_TRACK=0` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=True` diff --git a/website/docs/reference/macro-properties.md b/website/docs/reference/macro-properties.md index 91ba52de9ca..9919835f3c5 100644 --- a/website/docs/reference/macro-properties.md +++ b/website/docs/reference/macro-properties.md @@ -27,9 +27,3 @@ macros: ``` - - - -* `v0.16.0`: The ability to declare macro properties was introduced. - - diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md index 6079e53793a..e13a4f6648a 100644 --- a/website/docs/reference/node-selection/defer.md +++ b/website/docs/reference/node-selection/defer.md @@ -2,13 +2,6 @@ title: "Defer" --- - - -- **v0.18.0**: Introduced `--defer` and `--state` flags as beta features. -- **v0.19.0**: Changed `--defer` to use the current environment's resource, if it exists, and only fall back to the other environment's resource if the first does not. Also added support for `dbt test --defer`. - - - Defer is a powerful feature that makes it possible to run a subset of models or tests in a [sandbox environment](/docs/environments-in-dbt) without having to first build their upstream parents. This can save time and computational resources when you want to test a small number of models in a large project. Defer requires that a manifest from a previous dbt invocation be passed to the `--state` flag or env var. Together with the `state:` selection method, these features enable "Slim CI". Read more about [state](/reference/node-selection/syntax#about-node-selection). diff --git a/website/docs/reference/node-selection/graph-operators.md b/website/docs/reference/node-selection/graph-operators.md index 1e7c88fadfc..4fdc2f10628 100644 --- a/website/docs/reference/node-selection/graph-operators.md +++ b/website/docs/reference/node-selection/graph-operators.md @@ -34,12 +34,3 @@ The `@` operator is similar to `+`, but will also include _the parents of the ch ```bash $ dbt run --models @my_model # select my_model, its children, and the parents of its children ``` - -### The "star" operator -The `*` operator matches all models within a package or directory. - - - ```bash - $ dbt run --select snowplow.* # run all of the models in the snowplow package - $ dbt run --select finance.base.* # run all of the models in models/finance/base - ``` diff --git a/website/docs/reference/node-selection/methods.md b/website/docs/reference/node-selection/methods.md index ff86d60c06a..2647f3416a3 100644 --- a/website/docs/reference/node-selection/methods.md +++ b/website/docs/reference/node-selection/methods.md @@ -47,13 +47,19 @@ The `source` method is used to select models that select from a specified [sourc ``` ### The "resource_type" method -Use the `resource_type` method to select nodes of a particular type (`model`, `source`, `exposure`, etc). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list). +Use the `resource_type` method to select nodes of a particular type (`model`, `test`, `exposure`, and so on). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list). 
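As a quick illustration of the defer-plus-state workflow described in `defer.md` above, a hedged sketch follows; the artifacts path is illustrative.

```shell
# build only modified models and their children, deferring references to
# unbuilt parents to the relations recorded in the production manifest
dbt run --select state:modified+ --defer --state path/to/prod/artifacts
```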
```bash $ dbt build --select resource_type:exposure # build all resources upstream of exposures $ dbt list --select resource_type:test # list all tests in your project ``` +Note: This method doesn't work for sources, so use the [`--resource-type`](/reference/commands/list) option of the list command instead: + + ```bash + $ dbt list --resource-type source + ``` + ### The "path" method The `path` method is used to select models/sources defined at or under a specific path. Model definitions are in SQL/Python files (not YAML), and source definitions are in YAML files. @@ -74,19 +80,28 @@ selectors unambiguous. -### The "file" or "fqn" method -The `file` or `fqn` method can be used to select a model by its filename, including the file extension (`.sql`). +### The "file" method +The `file` method can be used to select a model by its filename, including the file extension (`.sql`). ```bash # These are equivalent dbt run --select file:some_model.sql dbt run --select some_model.sql dbt run --select some_model -dbt run --select fqn:some_model # fqn is an abbreviation for "fully qualified name" ``` +### The "fqn" method + +The `fqn` method is used to select nodes based off their "fully qualified names" (FQN) within the dbt graph. The default output of [`dbt list`](/reference/commands/list) is a listing of FQN. + +``` +dbt run --select fqn:some_model +dbt run --select fqn:your_project.some_model +dbt run --select fqn:some_package.some_other_model +``` + ### The "package" method The `package` method is used to select models defined within the root project @@ -142,9 +157,6 @@ $ dbt ls -s config.transient:true ### The "test_type" method - -In v1.0.0, test types were renamed: "singular" (instead of "data") and "generic" (instead of "schema") - The `test_type` method is used to select tests based on their type, `singular` or `generic`: @@ -230,7 +242,6 @@ The `exposure` method is used to select parent resources of a specified [exposur ``` ### The "metric" method -New in v1.0.0 The `metric` method is used to select parent resources of a specified [metric](/docs/build/metrics). Use in conjunction with the `+` operator. @@ -240,7 +251,6 @@ $ dbt ls --select +metric:* --resource-type source # list all source tables ``` ### The "result" method -New in v1.0.0 The `result` method is related to the `state` method described above and can be used to select resources based on their result status from a prior run. Note that one of the dbt commands [`run`, `test`, `build`, `seed`] must have been performed in order to create the result on which a result selector operates. You can use `result` selectors in conjunction with the `+` operator. @@ -252,13 +262,6 @@ $ dbt seed --select result:error --state path/to/artifacts # run all seeds that ``` ### The "source_status" method - - -Supported in v1.1 or newer. - - - - Supported in v1.1 or higher. 
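To complement the "test_type" method described earlier in `methods.md`, a short sketch of selecting tests by type:

```bash
$ dbt test --select test_type:generic    # run only generic tests
$ dbt test --select test_type:singular   # run only singular tests
```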
@@ -290,9 +293,6 @@ $ dbt build --select source_status:fresher+ --state path/to/prod/artifacts - - - ### The "group" method diff --git a/website/docs/reference/node-selection/state-comparison-caveats.md b/website/docs/reference/node-selection/state-comparison-caveats.md index 6ae156fddcf..baeeb7e4c75 100644 --- a/website/docs/reference/node-selection/state-comparison-caveats.md +++ b/website/docs/reference/node-selection/state-comparison-caveats.md @@ -60,13 +60,6 @@ dbt will do its best to capture *only* changes that are the result of modificati - iterative improvements to dbt's built-in detective abilities - better options for more complex projects, in the form of more-specific subselectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704)) - - -- v0.18.0: All env-aware logic results in false positives during state comparison, when comparing against a manifest generated with a different target. -- v0.19.0: dbt stores and compares unrendered Jinja expressions for configurations, allowing it to see past env-aware logic in `dbt_project.yml`. - - - State comparison is now able to detect env-aware config in `dbt_project.yml`. For instance, this target-based config would register as a modification in v0.18.0, but in v0.19.0 it no longer will: diff --git a/website/docs/reference/node-selection/syntax.md b/website/docs/reference/node-selection/syntax.md index 1a43a32e2bc..7c165b0f4ff 100644 --- a/website/docs/reference/node-selection/syntax.md +++ b/website/docs/reference/node-selection/syntax.md @@ -174,14 +174,6 @@ $ dbt run --select result:+ state:modified+ --defer --state ./ - -Only supported by v1.1 or newer. - - - - - Only supported by v1.1 or newer. When a job is selected, dbt Cloud will surface the artifacts from that job's most recent successful run. dbt will then use those artifacts to determine the set of fresh sources. In your job commands, you can signal to dbt to run and test only on these fresher sources and their children by including the `source_status:fresher+` argument. This requires both previous and current state to have the `sources.json` artifact be available. Or plainly said, both job states need to run `dbt source freshness`. @@ -194,18 +186,10 @@ dbt source freshness dbt build --select source_status:fresher+ ``` - For more example commands, refer to [Pro-tips for workflows](/guides/legacy/best-practices.md#pro-tips-for-workflows). ### The "source_status" status - - -Only supported by v1.1 or newer. - - - - Only supported by v1.1 or newer. @@ -221,4 +205,3 @@ After issuing one of the above commands, you can reference the source freshness $ dbt source freshness # must be run again to compare current to previous state $ dbt build --select source_status:fresher+ --state path/to/prod/artifacts ``` - diff --git a/website/docs/reference/node-selection/test-selection-examples.md b/website/docs/reference/node-selection/test-selection-examples.md index 6a884b2035d..52439d95d97 100644 --- a/website/docs/reference/node-selection/test-selection-examples.md +++ b/website/docs/reference/node-selection/test-selection-examples.md @@ -11,22 +11,10 @@ Like all resource types, tests can be selected **directly**, by methods and oper Unlike other resource types, tests can also be selected **indirectly**. If a selection method or operator includes a test's parent(s), the test will also be selected. [See below](#indirect-selection) for more details. - - - `v1.0.0`: Renamed the `--greedy` flag/property to `indirect_selection`, and set its default back to "eager" (pre-v0.20). 
You can achieve the "cautious" behavior introduced in v0.20 by setting the flag/property to `cautious`. - - - Test selection is powerful, and we know it can be tricky. To that end, we've included lots of examples below: ### Direct selection - - -`v1.0.0`: Renamed test types: "generic" (formerly "schema") and "singular" (formerly "data"). Removed support for the `--schema` and `--data` flags. - - - Run generic tests only: @@ -58,7 +46,7 @@ The "cautious" mode can be useful in environments when you're only building a su - + There are three modes to configure the behavior when performing indirect selection (with `eager` as the default): @@ -72,6 +60,21 @@ The "buildable" and "cautious" modes can be useful in environments when you're o + + +These are the modes to configure the behavior when performing indirect selection (with `eager` as the default): + +1. `eager` (default) - include ANY test that references the selected nodes +1. `cautious` - restrict to tests that ONLY refer to selected nodes +1. `buildable` - restrict to tests that ONLY refer to selected nodes (or their ancestors) +1. `empty` - restrict to tests that are only for the selected node and ignore all tests from the attached nodes + +Note that test exclusion is always greedy: if ANY parent is explicitly excluded, the test will be excluded as well. + +The "buildable", "cautious", and "empty" modes can be useful in environments when you're only building a subset of your DAG, and you want to avoid test failures in "eager" mode caused by unbuilt resources. (Another way to achieve this is with [deferral](/reference/node-selection/defer)). + + + @@ -109,7 +112,7 @@ $ dbt build --select orders --indirect-selection=cautious - + @@ -159,7 +162,68 @@ $ dbt build --select orders --indirect-selection=buildable - + + + + + +By default, a test will run when ANY parent is selected; we call this "eager" indirect selection. In this example, that would include any test that references orders, even if it references other models as well. + +In this mode, any test that depends on unbuilt resources will raise an error. + +```shell +$ dbt test --select orders +$ dbt build --select orders +``` + + + + + +It is possible to prevent tests from running if one or more of its parents is unselected (and therefore unbuilt); we call this "cautious" indirect selection. + +It will only include tests whose references are each within the selected nodes. + +Put another way, it will prevent tests from running if one or more of its parents is unselected. + +```shell +$ dbt test --select orders --indirect-selection=cautious +$ dbt build --select orders --indirect-selection=cautious +``` + + + + + +This mode is similarly conservative like "cautious", but is slightly more inclusive. + +It will only include tests whose references are each within the selected nodes (or their ancestors). + +This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input). + +```shell +$ dbt test --select orders --indirect-selection=buildable +$ dbt build --select orders --indirect-selection=buildable +``` + + + + + +This mode will only include tests whose references are each within the selected nodes and will ignore all tests from attached nodes. 
+ +```shell +$ dbt test --select orders --indirect-selection=empty +$ dbt build --select orders --indirect-selection=empty +``` + + + + + + + + ### Syntax examples diff --git a/website/docs/reference/node-selection/yaml-selectors.md b/website/docs/reference/node-selection/yaml-selectors.md index 95c3c791b53..78342e32779 100644 --- a/website/docs/reference/node-selection/yaml-selectors.md +++ b/website/docs/reference/node-selection/yaml-selectors.md @@ -78,7 +78,7 @@ definition: - + ```yml definition: @@ -95,7 +95,29 @@ definition: childrens_parents: true | false # @ operator - indirect_selection: eager | cautious | buildable # include all tests selected indirectly? eager by default + indirect_selection: eager | cautious | buildable # include all tests selected indirectly? eager by default +``` + + + + + +```yml +definition: + method: tag + value: nightly + + # Optional keywords map to the `+` and `@` graph operators: + + children: true | false + parents: true | false + + children_depth: 1 # if children: true, degrees to include + parents_depth: 1 # if parents: true, degrees to include + + childrens_parents: true | false # @ operator + + indirect_selection: eager | cautious | buildable | empty # include all tests selected indirectly? eager by default ``` @@ -168,7 +190,7 @@ If provided, a YAML selector's `indirect_selection` value will take precedence o - + As a general rule, dbt will indirectly select _all_ tests if they touch _any_ resource that you're selecting directly. We call this "eager" indirect selection. You can optionally switch the indirect selection mode to "cautious" or "buildable" by setting `indirect_selection` for a specific criterion: @@ -191,6 +213,32 @@ If provided, a YAML selector's `indirect_selection` value will take precedence o + + +As a general rule, dbt will indirectly select _all_ tests if they touch _any_ resource that you're selecting directly. We call this "eager" indirect selection. You can optionally switch the indirect selection mode to "cautious", "buildable", or "empty" by setting `indirect_selection` for a specific criterion: + +```yml +- union: + - method: fqn + value: model_a + indirect_selection: eager # default: will include all tests that touch model_a + - method: fqn + value: model_b + indirect_selection: cautious # will not include tests touching model_b + # if they have other unselected parents + - method: fqn + value: model_c + indirect_selection: buildable # will not include tests touching model_c + # if they have other unselected parents (unless they have an ancestor that is selected) + - method: fqn + value: model_d + indirect_selection: empty # will include tests for only the selected node and ignore all tests attached to model_d +``` + +If provided, a YAML selector's `indirect_selection` value will take precedence over the CLI flag `--indirect-selection`. Because `indirect_selection` is defined separately for _each_ selection criterion, it's possible to mix eager/cautious/buildable/empty modes within the same definition, to achieve the exact behavior that you need. Remember that you can always test out your critiera with `dbt ls --selector`. + + + See [test selection examples](/reference/node-selection/test-selection-examples) for more details about indirect selection. 
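Since `yaml-selectors.md` above suggests checking selection criteria with `dbt ls --selector`, here is a brief sketch; `nightly_models` is a hypothetical selector name.

```bash
# preview exactly which nodes a selector resolves to before using it in a job
$ dbt ls --selector nightly_models
```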
## Example diff --git a/website/docs/reference/programmatic-invocations.md b/website/docs/reference/programmatic-invocations.md index 8bd9bf84047..6afcd65c1bc 100644 --- a/website/docs/reference/programmatic-invocations.md +++ b/website/docs/reference/programmatic-invocations.md @@ -30,7 +30,7 @@ Each command returns a `dbtRunnerResult` object, which has three attributes: - `result`: If the command completed (successfully or with handled errors), its result(s). Return type varies by command. - `exception`: If the dbt invocation encountered an unhandled error and did not complete, the exception it encountered. -There is a 1:1 correspondence between [CLI exit codes](reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation: +There is a 1:1 correspondence between [CLI exit codes](/reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation: | Scenario | CLI Exit Code | `success` | `result` | `exception` | |---------------------------------------------------------------------------------------------|--------------:|-----------|-------------------|-------------| diff --git a/website/docs/reference/project-configs/asset-paths.md b/website/docs/reference/project-configs/asset-paths.md index 97204923cb9..1fb3cf9f260 100644 --- a/website/docs/reference/project-configs/asset-paths.md +++ b/website/docs/reference/project-configs/asset-paths.md @@ -15,12 +15,6 @@ asset-paths: [directorypath] ## Definition Optionally specify a custom list of directories to copy to the `target` directory as part of the `docs generate` command. This is useful for rendering images in your repository in your project documentation. - - -* `v0.18.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details. - - - ## Default By default, dbt will not copy any additional files as part of docs generate, i.e. `asset-paths: []` diff --git a/website/docs/reference/project-configs/clean-targets.md b/website/docs/reference/project-configs/clean-targets.md index 119630b00b1..9b464840723 100644 --- a/website/docs/reference/project-configs/clean-targets.md +++ b/website/docs/reference/project-configs/clean-targets.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [target_path] --- - - -- **v1.0.0:** The `modules-path` has been updated to be [`packages-install-path`](/reference/project-configs/packages-install-path). The default value has also been updated to be `dbt-packages` from `dbt-modules`. - - - ```yml diff --git a/website/docs/reference/project-configs/config-version.md b/website/docs/reference/project-configs/config-version.md index 20947c03d62..804caf1328f 100644 --- a/website/docs/reference/project-configs/config-version.md +++ b/website/docs/reference/project-configs/config-version.md @@ -20,12 +20,7 @@ config-version: 2 ## Definition Specify your `dbt_project.yml` as using the v2 structure. - - -* `v0.17.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details. -* `v1.5.0`: This configuration was made optional. - - + This configuration is optional. ## Default Without this configuration, dbt will assume your `dbt_project.yml` uses the version 1 syntax, which was deprecated in dbt v0.19.0. 
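To illustrate the `dbtRunnerResult` attributes described in `programmatic-invocations.md` above, a minimal Python sketch; the model name is illustrative.

```python
from dbt.cli.main import dbtRunner, dbtRunnerResult

# programmatic equivalent of `dbt run --select my_model`
dbt = dbtRunner()
res: dbtRunnerResult = dbt.invoke(["run", "--select", "my_model"])

if res.success:
    print(res.result)      # command completed; result type varies by command
elif res.exception:
    print(res.exception)   # dbt hit an unhandled error and did not complete
```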
diff --git a/website/docs/reference/project-configs/log-path.md b/website/docs/reference/project-configs/log-path.md index daab17c5f10..29cad35d120 100644 --- a/website/docs/reference/project-configs/log-path.md +++ b/website/docs/reference/project-configs/log-path.md @@ -47,12 +47,21 @@ The precedence order is: CLI flag > env var > `dbt_project.yml` ## Examples -### Write logs to a subdirectory named `dbt_logs` instead of `logs` +### Specify subdirectory using the project config file ```yml log-path: dbt_logs ``` - + + + +### Specify subdirectory from the command line + +```bash +dbt --log-path dbt_logs run +``` + + diff --git a/website/docs/reference/project-configs/model-paths.md b/website/docs/reference/project-configs/model-paths.md index 2129747af27..a0652432787 100644 --- a/website/docs/reference/project-configs/model-paths.md +++ b/website/docs/reference/project-configs/model-paths.md @@ -2,11 +2,6 @@ datatype: [directorypath] default_value: [models] --- - - -- **v1.0.0:** The config `source-paths` has been deprecated in favor of `model-paths`. - - diff --git a/website/docs/reference/project-configs/on-run-start-on-run-end.md b/website/docs/reference/project-configs/on-run-start-on-run-end.md index 2c5cde4c0c2..e1a3d7b761a 100644 --- a/website/docs/reference/project-configs/on-run-start-on-run-end.md +++ b/website/docs/reference/project-configs/on-run-start-on-run-end.md @@ -4,6 +4,8 @@ description: "Read this guide to understand the on-run-start and on-run-end conf datatype: sql-statement | [sql-statement] --- +import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md'; + ```yml @@ -15,14 +17,8 @@ on-run-end: sql-statement | [sql-statement] ## Definition -A SQL statement (or list of SQL statements) to be run at the start, or end, of the following commands: -- `dbt run` -- `dbt test` -- `dbt seed` -- `dbt snapshot` -- `dbt build` -- `dbt compile` -- `dbt docs generate` + +A SQL statement (or list of SQL statements) to be run at the start or end of the following commands: `on-run-start` and `on-run-end` hooks can also call macros that return SQL statements @@ -33,34 +29,6 @@ A SQL statement (or list of SQL statements) to be run at the start, or end, of t - - -### Grant privileges at the end of a run - - - -```yml -on-run-end: "grant select on all tables in schema {{ target.schema }} group transformer" - -``` - - - -### Grant multiple privileges at the end of a run - - - -```yml -on-run-end: - - "grant usage on schema {{ target.schema }} to group reporter" - - "grant select on all tables in schema {{ target.schema }} group reporter" - -``` - - - - - ### Grant privileges on all schemas that dbt uses at the end of a run This leverages the [schemas](/reference/dbt-jinja-functions/schemas) variable that is only available in an `on-run-end` hook. diff --git a/website/docs/reference/project-configs/packages-install-path.md b/website/docs/reference/project-configs/packages-install-path.md index 98142305357..157c630fd36 100644 --- a/website/docs/reference/project-configs/packages-install-path.md +++ b/website/docs/reference/project-configs/packages-install-path.md @@ -3,12 +3,6 @@ datatype: directorypath default_value: dbt_packages --- - - -- **v1.0.0:** The default config has changed from `modules-path` to `packages-install-path` with a new default value of `dbt_packages`. 
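To complement the `on-run-end` section in `on-run-start-on-run-end.md` above, a hedged sketch of a hook that loops over the `schemas` variable in `dbt_project.yml`; the `reporter` group name is an assumption.

```yaml
# grant usage on every schema dbt used during the run (group name assumed)
on-run-end:
  - "{% for schema in schemas %}grant usage on schema {{ schema }} to group reporter;{% endfor %}"
```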
- - - ```yml diff --git a/website/docs/reference/project-configs/query-comment.md b/website/docs/reference/project-configs/query-comment.md index 4d72bd4fcff..b1a73605e55 100644 --- a/website/docs/reference/project-configs/query-comment.md +++ b/website/docs/reference/project-configs/query-comment.md @@ -30,14 +30,6 @@ A string to inject as a comment in each query that dbt runs against your databas The `query-comment` configuration can also call a macro that returns a string. - - -* `v0.15.0`: The `query-comment` configuration was introduced -* `v0.16.1`: Dictionary syntax introduced to allow comments to be appended -* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels - - - ## Default By default, dbt will insert a comment at the top of your query containing the information including the dbt version, profile and target names, and node ids for the resources it runs. For example: @@ -149,13 +141,6 @@ select ... ### BigQuery: include query comment items as job labels - - - -* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels - - - If `query-comment.job-label` is set to true, dbt will include the query comment items, if a dictionary, or the comment string, as job labels on the query it executes. These will be included in addition to labels specified in the [BigQuery-specific config](/reference/project-configs/query-comment#bigquery-include-query-comment-items-as-job-labels). diff --git a/website/docs/reference/project-configs/quoting.md b/website/docs/reference/project-configs/quoting.md index 92968ace1bd..821b920188c 100644 --- a/website/docs/reference/project-configs/quoting.md +++ b/website/docs/reference/project-configs/quoting.md @@ -28,13 +28,6 @@ Note that for BigQuery quoting configuration, `database` and `schema` should be ::: - - -* `v0.10.1`: This configuration was introduced with a default value of `true` for each adapter. -* `v0.11.0`: The default quoting config on Snowflake changed from `true` to `false` - - - ## Default The default values vary by database. diff --git a/website/docs/reference/project-configs/require-dbt-version.md b/website/docs/reference/project-configs/require-dbt-version.md index 892495dde45..85a502bff60 100644 --- a/website/docs/reference/project-configs/require-dbt-version.md +++ b/website/docs/reference/project-configs/require-dbt-version.md @@ -19,12 +19,6 @@ When you set this configuration, dbt sends a helpful error message for any user If this configuration is not specified, no version check will occur. - - -* `v0.13.0`: This configuration was introduced - - - :::info YAML Quoting This configuration needs to be interpolated by the YAML parser as a string. As such, you should quote the value of the configuration, taking care to avoid whitespace. For example: diff --git a/website/docs/reference/project-configs/seed-paths.md b/website/docs/reference/project-configs/seed-paths.md index 92f7c5aa91f..614bda62cd2 100644 --- a/website/docs/reference/project-configs/seed-paths.md +++ b/website/docs/reference/project-configs/seed-paths.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [data] --- - - -- **v1.0.0:** The config `data-paths` has been deprecated in favor of `seed-paths`. 
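Following the YAML-quoting note in `require-dbt-version.md` above, a minimal sketch of a quoted version range; the exact bounds are illustrative.

```yaml
# quote the range so the YAML parser reads it as a string, with no stray whitespace
require-dbt-version: ">=1.0.0,<2.0.0"
```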
- - - ```yml diff --git a/website/docs/reference/project-configs/snapshot-paths.md b/website/docs/reference/project-configs/snapshot-paths.md index a623d48b20f..81b2759609d 100644 --- a/website/docs/reference/project-configs/snapshot-paths.md +++ b/website/docs/reference/project-configs/snapshot-paths.md @@ -14,12 +14,6 @@ snapshot-paths: [directorypath] ## Definition Optionally specify a custom list of directories where [snapshots](/docs/build/snapshots) are located. Note that you cannot co-locate models and snapshots. - - -* `v0.14.0`: Snapshots were introduced - - - ## Default By default, dbt will search for snapshots in the `snapshots` directory, i.e. `snapshot-paths: ["snapshots"]` diff --git a/website/docs/reference/project-configs/target-path.md b/website/docs/reference/project-configs/target-path.md index 54458efe512..fddc5a93c5e 100644 --- a/website/docs/reference/project-configs/target-path.md +++ b/website/docs/reference/project-configs/target-path.md @@ -48,12 +48,22 @@ The precedence order is: CLI flag > env var > `dbt_project.yml` ## Examples -### Use a subdirectory named `compiled` for compiled files +### Specify subdirectory using the project config file ```yml -target-path: "compiled" +target-path: "compiled_files" ``` + + + +### Specify subdirectory from the command line + +```bash +dbt run --target-path compiled_files +``` + + \ No newline at end of file diff --git a/website/docs/reference/project-configs/test-paths.md b/website/docs/reference/project-configs/test-paths.md index e3f3cd2ccce..e3d0e0b76fa 100644 --- a/website/docs/reference/project-configs/test-paths.md +++ b/website/docs/reference/project-configs/test-paths.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [test] --- - - -* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory - - - ```yml diff --git a/website/docs/reference/project-configs/version.md b/website/docs/reference/project-configs/version.md index 4c128727445..1c947412fcd 100644 --- a/website/docs/reference/project-configs/version.md +++ b/website/docs/reference/project-configs/version.md @@ -1,8 +1,13 @@ --- datatype: version required: True +keyword: project version, project versioning, dbt project versioning --- +import VersionsCallout from '/snippets/_version-callout.md'; + + + dbt projects have two distinct types of the `version` tags. This field has a different meaning depending on its location. diff --git a/website/docs/reference/resource-configs/bigquery-configs.md b/website/docs/reference/resource-configs/bigquery-configs.md index c425fd5b94b..89a750f47bd 100644 --- a/website/docs/reference/resource-configs/bigquery-configs.md +++ b/website/docs/reference/resource-configs/bigquery-configs.md @@ -21,26 +21,6 @@ This will allow you to read and write from multiple BigQuery projects. Same for ### Partition clause - - -Before dbt v0.16.0, the `partition_by` configuration was supplied as string. While -the string specification syntax is still supported in dbt v0.16.0, it has been -deprecated and will be removed in a future release. **Note:** partitioning configs -using a range bucket *must* be supplied using the dictionary-style configuration as of -dbt v0.16.0. 
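As a sketch of the two distinct `version` tags mentioned in `version.md` above; the file placements noted in the comments reflect common convention and are assumptions here.

```yaml
# dbt_project.yml: a user-defined version describing your project
version: "1.0.0"

# models/schema.yml: declares the syntax version of a properties file
version: 2
```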
- -Example usage for versions of dbt < 0.16.0: - -```sql --- Partitioning by a timestamp field -{{ config( materialized='table', partition_by="date(created_at)" ) }} - --- Partitioning by a date field -{{ config( materialized='table', partition_by="created_date" ) }} -``` - - - BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#pruning_limiting_partitions) when partitions are filtered using literal values (so selecting partitions using a won't improve performance). The `partition_by` config can be supplied as a dictionary with the following format: @@ -61,7 +41,6 @@ The `partition_by` config can be supplied as a dictionary with the following for ``` #### Partitioning by a date or timestamp -Partitioning by hour, month or year is new in v0.19.0 When using a `datetime` or `timestamp` column to partition data, you can create partitions with a granularity of hour, day, month, or year. A `date` column supports granularity of day, month and year. Daily partitioning is the default for all column types. @@ -266,12 +245,6 @@ as ( #### Additional partition configs - - - - **v0.20.0:** Introduced `require_partition_filter` and `partition_expiration_days` - - - If your model has `partition_by` configured, you may optionally specify two additional configurations: - `require_partition_filter` (boolean): If set to `true`, anyone querying this model _must_ specify a partition filter, otherwise their query will fail. This is recommended for very large tables with obvious partitioning schemes, such as event streams grouped by day. Note that this will affect other dbt models or tests that try to select from this model, too. @@ -367,11 +340,7 @@ dbt supports the specification of BigQuery labels for the tables and BigQuery key-value pair entries for labels larger than 63 characters are truncated. **Configuring labels in a model file** @@ -489,12 +458,6 @@ strategy is selected. ### The `insert_overwrite` strategy - - - - **v0.16.0:** Introduced `insert_overwrite` incremental strategy - - - The `insert_overwrite` strategy generates a merge statement that replaces entire partitions in the destination table. **Note:** this configuration requires that the model is configured with a [Partition clause](#partition-clause). The `merge` statement that dbt generates @@ -587,12 +550,6 @@ _today_ and _yesterday_ every day that it is run. It is the fastest and cheapest way to incrementally update a table using dbt. If we wanted this to run more dynamically— let’s say, always for the past 3 days—we could leverage dbt’s baked-in [datetime macros](https://github.com/dbt-labs/dbt-core/blob/dev/octavius-catto/core/dbt/include/global_project/macros/etc/datetime.sql) and write a few of our own. - - - - **v0.19.0:** With the advent of truncated timestamp partitions in BigQuery, `timestamp`-type partitions are now treated as timestamps instead of dates for the purposes of filtering. Update `partitions_to_replace` accordingly. - - - Think of this as "full control" mode. You must ensure that expressions or literal values in the the `partitions` config have proper quoting when templated, and that they match the `partition_by.data_type` (`timestamp`, `datetime`, `date`, or `int64`). 
Otherwise, the filter in the incremental `merge` statement will raise an error. #### Dynamic partitions @@ -685,7 +642,6 @@ from {{ ref('events') }} ## Controlling table expiration -New in v0.18.0 By default, dbt-created tables never expire. You can configure certain model(s) to expire after a set number of hours by setting `hours_to_expiration`. @@ -721,8 +677,6 @@ select ... ## Authorized Views -New in v0.18.0 - If the `grant_access_to` config is specified for a model materialized as a view, dbt will grant the view model access to select from the list of datasets provided. See [BQ docs on authorized views](https://cloud.google.com/bigquery/docs/share-access-views) @@ -764,48 +718,3 @@ Views with this configuration will be able to select from objects in `project_1. The `grant_access_to` config is not thread-safe when multiple views need to be authorized for the same dataset. The initial `dbt run` operation after a new `grant_access_to` config is added should therefore be executed in a single thread. Subsequent runs using the same configuration will not attempt to re-apply existing access grants, and can make use of multiple threads. - - - -## Materialized view - -The BigQuery adapter supports [materialized views](https://cloud.google.com/bigquery/docs/materialized-views-intro) and refreshes them for every subsequent `dbt run` you execute. For more information, see [Refresh Materialized Views](https://cloud.google.com/bigquery/docs/materialized-views-manage#refresh) in the Google docs. - -Materialized views support the optional configuration `on_configuration_change` with the following values: -- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The following changes can be applied without the need to rebuild the materialized view: - - enable_refresh - - refresh_interval_minutes - - max_staleness -- `skip` — allows runs to continue while also providing a warning that the model was skipped -- `fail` — forces runs to fail if a change is detected in a materialized view - -You can create a materialized view by editing _one_ of these files: -- the SQL file for your model -- the `dbt_project.yml` configuration file - -The following examples create a materialized view: - - - -```sql -{{ - config( - materialized = 'materialized_view', - on_configuration_change = 'apply', - ) -}} -``` - - - - - - -```yaml -models: - path: - materialized: materialized_view -``` - - - diff --git a/website/docs/reference/resource-configs/contract.md b/website/docs/reference/resource-configs/contract.md index 91d87fd2716..e8ea6d82287 100644 --- a/website/docs/reference/resource-configs/contract.md +++ b/website/docs/reference/resource-configs/contract.md @@ -1,6 +1,6 @@ --- resource_types: [models] -description: "Read this guide to understand the contract configuration in dbt." +description: "When the contract configuration is enforced, dbt will ensure that your model's returned dataset exactly matches the attributes you have defined in yaml, such as name and data_type, as well as any additional constraints supported by the data platform." datatype: "{}" default_value: {contract: false} id: "contract" @@ -95,32 +95,3 @@ Imagine: - The result is a delta between the yaml-defined contract, and the actual table in the database - which means the contract is now incorrect! Why `append_new_columns`, rather than `sync_all_columns`? Because removing existing columns is a breaking change for contracted models! 
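Tying together the BigQuery partition options discussed in `bigquery-configs.md` above, a minimal sketch; the column name and values are illustrative.

```sql
{{ config(
    materialized = 'incremental',
    partition_by = {
      "field": "created_at",
      "data_type": "timestamp",
      "granularity": "day"
    },
    require_partition_filter = true,
    partition_expiration_days = 7
) }}

select * from {{ ref('events') }}
```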
- -### Detecting breaking changes - -When you use the `state:modified` selection method in Slim CI, dbt will detect changes to model contracts, and raise an error if any of those changes could be breaking for downstream consumers. - -Breaking changes include: -- Removing an existing column -- Changing the `data_type` of an existing column -- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) - -``` -Breaking Change to Contract Error in model sometable (models/sometable.sql) - While comparing to previous project state, dbt detected a breaking change to an enforced contract. - - The contract's enforcement has been disabled. - - Columns were removed: - - order_name - - Columns with data_type changes: - - order_id (number -> int) - - Consider making an additive (non-breaking) change instead, if possible. - Otherwise, create a new model version: https://docs.getdbt.com/docs/collaborate/govern/model-versions -``` - -Additive changes are **not** considered breaking: -- Adding a new column to a contracted model -- Adding new `constraints` to an existing column in a contracted model diff --git a/website/docs/reference/resource-configs/database.md b/website/docs/reference/resource-configs/database.md index b4759d8b6f3..9c63b0ca457 100644 --- a/website/docs/reference/resource-configs/database.md +++ b/website/docs/reference/resource-configs/database.md @@ -12,7 +12,7 @@ This is a work in progress document. While this configuration applies to multipl ## Definition -Optionally specify a custom database for a [model](docs/build/models) or [seed](/docs/build/seeds). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)). +Optionally specify a custom database for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)). When dbt creates a relation (/) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments` @@ -22,14 +22,8 @@ The standard behavior of dbt is: To learn more about changing the way that dbt generates a relation's `database`, read [Using Custom Databases](/docs/build/custom-databases) - - -* `v0.13.0`: Support for the `database` config is added -* `v0.16.0`: The `generate_database_name` macro was added to control how the `database` config is used by dbt - - - ## Usage + ### Load seeds into the RAW database @@ -43,4 +37,4 @@ seeds: ## Warehouse specific information * BigQuery: `project` and `database` are interchangeable -* Redshift: Cross-database queries are not possible in Redshift. As such, dbt will return a Database Error if you use this configuration. + diff --git a/website/docs/reference/resource-configs/databricks-configs.md b/website/docs/reference/resource-configs/databricks-configs.md index 41b0bfcc5ea..e57e1efc04a 100644 --- a/website/docs/reference/resource-configs/databricks-configs.md +++ b/website/docs/reference/resource-configs/databricks-configs.md @@ -7,20 +7,41 @@ id: "databricks-configs" When materializing a model as `table`, you may include several optional configs that are specific to the dbt-databricks plugin, in addition to the standard [model configs](/reference/model-configs). -| Option | Description | Required? 
| Example | -|---------|------------------------------------------------------------------------------------------------------------------------------------|-------------------------|--------------------------| -| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta`| -| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` | -| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | -| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | -| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | + + +| Option | Description | Required? | Example | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|----------------| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta` | +| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` | +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | + + + + + + +| Option | Description | Required? | Model Support | Example | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|---------------|----------------| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` | +| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` | +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | SQL, Python | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. 
| Optional | SQL | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` | + + + ## Incremental models -dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values: +dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of four values: - **`append`** (default): Insert new records without updating or overwriting any existing data. - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the with new data. If no `partition_by` is specified, overwrite the entire table with new data. - - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`; update old records, insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`, updating old records, and inserting new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + - **`replace_where`** (Delta file format only): Match records based on `incremental_predicates`, replacing all records that match the predicates from the existing table with records matching the predicates from the new data. (If no `incremental_predicates` are specified, all new data is inserted, similar to `append`.) Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. @@ -120,7 +141,7 @@ select date_day, count(*) as users -from events +from new_events group by 1 ``` @@ -247,6 +268,96 @@ merge into analytics.merge_incremental as DBT_INTERNAL_DEST
      +### The `replace_where` strategy + +The `replace_where` incremental strategy requires: +- `file_format: delta` +- Databricks Runtime 12.0 and above + +dbt will run an [atomic `replace where` statement](https://docs.databricks.com/en/delta/selective-overwrite.html#arbitrary-selective-overwrite-with-replacewhere) which selectively overwrites data matching one or more `incremental_predicates` specified as a string or array. Only rows matching the predicates will be inserted. If no `incremental_predicates` are specified, dbt will perform an atomic insert, as with `append`. + +:::caution + +`replace_where` inserts data into columns in the order provided, rather than by column name. If you reorder columns and the data is compatible with the existing schema, you may silently insert values into an unexpected column. If the incoming data is incompatible with the existing schema, you will instead receive an error. + +::: + + + + + + +```sql +{{ config( + materialized='incremental', + file_format='delta', + incremental_strategy = 'replace_where' + incremental_predicates = 'user_id >= 10000' # Never replace users with ids < 10000 +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + + + + + + + +```sql +create temporary view replace_where__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + user_id, + max(date_day) as last_seen + + from events + group by 1 + +; + +insert into analytics.replace_where_incremental + replace where user_id >= 10000 + table `replace_where__dbt_tmp` +``` + + + + + + + ## Persisting model descriptions Relation-level docs persistence is supported in dbt v0.17.0. For more @@ -280,3 +391,49 @@ snapshots: ``` + + + +## Materialized views and streaming tables + +Starting with version 1.6.0, the dbt-databricks adapter supports [materialized views](https://docs.databricks.com/en/sql/user/materialized-views.html) and [streaming tables](https://docs.databricks.com/en/sql/load-data-streaming-table.html), as alternatives to incremental tables that are powered by [Delta Live Tables](https://docs.databricks.com/en/delta-live-tables/index.html). +See [What are Delta Live Tables?](https://docs.databricks.com/en/delta-live-tables/index.html#what-are-delta-live-tables-datasets) for more information and use cases. +These features are still in preview, and the support in the dbt-databricks adapter should, for now, be considered _experimental_. +In order to adopt these materialization strategies, you will need a workspace that is enabled for Unity Catalog and serverless SQL Warehouses. + + + +```sql +{{ config( + materialized = 'materialized_view' + ) }} +``` + + + +or + + + +```sql +{{ config( + materialized = 'streaming_table' + ) }} +``` + + + +When dbt detects a pre-existing relation of one of these types, it issues a `REFRESH` [command](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-refresh-full.html). + +### Limitations + +As mentioned above, support for these materializations in the Databricks adapter is still limited. +At this time the following configuration options are not available: + +* Specifying a refresh schedule for these materializations +* Specifying `on_configuration_change` settings. 
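To show how the dbt-databricks table options listed in the table earlier in `databricks-configs.md` fit together, a minimal sketch reusing that table's example values; treat it as illustrative rather than the file's own example.

```sql
{{ config(
    materialized = 'table',
    file_format = 'delta',
    location_root = '/mnt/root',
    partition_by = 'date_day',
    clustered_by = 'country_code',
    buckets = 8
) }}

select * from {{ ref('events') }}
```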
+ +Additionally, if you change the model definition of your materialized view or streaming table, you will need to drop the materialization in your warehouse directly before running dbt again; otherwise, you will get a refresh error. + +We plan to address these limitations during the 1.7.x timeframe. + diff --git a/website/docs/reference/resource-configs/docs.md b/website/docs/reference/resource-configs/docs.md index d94b975683d..0ccd21d7504 100644 --- a/website/docs/reference/resource-configs/docs.md +++ b/website/docs/reference/resource-configs/docs.md @@ -28,6 +28,7 @@ models: - name: model_name docs: show: true | false + node_color: "black" ``` @@ -113,22 +114,16 @@ macros: ``` - +Also refer to [macro properties](/reference/macro-properties).
      ## Definition -The docs field can be used to provide documentation-specific configuration to models. The only currently supported docs attribute is shown, which controls whether or not models are shown in the auto-generated documentation website. +The docs field can be used to provide documentation-specific configuration to models. It supports the doc attribute `show`, which controls whether or not models are shown in the auto-generated documentation website. It also supports `node_color` for some node types. **Note:** hidden models will still appear in the dbt DAG visualization but will be identified as "hidden.” - - -* `v0.16.0`: This property was added - - - ## Default The default value for `show` is `true`. @@ -173,7 +168,7 @@ models: ## Custom node colors -The `docs` attribute now supports `node_color` to customize the node color in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed. +The `docs` attribute now supports `node_color` to customize the display color of some node types in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed. `node_color` hiearchy: @@ -182,7 +177,7 @@ The `docs` attribute now supports `node_color` to customize the node color in th ## Examples -Add custom node colors to models within subdirectories based on hex codes or a plain color name. +Add custom `node_colors` to models that support it within subdirectories based on hex codes or a plain color name. ![Example](../../../../website/static/img/node_color_example.png) diff --git a/website/docs/reference/resource-configs/enabled.md b/website/docs/reference/resource-configs/enabled.md index 03d1598c931..b6d0961ee60 100644 --- a/website/docs/reference/resource-configs/enabled.md +++ b/website/docs/reference/resource-configs/enabled.md @@ -150,7 +150,6 @@ sources: - @@ -170,7 +169,6 @@ sources: - diff --git a/website/docs/reference/resource-configs/grants.md b/website/docs/reference/resource-configs/grants.md index 8ef726788dc..3a65672fa5e 100644 --- a/website/docs/reference/resource-configs/grants.md +++ b/website/docs/reference/resource-configs/grants.md @@ -121,7 +121,7 @@ For example: ```yml models: - +grants: + +grants: # In this case the + is not optional, you must include it for your project to parse. select: ['user_a', 'user_b'] ``` @@ -243,6 +243,7 @@ models: - Databricks automatically enables `grants` on SQL endpoints. 
For interactive clusters, admins should enable grant functionality using these two setup steps in the Databricks documentation: - [Enable table access control for your workspace](https://docs.databricks.com/administration-guide/access-control/table-acl.html) - [Enable table access control for a cluster](https://docs.databricks.com/security/access-control/table-acls/table-acl.html) +- In order to grant `READ_METADATA` or `USAGE`, use [post-hooks](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook) diff --git a/website/docs/reference/resource-configs/invalidate_hard_deletes.md b/website/docs/reference/resource-configs/invalidate_hard_deletes.md index 3e9f13b738d..ba5b37c5d71 100644 --- a/website/docs/reference/resource-configs/invalidate_hard_deletes.md +++ b/website/docs/reference/resource-configs/invalidate_hard_deletes.md @@ -4,7 +4,6 @@ description: "Invalidate_hard_deletes - Read this in-depth guide to learn about datatype: column_name --- -New in v0.19.0 ```jinja2 diff --git a/website/docs/reference/resource-configs/materialize-configs.md b/website/docs/reference/resource-configs/materialize-configs.md index 1338647a2a6..6976aa84061 100644 --- a/website/docs/reference/resource-configs/materialize-configs.md +++ b/website/docs/reference/resource-configs/materialize-configs.md @@ -8,11 +8,9 @@ id: "materialize-configs" ### Clusters - -- **v1.2.0:** Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). +Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). - The default [cluster](https://materialize.com/docs/overview/key-concepts/#clusters) that is used to maintain materialized views or indexes can be configured in your [profile](/docs/core/connect-data-platform/profiles.yml) using the `cluster` connection parameter. To override the cluster that is used for specific models (or groups of models), use the `cluster` configuration parameter. @@ -45,11 +43,7 @@ Materialize, at its core, is a real-time database that delivers incremental view ### Indexes - - -- **v1.2.0:** Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). - - +Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). Like in any standard relational database, you can use [indexes](https://materialize.com/docs/overview/key-concepts/#indexes) to optimize query performance in Materialize. Improvements can be significant, reducing response times down to single-digit milliseconds. @@ -85,12 +79,6 @@ select ... ### Tests - - -- **v1.1.1:** Provide support for storing the results of a test query in a materialized view using the `store_failures` config. - - - If you set the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures), dbt will create a materialized view for each configured test that can keep track of failures over time. By default, test views are created in a schema suffixed with `dbt_test__audit`. To specify a custom suffix, use the `schema` config. 
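Building on the `store_failures` note in `materialize-configs.md` above, a sketch of enabling stored failures with a custom schema suffix in `dbt_project.yml`; the suffix name is illustrative, and the resulting schema name assumes default schema-name generation.

```yaml
tests:
  +store_failures: true
  +schema: test_failures   # results land in a schema suffixed with `_test_failures`
```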
diff --git a/website/docs/reference/resource-configs/meta.md b/website/docs/reference/resource-configs/meta.md index 18cc13ae969..d24c5fbaee1 100644 --- a/website/docs/reference/resource-configs/meta.md +++ b/website/docs/reference/resource-configs/meta.md @@ -4,12 +4,6 @@ datatype: "{}" default_value: {} --- - - -* `v0.21.0`: `meta` is now a config that can be set in `dbt_project.yml` and as a `config` YAML property for some resource types. It is applied hierarchically and merges on a per-key basis. - - - } + config: + meta: {} tables: - name: table_name - meta: {} + config: + meta: {} columns: - name: column_name @@ -152,7 +148,6 @@ macros: arguments: - name: argument_name - meta: {} ``` diff --git a/website/docs/reference/resource-configs/persist_docs.md b/website/docs/reference/resource-configs/persist_docs.md index 6facf3945cb..15b1e0bdb40 100644 --- a/website/docs/reference/resource-configs/persist_docs.md +++ b/website/docs/reference/resource-configs/persist_docs.md @@ -112,13 +112,6 @@ column and relation comments in the database. By default, documentation persistence is disabled, but it can be enabled for specific resources or groups of resources as needed. - - - - Support for this config on Redshift, Postgres, and Snowflake is new in 0.17.0 - - Support for column-level docs persistence is new for all databases in 0.17.0 - - - ## Support The `persist_docs` config is supported on the most widely used dbt adapters: @@ -151,12 +144,6 @@ Some known issues and limitations: - - -- Column names that must be quoted, such as column names containing special characters, will cause runtime errors if column-level `persist_docs` is enabled. This is fixed in v1.2. - - - diff --git a/website/docs/reference/resource-configs/plus-prefix.md b/website/docs/reference/resource-configs/plus-prefix.md index d8c54aa8e70..c1adbc0286a 100644 --- a/website/docs/reference/resource-configs/plus-prefix.md +++ b/website/docs/reference/resource-configs/plus-prefix.md @@ -5,7 +5,7 @@ title: Using the + prefix The `+` prefix is a dbt syntax feature, introduced in dbt v0.17.0, which helps disambiguate between [resource paths](/reference/resource-configs/resource-path) and configs in `dbt_project.yml` files. -It is only compatible with `dbt_project.yml` files that use [`config-version](/reference/project-configs/config-version): 2` +It is not compatible with `dbt_project.yml` files that use [`config-version`](/reference/project-configs/config-version) 1. For example: diff --git a/website/docs/reference/resource-configs/postgres-configs.md b/website/docs/reference/resource-configs/postgres-configs.md index eb9108ad431..97a695ee12e 100644 --- a/website/docs/reference/resource-configs/postgres-configs.md +++ b/website/docs/reference/resource-configs/postgres-configs.md @@ -8,20 +8,25 @@ id: "postgres-configs" In dbt-postgres, the following incremental materialization strategies are supported: + + - `append` (default) -- `merge` - `delete+insert` + -## Performance Optimizations + -### Unlogged +- `append` (default) +- `merge` +- `delete+insert` - + - - **v0.14.1:** Introduced native support for `unlogged` config - +## Performance optimizations + +### Unlogged "Unlogged" tables can be considerably faster than ordinary tables, as they are not written to the write-ahead log nor replicated to read replicas. They are also considerably less safe than ordinary tables. See [Postgres docs](https://www.postgresql.org/docs/current/sql-createtable.html#SQL-CREATETABLE-UNLOGGED) for details. 
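As a sketch of enabling the `persist_docs` behavior described in `persist_docs.md` above for all models, placed in `dbt_project.yml`:

```yaml
models:
  +persist_docs:
    relation: true   # persist model descriptions as relation comments
    columns: true    # persist column descriptions as column comments
```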
@@ -48,13 +53,7 @@ models: While Postgres works reasonably well for datasets smaller than about 10m rows, database tuning is sometimes required. It's important to create indexes for columns that are commonly used in joins or where clauses. - - - - **v0.20.0:** Introduced native support for `indexes` config - - - -Table models, incremental models, seeds, and snapshots may have a list of `indexes` defined. Each Postgres index can have three components: +Table models, incremental models, seeds, snapshots, and materialized views may have a list of `indexes` defined. Each Postgres index can have three components: - `columns` (list, required): one or more columns on which the index is defined - `unique` (boolean, optional): whether the index should be [declared unique](https://www.postgresql.org/docs/9.4/indexes-unique.html) - `type` (string, optional): a supported [index type](https://www.postgresql.org/docs/current/indexes-types.html) (B-tree, Hash, GIN, etc) @@ -107,45 +106,35 @@ models: -## Materialized view +## Materialized views -The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html) and refreshes them for every subsequent `dbt run` you execute. For more information, see [Refresh Materialized Views](https://www.postgresql.org/docs/15/sql-refreshmaterializedview.html) in the Postgres docs. +The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html). +Indexes are the only configuration that is specific to `dbt-postgres`. +The remaining configuration follows the general [materialized view](/docs/build/materializations#materialized-view) configuration. +There are also some limitations that we hope to address in the next version. -Materialized views support the optional configuration `on_configuration_change` with the following values: -- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The following index action can be applied without the need to rebuild the materialized view: - - Added - - Dropped - - Updated -- `skip` — allows runs to continue while also providing a warning that the model was skipped -- `fail` — forces runs to fail if a change is detected in a materialized view +### Monitored configuration changes -You can create a materialized view by editing _one_ of these files: -- the SQL file for your model -- the `dbt_project.yml` configuration file +The settings below are monitored for changes applicable to `on_configuration_change`. -The following examples create a materialized view: +#### Indexes - +Index changes (`CREATE`, `DROP`) can be applied without the need to rebuild the materialized view. +This differs from a table model, where the table needs to be dropped and re-created to update the indexes. +If the `indexes` portion of the `config` block is updated, the changes will be detected and applied +directly to the materialized view in place. -```sql -{{ - config( - materialized = 'materialized_view', - on_configuration_change = 'apply', - ) -}} -``` +### Limitations - +#### Changing materialization to and from "materialized_view" +Swapping an already materialized model to a materialized view, and vice versa, is not supported. +The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`. +Running with `--full-refresh` flag will not work to drop the existing table or view and create the materialized view (and vice versa). 
+This would only need to be done once as the existing object would then be a materialized view. - - -```yaml -models: - path: - materialized: materialized_view -``` - +For example,`my_model`, has already been materialized as a table in the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="materialized_view"`, they will get an error. +The solution is to execute `DROP TABLE my_model` on the data warehouse before trying the model again. diff --git a/website/docs/reference/resource-configs/pre-hook-post-hook.md b/website/docs/reference/resource-configs/pre-hook-post-hook.md index 1660c50049b..297d6975d6f 100644 --- a/website/docs/reference/resource-configs/pre-hook-post-hook.md +++ b/website/docs/reference/resource-configs/pre-hook-post-hook.md @@ -115,13 +115,6 @@ Pre- and post-hooks can also call macros that return SQL statements. If your mac dbt aims to provide all the boilerplate SQL you need (DDL, DML, and DCL) via out-of-the-box functionality, which you can configure quickly and concisely. In some cases, there may be SQL that you want or need to run, specific to functionality in your data platform, which dbt does not (yet) offer as a built-in feature. In those cases, you can write the exact SQL you need, using dbt's compilation context, and pass it into a `pre-` or `post-` hook to run before or after your model, seed, or snapshot. - - -* `v0.12.2`: The `post_hook` alias for config blocks was introduced. Prior to this, users needed to use the alternative config syntax to apply pre- and post-hooks. - - - - ## Examples @@ -167,69 +160,6 @@ See: [Apache Spark docs on `ANALYZE TABLE`](https://spark.apache.org/docs/latest - - -### Grant privileges on a model - - - -```yml - -models: - +post-hook: "grant select on {{ this }} to group reporter" - -``` - - - -### Grant multiple privileges on a model - - - -```yml - -models: - +post-hook: - - "grant select on {{ this }} to group reporter" - - "grant select on {{ this }} to group transformer" - -``` - - - -### Call a macro to grant privileges on a model - - - -```yml - -models: - +post-hook: "{{ grant_select(this) }}" - -``` - - - - -### Grant privileges on a directory of models - - - -```yml - -models: - jaffle_shop: # this is the project name - marts: - marketing: - # this will be applied to all models in marts/marketing/ - +post-hook: "{{ grant_select(this) }}" - -``` - - - - - ### Additional examples We've compiled some more in-depth examples [here](/docs/build/hooks-operations#additional-examples). diff --git a/website/docs/reference/resource-configs/redshift-configs.md b/website/docs/reference/resource-configs/redshift-configs.md index a0ebf7e88df..9bd127a1e1a 100644 --- a/website/docs/reference/resource-configs/redshift-configs.md +++ b/website/docs/reference/resource-configs/redshift-configs.md @@ -14,17 +14,28 @@ To-do: In dbt-redshift, the following incremental materialization strategies are supported: + + +- `append` (default) +- `delete+insert` + + + + + - `append` (default) - `merge` - `delete+insert` -All of these strategies are inheirited via from dbt-postgres. + + +All of these strategies are inherited from dbt-postgres. ## Performance optimizations ### Using sortkey and distkey -Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . 
Note that these settings will have no effect for models set to `view` or `ephemeral` models. +Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . Note that these settings will have no effect on models set to `view` or `ephemeral` models. - `dist` can have a setting of `all`, `even`, `auto`, or the name of a key. - `sort` accepts a list of sort keys, for example: `['timestamp', 'userid']`. dbt will build the sort key in the same order the fields are supplied. @@ -64,7 +75,7 @@ For more information on distkeys and sortkeys, view Amazon's docs: - [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing a Data Distribution Style](https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html) - [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing Sort Keys](https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html) -## Late Binding Views +## Late binding views Redshift supports views unbound from their dependencies, or [late binding views](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_VIEW.html#late-binding-views). This DDL option "unbinds" a view from the data it selects from. In practice, this means that if upstream views or tables are dropped with a cascade qualifier, the late-binding view does not get dropped as well. @@ -98,42 +109,51 @@ models: -## Materialized view +## Materialized views -The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html) and refreshes them for every subsequent `dbt run` that you execute. For more information, see [Refresh Materialized Views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) in the Redshift docs. +The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html). +Redshift-specific configuration includes the typical `dist`, `sort_type`, `sort`, and `backup`. +For materialized views, there is also the `auto_refresh` setting, which allows Redshift to [automatically refresh](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) the materialized view for you. +The remaining configuration follows the general [materialized view](/docs/build/materializations#Materialized-View) configuration. +There are also some limitations that we hope to address in the next version. -Materialized views support the optional configuration `on_configuration_change` with the following values: -- `apply` (default) — attempts to update the existing database object if possible, avoiding a complete rebuild. The `auto_refresh` action can applied without the need to rebuild the materialized view. -- `skip` — allows runs to continue while also providing a warning that the model was skipped -- `fail` — forces runs to fail if a change is detected in a materialized view +### Monitored configuration changes -You can create a materialized view by editing _one_ of these files: -- the SQL file for your model -- the `dbt_project.yml` configuration file +The settings below are monitored for changes applicable to `on_configuration_change`. 
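For orientation, here is a sketch of a model group that sets these monitored options from `dbt_project.yml`; the project and folder names and the specific values are assumptions, and each setting is discussed individually below:

```yaml
# dbt_project.yml (sketch)
models:
  my_project:
    marts:
      +materialized: materialized_view
      +on_configuration_change: apply
      +auto_refresh: true     # can be toggled in place with an ALTER statement
      +dist: order_id         # changing this triggers a full refresh
      +sort: [order_date]     # changing this triggers a full refresh
```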
-The following examples create a materialized view: +#### Dist - +Changes to `dist` will result in a full refresh of the existing materialized view (applied at the time of the next `dbt run` of the model). Redshift requires a materialized view to be +dropped and recreated to apply a change to the `distkey` or `diststyle`. -```sql -{{ - config( - materialized = 'materialized_view', - on_configuration_change = 'apply', - ) -}} -``` +#### Sort type, sort - +Changes to `sort_type` or `sort` will result in a full refresh. Redshift requires a materialized +view to be dropped and recreated to apply a change to the `sortkey` or `sortstyle`. +#### Backup - +Changes to `backup` will result in a full refresh. Redshift requires a materialized +view to be dropped and recreated to apply a change to the `backup` setting. -```yaml -models: - path: - materialized: materialized_view -``` - +#### Auto refresh + +The `auto_refresh` setting can be updated via an `ALTER` statement. This setting effectively toggles +automatic refreshes on or off. The default setting for this config is off (`False`). If this +is the only configuration change for the materialized view, dbt will choose to apply +an `ALTER` statement instead of issuing a full refresh, + +### Limitations + +#### Changing materialization from "materialized_view" to "table" or "view" + +Swapping a materialized view to a table or view is not supported. +You must manually drop the existing materialized view in the data warehouse prior to calling `dbt run`. +Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case. +This would only need to be done once as the existing object would then be a materialized view. + +For example, assume that a materialized view, `my_mv.sql`, has already been materialized to the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="table"`, they will get an error. +The workaround is to execute `DROP MATERIALIZED VIEW my_mv CASCADE` on the data warehouse before trying the model again. diff --git a/website/docs/reference/resource-configs/severity.md b/website/docs/reference/resource-configs/severity.md index c89c6db0716..25bab9647d6 100644 --- a/website/docs/reference/resource-configs/severity.md +++ b/website/docs/reference/resource-configs/severity.md @@ -6,14 +6,6 @@ resource_types: [tests] datatype: string --- - - -* `v0.14.0`: Introduced `severity` config -* `v0.20.0`: Introduced `error_if` + `warn_if` configs. Enabled configuration of tests from `dbt_project.yml` -* `v0.21.0`: Introduced `config` property for tests - - - Tests return a number of failures—most often, this is the count of rows returned by the test query, but it could be a [custom calculation](/reference/resource-configs/fail_calc). Generally, if the number of failures is nonzero, the test returns an error. This makes sense, as test queries are designed to return all the rows you _don't_ want: duplicate records, null values, etc. It's possible to configure tests to return warnings instead of errors, or to make the test status conditional on the number of failures returned. Maybe 1 duplicate record can count as a warning, but 10 duplicate records should count as an error. 
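A hedged sketch of that exact policy, expressed as the config of a generic `unique` test (the model and column names are assumptions):

```yaml
# models/schema.yml (sketch)
version: 2
models:
  - name: orders
    columns:
      - name: order_id
        tests:
          - unique:
              config:
                severity: error
                warn_if: ">0"    # 1-9 duplicate records: warn
                error_if: ">9"   # 10 or more duplicate records: error
```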
diff --git a/website/docs/reference/resource-configs/singlestore-configs.md b/website/docs/reference/resource-configs/singlestore-configs.md index f503779f0fc..0c93d557a8b 100644 --- a/website/docs/reference/resource-configs/singlestore-configs.md +++ b/website/docs/reference/resource-configs/singlestore-configs.md @@ -3,13 +3,6 @@ title: "SingleStore configurations" id: "singlestore-configs" --- - - - - - **v1.1.2:** Added support for for `storage_type`, `indexes`, `primary_key`, `sort_key`, `shard_key`, `unique_table_key`, `charset`, `collation` options for creating SingleStore tables. - - - ## Performance Optimizations [SingleStore Physical Database Schema Design documentation](https://docs.singlestore.com/managed-service/en/create-a-database/physical-database-schema-design/concepts-of-physical-database-schema-design.html) is helpful if you want to use specific options (that are described below) in your dbt project. diff --git a/website/docs/reference/resource-configs/snowflake-configs.md b/website/docs/reference/resource-configs/snowflake-configs.md index 42ee3635089..30c7966ec68 100644 --- a/website/docs/reference/resource-configs/snowflake-configs.md +++ b/website/docs/reference/resource-configs/snowflake-configs.md @@ -77,7 +77,7 @@ select ... ``` -In this example, you can set up a query tag to be applied to every query with the model's name. +In this example, you can set up a query tag to be applied to every query with the model's name. ```sql @@ -301,7 +301,7 @@ models: -## Temporary Tables +## Temporary tables Beginning in dbt version 1.3, incremental table merges for Snowflake prefer to utilize a `view` rather than a `temporary table`. The reasoning was to avoid the database write step that a temporary table would initiate and save compile time. @@ -341,3 +341,99 @@ In the configuration format for the model SQL file: + + + +## Dynamic tables + +The Snowflake adapter supports [dynamic tables](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table). +This materialization is specific to Snowflake, which means that any model configuration that +would normally come along for the ride from `dbt-core` (e.g. as with a `view`) may not be available +for dynamic tables. This gap will decrease in future patches and versions. +While this materialization is specific to Snowflake, it very much follows the implementation +of [materialized views](/docs/build/materializations#Materialized-View). +In particular, dynamic tables have access to the `on_configuration_change` setting. +There are also some limitations that we hope to address in the next version. + +### Parameters + +Dynamic tables in `dbt-snowflake` require the following parameters: +- `target_lag` +- `snowflake_warehouse` +- `on_configuration_change` + +To learn more about each parameter and what values it can take, see +the Snowflake docs page: [`CREATE DYNAMIC TABLE: Parameters`](https://docs.snowflake.com/en/sql-reference/sql/create-dynamic-table). 
+ +### Usage + +You can create a dynamic table by editing _one_ of these files: + +- the SQL file for your model +- the `dbt_project.yml` configuration file + +The following examples create a dynamic table: + + + +```sql +{{ config( + materialized = 'dynamic_table', + snowflake_warehouse = 'snowflake_warehouse', + target_lag = '10 minutes', +) }} +``` + + + + + +```yaml +models: + path: + materialized: dynamic_table + snowflake_warehouse: snowflake_warehouse + target_lag: '10 minutes' +``` + + + +### Monitored configuration changes + +The settings below are monitored for changes applicable to `on_configuration_change`. + +#### Target lag + +Changes to `target_lag` can be applied by running an `ALTER` statement. Refreshing is essentially +always on for dynamic tables; this setting changes how frequently the dynamic table is updated. + +#### Warehouse + +Changes to `snowflake_warehouse` can be applied via an `ALTER` statement. + +### Limitations + +#### Changing materialization to and from "dynamic_table" + +Swapping an already materialized model to a dynamic table, and vice versa, is not supported. +The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`. +Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case. +This would only need to be done once as the existing object would then be a dynamic table. + +For example, assume the model below, `my_model`, has already been materialized to the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="dynamic_table"`, they will get an error. +The workaround is to execute `DROP TABLE my_model` on the data warehouse before trying the model again. + + + +```sql + +{{ config( + materialized="table" # or any model type, e.g. view or incremental +) }} + +``` + + + + diff --git a/website/docs/reference/resource-configs/spark-configs.md b/website/docs/reference/resource-configs/spark-configs.md index 95a853107f6..ce3b317f0f1 100644 --- a/website/docs/reference/resource-configs/spark-configs.md +++ b/website/docs/reference/resource-configs/spark-configs.md @@ -29,12 +29,6 @@ When materializing a model as `table`, you may include several optional configs ## Incremental models - - - - `dbt-spark==0.19.0`: Added the `append` strategy as default for all platforms, file types, and connection methods. - - - dbt seeks to offer useful, intuitive modeling abstractions by means of its built-in configurations and materializations. Because there is so much variance between Apache Spark clusters out in the world—not to mention the powerful features offered to Databricks users by the Delta file format and custom runtime—making sense of all the available options is an undertaking in its own right. Alternatively, you can use Apache Iceberg or Apache Hudi file format with Apache Spark runtime for building incremental models. @@ -192,13 +186,6 @@ insert overwrite table analytics.spark_incremental ### The `merge` strategy - - - - `dbt-spark==0.15.3`: Introduced `merge` incremental strategy - - - - **Usage notes:** The `merge` incremental strategy requires: - `file_format: delta, iceberg or hudi` - Databricks Runtime 5.1 and above for delta file format @@ -294,12 +281,6 @@ or `show table extended in [database] like '*'`. ## Always `schema`, never `database` - - - - `dbt-spark==0.17.0` ended use of `database` in all cases. - - - Apache Spark uses the terms "schema" and "database" interchangeably. 
dbt understands `database` to exist at a higher level than `schema`. As such, you should _never_ use or set `database` as a node config or in the target profile when running dbt-spark. diff --git a/website/docs/reference/resource-configs/store_failures.md b/website/docs/reference/resource-configs/store_failures.md index 62ae33ba713..3c965179211 100644 --- a/website/docs/reference/resource-configs/store_failures.md +++ b/website/docs/reference/resource-configs/store_failures.md @@ -3,13 +3,6 @@ resource_types: [tests] datatype: boolean --- - - -* `v0.20.0`: Introduced `store_failures` config and functionality -* `v0.21.0`: Introduced `config` property for tests - - - The configured test(s) will store their failures when `dbt test --store-failures` is invoked. ## Description diff --git a/website/docs/reference/resource-configs/upsolver-configs.md b/website/docs/reference/resource-configs/upsolver-configs.md new file mode 100644 index 00000000000..b917ee2cc58 --- /dev/null +++ b/website/docs/reference/resource-configs/upsolver-configs.md @@ -0,0 +1,464 @@ +--- +title: "Upsolver configurations" +id: "upsolver-configs" +description: "Upsolver Configurations - Read this in-depth guide to learn about configurations in dbt." +--- + +## Supported Upsolver SQLake functionality + +| COMMAND | STATE | MATERIALIZED | +| ------ | ------ | ------ | +| SQL compute cluster| not supported | - | +| SQL connections| supported | connection | +| SQL copy job | supported | incremental | +| SQL merge job | supported | incremental | +| SQL insert job | supported | incremental | +| SQL materialized views | supported | materializedview | +| Expectations | supported | incremental | + +## Configs materialization + +| Config | Required | Materialization | Description | Example | +| ------ | --------- | --------------- | ---------- | ------- | +| connection_type | Yes | connection | Connection identifier: S3/GLUE_CATALOG/KINESIS | connection_type='S3' | +| connection_options | Yes | connection | Dictionary of options supported by selected connection | connection_options={ 'aws_role': 'aws_role', 'external_id': 'SAMPLES', 'read_only': True } | +| incremental_strategy | No | incremental | Define one of incremental strategies: merge/copy/insert. Default: copy | incremental_strategy='merge' | +| source | No | incremental | Define source to copy from: S3/KAFKA/KINESIS | source = 'S3' | +| target_type | No | incremental | Define target type REDSHIFT/ELASTICSEARCH/S3/SNOWFLAKE/POSTGRES. Default None for Data lake | target_type='Snowflake' | +| target_prefix | False | incremental | Define PREFIX for ELASTICSEARCH target type | target_prefix = 'orders' | +| target_location | False | incremental | Define LOCATION for S3 target type | target_location = 's3://your-bucket-name/path/to/folder/' | +| schema | Yes/No | incremental | Define target schema. Required if target_type, no table created in a metastore connection | schema = 'target_schema' | +| database | Yes/No | incremental | Define target connection. Required if target_type, no table created in a metastore connection | database = 'target_connection' | +| alias | Yes/No | incremental | Define target table. 
Required if target_type, no table created in a metastore connection | alias = 'target_table' | +| delete_condition | No | incremental | Records that match the ON condition and a delete condition can be deleted | delete_condition='nettotal > 1000' | +| partition_by | No | incremental | List of dictionaries to define partition_by for target metastore table | partition_by=[{'field':'$field_name'}] | +| primary_key | No | incremental | List of dictionaries to define partition_by for target metastore table | primary_key=[{'field':'customer_email', 'type':'string'}] | +| map_columns_by_name | No | incremental | Maps columns from the SELECT statement to the table. Boolean. Default: False | map_columns_by_name=True | +| sync | No | incremental/materializedview | Boolean option to define if job is synchronized or non-msynchronized. Default: False | sync=True | +| options | No | incremental/materializedview | Dictionary of job options | options={ 'START_FROM': 'BEGINNING', 'ADD_MISSING_COLUMNS': True } | + +## SQL connection + +Connections are used to provide Upsolver with the proper credentials to bring your data into SQLake as well as to write out your transformed data to various services. More details on ["Upsolver SQL connections"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-connections) +As a dbt model connection is a model with materialized='connection' + +```sql +{{ config( + materialized='connection', + connection_type={ 'S3' | 'GLUE_CATALOG' | 'KINESIS' | 'KAFKA'| 'SNOWFLAKE' }, + connection_options={} + ) +}} +``` + +Running this model will compile CREATE CONNECTION(or ALTER CONNECTION if exists) SQL and send it to Upsolver engine. Name of the connection will be name of the model. + +## SQL copy job + +A COPY FROM job allows you to copy your data from a given source into a table created in a metastore connection. This table then serves as your staging table and can be used with SQLake transformation jobs to write to various target locations. More details on ["Upsolver SQL copy-from"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/copy-from) + +As a dbt model copy job is model with materialized='incremental' + +```sql +{{ config( materialized='incremental', + sync=True|False, + source = 'S3'| 'KAFKA' | ... , + options={ + 'option_name': 'option_value' + }, + partition_by=[{}] + ) +}} +SELECT * FROM {{ ref() }} +``` + +Running this model will compile CREATE TABLE SQL for target type Data lake (or ALTER TABLE if exists) and CREATE COPY JOB(or ALTER COPY JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL insert job + +An INSERT job defines a query that pulls in a set of data based on the given SELECT statement and inserts it into the designated target. This query is then run periodically based on the RUN_INTERVAL defined within the job. More details on ["Upsolver SQL insert"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/insert). + +As a dbt model insert job is model with materialized='incremental' and incremental_strategy='insert' + +```sql +{{ config( materialized='incremental', + sync=True|False, + map_columns_by_name=True|False, + incremental_strategy='insert', + options={ + 'option_name': 'option_value' + }, + primary_key=[{}] + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +HAVING COUNT(DISTINCT orderid::string) ... 
+``` + +Running this model will compile CREATE TABLE SQL for target type Data lake(or ALTER TABLE if exists) and CREATE INSERT JOB(or ALTER INSERT JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL merge job + +A MERGE job defines a query that pulls in a set of data based on the given SELECT statement and inserts into, replaces, or deletes the data from the designated target based on the job definition. This query is then run periodically based on the RUN_INTERVAL defined within the job. More details on ["Upsolver SQL merge"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/merge). + +As a dbt model merge job is model with materialized='incremental' and incremental_strategy='merge' + +```sql +{{ config( materialized='incremental', + sync=True|False, + map_columns_by_name=True|False, + incremental_strategy='merge', + options={ + 'option_name': 'option_value' + }, + primary_key=[{}] + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +HAVING COUNT ... +``` + +Running this model will compile CREATE TABLE SQL for target type Data lake(or ALTER TABLE if exists) and CREATE MERGE JOB(or ALTER MERGE JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL materialized views + +When transforming your data, you may find that you need data from multiple source tables in order to achieve your desired result. +In such a case, you can create a materialized view from one SQLake table in order to join it with your other table (which in this case is considered the main table). More details on ["Upsolver SQL materialized views"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/sql-materialized-views). + +As a dbt model materialized views is model with materialized='materializedview'. + +```sql +{{ config( materialized='materializedview', + sync=True|False, + options={'option_name': 'option_value'} + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +``` + +Running this model will compile CREATE MATERIALIZED VIEW SQL(or ALTER MATERIALIZED VIEW if exists) and send it to Upsolver engine. Name of the materializedview will be name of the model. + +## Expectations/constraints + +Data quality conditions can be added to your job to drop a row or trigger a warning when a column violates a predefined condition. 
+ +```sql +WITH EXPECTATION EXPECT +ON VIOLATION WARN +``` + +Expectations can be implemented with dbt constraints +Supported constraints: check and not_null + +```yaml +models: + - name: + # required + config: + contract: + enforced: true + # model-level constraints + constraints: + - type: check + columns: ['', ''] + expression: "column1 <= column2" + name: + - type: not_null + columns: ['column1', 'column2'] + name: + + columns: + - name: + data_type: string + + # column-level constraints + constraints: + - type: not_null + - type: check + expression: "REGEXP_LIKE(, '^[0-9]{4}[a-z]{5}$')" + name: +``` + +## Projects examples + +> projects examples link: [github.com/dbt-upsolver/examples/](https://github.com/Upsolver/dbt-upsolver/tree/main/examples) + +## Connection options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| aws_role | s3 | True | True | 'aws_role': `''` | +| external_id | s3 | True | True | 'external_id': `''` | +| aws_access_key_id | s3 | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | s3 | True | True | 'aws_secret_access_key_id': `''` | +| path_display_filter | s3 | True | True | 'path_display_filter': `''` | +| path_display_filters | s3 | True | True | 'path_display_filters': (`''`, ...) | +| read_only | s3 | True | True | 'read_only': True/False | +| encryption_kms_key | s3 | True | True | 'encryption_kms_key': `''` | +| encryption_customer_managed_key | s3 | True | True | 'encryption_customer_kms_key': `''` | +| comment | s3 | True | True | 'comment': `''` | +| host | kafka | False | False | 'host': `''` | +| hosts | kafka | False | False | 'hosts': (`''`, ...) | +| consumer_properties | kafka | True | True | 'consumer_properties': `''` | +| version | kafka | False | True | 'version': `''` | +| require_static_ip | kafka | True | True | 'require_static_ip': True/False | +| ssl | kafka | True | True | 'ssl': True/False | +| topic_display_filter | kafka | True | True | 'topic_display_filter': `''` | +| topic_display_filters | kafka | True | True | 'topic_display_filter': (`''`, ...) | +| comment | kafka | True | True | 'comment': `''` | +| aws_role | glue_catalog | True | True | 'aws_role': `''` | +| external_id | glue_catalog | True | True | 'external_id': `''` | +| aws_access_key_id | glue_catalog | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | glue_catalog | True | True | 'aws_secret_access_key': `''` | +| default_storage_connection | glue_catalog | False | False | 'default_storage_connection': `''` | +| default_storage_location | glue_catalog | False | False | 'default_storage_location': `''` | +| region | glue_catalog | False | True | 'region': `''` | +| database_display_filter | glue_catalog | True | True | 'database_display_filter': `''` | +| database_display_filters | glue_catalog | True | True | 'database_display_filters': (`''`, ...) 
| +| comment | glue_catalog | True | True | 'comment': `''` | +| aws_role | kinesis | True | True | 'aws_role': `''` | +| external_id | kinesis | True | True | 'external_id': `''` | +| aws_access_key_id | kinesis | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | kinesis | True | True | 'aws_secret_access_key': `''` | +| region | kinesis | False | False | 'region': `''` | +| read_only | kinesis | False | True | 'read_only': True/False | +| max_writers | kinesis | True | True | 'max_writers': `` | +| stream_display_filter | kinesis | True | True | 'stream_display_filter': `''` | +| stream_display_filters | kinesis | True | True | 'stream_display_filters': (`''`, ...) | +| comment | kinesis | True | True | 'comment': `''` | +| connection_string | snowflake | True | False | 'connection_string': `''` | +| user_name | snowflake | True | False | 'user_name': `''` | +| password | snowflake | True | False | 'password': `''` | +| max_concurrent_connections | snowflake | True | True | 'max_concurrent_connections': `` | +| comment | snowflake | True | True | 'comment': `''` | +| connection_string | redshift | True | False | 'connection_string': `''` | +| user_name | redshift | True | False | 'user_name': `''` | +| password | redshift | True | False | 'password': `''` | +| max_concurrent_connections | redshift | True | True | 'max_concurrent_connections': `` | +| comment | redshift | True | True | 'comment': `''` | +| connection_string | mysql | True | False | 'connection_string': `''` | +| user_name | mysql | True | False | 'user_name': `''` | +| password | mysql | True | False | 'password': `''` | +| comment | mysql | True | True | 'comment': `''` | +| connection_string | postgres | True | False | 'connection_string': `''` | +| user_name | postgres | True | False | 'user_name': `''` | +| password | postgres | True | False | 'password': `''` | +| comment | postgres | True | True | 'comment': `''` | +| connection_string | elasticsearch | True | False | 'connection_string': `''` | +| user_name | elasticsearch | True | False | 'user_name': `''` | +| password | elasticsearch | True | False | 'password': `''` | +| comment | elasticsearch | True | True | 'comment': `''` | +| connection_string | mongodb | True | False | 'connection_string': `''` | +| user_name | mongodb | True | False | 'user_name': `''` | +| password | mongodb | True | False | 'password': `''` | +| timeout | mongodb | True | True | 'timeout': "INTERVAL 'N' SECONDS" | +| comment | mongodb | True | True | 'comment': `''` | +| connection_string | mssql | True | False | 'connection_string': `''` | +| user_name | mssql | True | False | 'user_name': `''` | +| password | mssql | True | False | 'password': `''` | +| comment | mssql | True | True | 'comment': `''` | + +## Target options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| globally_unique_keys | datalake | False | True | 'globally_unique_keys': True/False | +| storage_connection | datalake | False | True | 'storage_connection': `''` | +| storage_location | datalake | False | True | 'storage_location': `''` | +| compute_cluster | datalake | True | True | 'compute_cluster': `''` | +| compression | datalake | True | True | 'compression': 'SNAPPY/GZIP' | +| compaction_processes | datalake | True | True | 'compaction_processes': `` | +| disable_compaction | datalake | True | True | 'disable_compaction': True/False | +| retention_date_partition | datalake | False | True | 
'retention_date_partition': `''` | +| table_data_retention | datalake | True | True | 'table_data_retention': `''` | +| column_data_retention | datalake | True | True | 'column_data_retention': ({'COLUMN' : `''`,'DURATION': `''`}) | +| comment | datalake | True | True | 'comment': `''` | +| storage_connection | materialized_view | False | True | 'storage_connection': `''` | +| storage_location | materialized_view | False | True | 'storage_location': `''` | +| max_time_travel_duration | materialized_view | True | True | 'max_time_travel_duration': `''` | +| compute_cluster | materialized_view | True | True | 'compute_cluster': `''` | +| column_transformations | snowflake | False | True | 'column_transformations': {`''` : `''` , ...} | +| deduplicate_with | snowflake | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| exclude_columns | snowflake | False | True | 'exclude_columns': (`''`, ...) | +| create_table_if_missing | snowflake | False | True | 'create_table_if_missing': True/False} | +| run_interval | snowflake | False | True | 'run_interval': `''` | + +## Transformation options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| run_interval | s3 | False | True | 'run_interval': `''` | +| start_from | s3 | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | s3 | True | True | 'end_at': `'/NOW'` | +| compute_cluster | s3 | True | True | 'compute_cluster': `''` | +| comment | s3 | True | True | 'comment': `''` | +| skip_validations | s3 | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | s3 | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | s3 | True | True | 'aggregation_parallelism': `` | +| run_parallelism | s3 | True | True | 'run_parallelism': `` | +| file_format | s3 | False | False | 'file_format': '(type = ``)' | +| compression | s3 | False | True | 'compression': 'SNAPPY/GZIP ...' | +| date_pattern | s3 | False | True | 'date_pattern': `''` | +| output_offset | s3 | False | True | 'output_offset': `''` | +| run_interval | elasticsearch | False | True | 'run_interval': `''` | +| routing_field_name | elasticsearch | True | True | 'routing_field_name': `''` | +| start_from | elasticsearch | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | elasticsearch | True | True | 'end_at': `'/NOW'` | +| compute_cluster | elasticsearch | True | True | 'compute_cluster': `''` | +| skip_validations | elasticsearch | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | elasticsearch | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | elasticsearch | True | True | 'aggregation_parallelism': `` | +| run_parallelism | elasticsearch | True | True | 'run_parallelism': `` | +| bulk_max_size_bytes | elasticsearch | True | True | 'bulk_max_size_bytes': `` | +| index_partition_size | elasticsearch | True | True | 'index_partition_size': 'HOURLY/DAILY ...' 
| +| comment | elasticsearch | True | True | 'comment': `''` | +| custom_insert_expressions | snowflake | True | True | 'custom_insert_expressions': {'INSERT_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} | +| custom_update_expressions | snowflake | True | True | 'custom_update_expressions': {'UPDATE_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} | +| keep_existing_values_when_null | snowflake | True | True | 'keep_existing_values_when_null': True/False | +| add_missing_columns | snowflake | False | True | 'add_missing_columns': True/False | +| run_interval | snowflake | False | True | 'run_interval': `''` | +| commit_interval | snowflake | True | True | 'commit_interval': `''` | +| start_from | snowflake | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | snowflake | True | True | 'end_at': `'/NOW'` | +| compute_cluster | snowflake | True | True | 'compute_cluster': `''` | +| skip_validations | snowflake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | snowflake | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | snowflake | True | True | 'aggregation_parallelism': `` | +| run_parallelism | snowflake | True | True | 'run_parallelism': `` | +| comment | snowflake | True | True | 'comment': `''` | +| add_missing_columns | datalake | False | True | 'add_missing_columns': True/False | +| run_interval | datalake | False | True | 'run_interval': `''` | +| start_from | datalake | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | datalake | True | True | 'end_at': `'/NOW'` | +| compute_cluster | datalake | True | True | 'compute_cluster': `''` | +| skip_validations | datalake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | datalake | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | datalake | True | True | 'aggregation_parallelism': `` | +| run_parallelism | datalake | True | True | 'run_parallelism': `` | +| comment | datalake | True | True | 'comment': `''` | +| run_interval | redshift | False | True | 'run_interval': `''` | +| start_from | redshift | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | redshift | True | True | 'end_at': `'/NOW'` | +| compute_cluster | redshift | True | True | 'compute_cluster': `''` | +| skip_validations | redshift | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | redshift | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | redshift | True | True | 'aggregation_parallelism': `` | +| run_parallelism | redshift | True | True | 'run_parallelism': `` | +| skip_failed_files | redshift | False | True | 'skip_failed_files': True/False | +| fail_on_write_error | redshift | False | True | 'fail_on_write_error': True/False | +| comment | redshift | True | True | 'comment': `''` | +| run_interval | postgres | False | True | 'run_interval': `''` | +| start_from | postgres | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | postgres | True | True | 'end_at': `'/NOW'` | +| compute_cluster | postgres | True | True | 'compute_cluster': `''` | +| skip_validations | postgres | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) 
| +| skip_all_validations | postgres | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | postgres | True | True | 'aggregation_parallelism': `` | +| run_parallelism | postgres | True | True | 'run_parallelism': `` | +| comment | postgres | True | True | 'comment': `''` | + +## Copy options + +| Option | Storage | Category | Editable | Optional | Config Syntax | +| -------| ---------- | -------- | -------- | -------- | ------------- | +| topic | kafka | source_options | False | False | 'topic': `''` | +| exclude_columns | kafka | job_options | False | True | 'exclude_columns': (`''`, ...) | +| deduplicate_with | kafka | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| consumer_properties | kafka | job_options | True | True | 'consumer_properties': `''` | +| reader_shards | kafka | job_options | True | True | 'reader_shards': `` | +| store_raw_data | kafka | job_options | False | True | 'store_raw_data': True/False | +| start_from | kafka | job_options | False | True | 'start_from': 'BEGINNING/NOW' | +| end_at | kafka | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | kafka | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | kafka | job_options | True | True | 'run_parallelism': `` | +| content_type | kafka | job_options | True | True | 'content_type': 'AUTO/CSV/...' | +| compression | kafka | job_options | False | True | 'compression': 'AUTO/GZIP/...' | +| column_transformations | kafka | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| commit_interval | kafka | job_options | True | True | 'commit_interval': `''` | +| skip_validations | kafka | job_options | False | True | 'skip_validations': ('MISSING_TOPIC') | +| skip_all_validations | kafka | job_options | False | True | 'skip_all_validations': True/False | +| comment | kafka | job_options | True | True | 'comment': `''` | +| table_include_list | mysql | source_options | True | True | 'table_include_list': (`''`, ...) | +| column_exclude_list | mysql | source_options | True | True | 'column_exclude_list': (`''`, ...) | +| exclude_columns | mysql | job_options | False | True | 'exclude_columns': (`''`, ...) | +| column_transformations | mysql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mysql | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mysql | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mysql | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mysql | job_options | True | True | 'snapshot_parallelism': `` | +| ddl_filters | mysql | job_options | False | True | 'ddl_filters': (`''`, ...) | +| comment | mysql | job_options | True | True | 'comment': `''` | +| table_include_list | postgres | source_options | False | False | 'table_include_list': (`''`, ...) | +| column_exclude_list | postgres | source_options | False | True | 'column_exclude_list': (`''`, ...) 
| +| heartbeat_table | postgres | job_options | False | True | 'heartbeat_table': `''` | +| skip_snapshots | postgres | job_options | False | True | 'skip_snapshots': True/False | +| publication_name | postgres | job_options | False | False | 'publication_name': `''` | +| end_at | postgres | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | postgres | job_options | True | True | 'compute_cluster': `''` | +| comment | postgres | job_options | True | True | 'comment': `''` | +| parse_json_columns | postgres | job_options | False | False | 'parse_json_columns': True/False | +| column_transformations | postgres | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| snapshot_parallelism | postgres | job_options | True | True | 'snapshot_parallelism': `` | +| exclude_columns | postgres | job_options | False | True | 'exclude_columns': (`''`, ...) | +| location | s3 | source_options | False | False | 'location': `''` | +| date_pattern | s3 | job_options | False | True | 'date_pattern': `''` | +| file_pattern | s3 | job_options | False | True | 'file_pattern': `''` | +| initial_load_pattern | s3 | job_options | False | True | 'initial_load_pattern': `''` | +| initial_load_prefix | s3 | job_options | False | True | 'initial_load_prefix': `''` | +| delete_files_after_load | s3 | job_options | False | True | 'delete_files_after_load': True/False | +| deduplicate_with | s3 | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| end_at | s3 | job_options | True | True | 'end_at': `'/NOW'` | +| start_from | s3 | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` | +| compute_cluster | s3 | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | s3 | job_options | True | True | 'run_parallelism': `` | +| content_type | s3 | job_options | True | True | 'content_type': 'AUTO/CSV...' | +| compression | s3 | job_options | False | True | 'compression': 'AUTO/GZIP...' | +| comment | s3 | job_options | True | True | 'comment': `''` | +| column_transformations | s3 | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| commit_interval | s3 | job_options | True | True | 'commit_interval': `''` | +| skip_validations | s3 | job_options | False | True | 'skip_validations': ('EMPTY_PATH') | +| skip_all_validations | s3 | job_options | False | True | 'skip_all_validations': True/False | +| exclude_columns | s3 | job_options | False | True | 'exclude_columns': (`''`, ...) | +| stream | kinesis | source_options | False | False | 'stream': `''` | +| reader_shards | kinesis | job_options | True | True | 'reader_shards': `` | +| store_raw_data | kinesis | job_options | False | True | 'store_raw_data': True/False | +| start_from | kinesis | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | kinesis | job_options | False | True | 'end_at': `'/NOW'` | +| compute_cluster | kinesis | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | kinesis | job_options | False | True | 'run_parallelism': `` | +| content_type | kinesis | job_options | True | True | 'content_type': 'AUTO/CSV...' | +| compression | kinesis | job_options | False | True | 'compression': 'AUTO/GZIP...' 
| +| comment | kinesis | job_options | True | True | 'comment': `''` | +| column_transformations | kinesis | job_options | True | True | 'column_transformations': {`''` : `''` , ...} | +| deduplicate_with | kinesis | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| commit_interval | kinesis | job_options | True | True | 'commit_interval': `''` | +| skip_validations | kinesis | job_options | False | True | 'skip_validations': ('MISSING_STREAM') | +| skip_all_validations | kinesis | job_options | False | True | 'skip_all_validations': True/False | +| exclude_columns | kinesis | job_options | False | True | 'exclude_columns': (`''`, ...) | +| table_include_list | mssql | source_options | True | True | 'table_include_list': (`''`, ...) | +| column_exclude_list | mssql | source_options | True | True | 'column_exclude_list': (`''`, ...) | +| exclude_columns | mssql | job_options | False | True | 'exclude_columns': (`''`, ...) | +| column_transformations | mssql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mssql | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mssql | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mssql | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mssql | job_options | True | True | 'snapshot_parallelism': `` | +| parse_json_columns | mssql | job_options | False | False | 'parse_json_columns': True/False | +| comment | mssql | job_options | True | True | 'comment': `''` | +| collection_include_list | mongodb | source_options | True | True | 'collection_include_list': (`''`, ...) | +| exclude_columns | mongodb | job_options | False | True | 'exclude_columns': (`''`, ...) | +| column_transformations | mongodb | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mongodb | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mongodb | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mongodb | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mongodb | job_options | True | True | 'snapshot_parallelism': `` | +| comment | mongodb | job_options | True | True | 'comment': `''` | diff --git a/website/docs/reference/resource-configs/where.md b/website/docs/reference/resource-configs/where.md index b0953e6f3d4..dbb3b66e901 100644 --- a/website/docs/reference/resource-configs/where.md +++ b/website/docs/reference/resource-configs/where.md @@ -3,13 +3,6 @@ resource_types: [tests] datatype: string --- - - -* `v0.20.0`: Introduced `where` config -* `v0.21.0`: Introduced `config` property for tests. Reimplemented `where` config with `get_where_subquery` macro - - - ### Definition Filter the resource being tested (model, source, seed, or snapshot). 
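For instance, the filter below is applied to the tested model (dbt wraps the model in a subquery with this condition) before the test query runs. This is a minimal sketch; the model, column, and date filter are assumptions:

```yaml
# models/schema.yml (sketch)
version: 2
models:
  - name: orders
    columns:
      - name: order_id
        tests:
          - unique:
              config:
                where: "order_date > '2023-01-01'"   # only recent rows are checked for uniqueness
```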
diff --git a/website/docs/reference/resource-properties/config.md b/website/docs/reference/resource-properties/config.md index 32143c1da07..e6021def852 100644 --- a/website/docs/reference/resource-properties/config.md +++ b/website/docs/reference/resource-properties/config.md @@ -108,13 +108,6 @@ version: 2 - - -We have added support for the `config` property on sources in dbt Core v1.1 - - - - @@ -133,8 +126,6 @@ sources: - - diff --git a/website/docs/reference/resource-properties/constraints.md b/website/docs/reference/resource-properties/constraints.md index b25893729e5..4e500ed64ea 100644 --- a/website/docs/reference/resource-properties/constraints.md +++ b/website/docs/reference/resource-properties/constraints.md @@ -20,7 +20,7 @@ Constraints require the declaration and enforcement of a model [contract](/refer Constraints may be defined for a single column, or at the model level for one or more columns. As a general rule, we recommend defining single-column constraints directly on those columns. The structure of a constraint is: -- `type` (required): one of `not_null`, `primary_key`, `foreign_key`, `check`, `custom` +- `type` (required): one of `not_null`, `unique`, `primary_key`, `foreign_key`, `check`, `custom` - `expression`: Free text input to qualify the constraint. Required for certain constraint types, and optional for others. - `name` (optional): Human-friendly name for this constraint. Supported by some data platforms. - `columns` (model-level only): List of column names to apply the constraint over @@ -53,6 +53,9 @@ models: # column-level constraints constraints: - type: not_null + - type: unique + - type: foreign_key + expression: . () - type: ... ``` @@ -228,7 +231,7 @@ select Snowflake suppports four types of constraints: `unique`, `not null`, `primary key` and `foreign key`. It is important to note that only the `not null` (and the `not null` property of `primary key`) are actually checked today. -There rest of the constraints are purely metadata, not verified when inserting data. +The rest of the constraints are purely metadata, not verified when inserting data. Currently, Snowflake doesn't support the `check` syntax and dbt will skip the `check` config and raise a warning message if it is set on some models in the dbt project. 
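To make the enforcement distinction above explicit, here is a sketch of a contracted Snowflake model with comments noting which constraints are actually checked (the model and column names are assumptions):

```yaml
# models/schema.yml (sketch)
version: 2
models:
  - name: dim_customers
    config:
      contract:
        enforced: true
    columns:
      - name: customer_id
        data_type: integer
        constraints:
          - type: not_null      # enforced by Snowflake
          - type: primary_key   # metadata only; the not null portion is checked, uniqueness is not
          - type: unique        # metadata only; not verified when inserting data
      - name: customer_name
        data_type: text
```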
@@ -350,7 +353,62 @@ models: -Expected DDL to enforce constraints: +### Column-level constraint on nested column: + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 'string' as a, + struct( + 1 as id, + 'name' as name, + struct(2 as id, struct('test' as again, '2' as even_more) as another) as double_nested + ) as b +``` + + + + + +```yml +version: 2 + +models: + - name: nested_column_constraints_example + config: + contract: + enforced: true + columns: + - name: a + data_type: string + - name: b.id + data_type: integer + constraints: + - type: not_null + - name: b.name + description: test description + data_type: string + - name: b.double_nested.id + data_type: integer + - name: b.double_nested.another.again + data_type: string + - name: b.double_nested.another.even_more + data_type: integer + constraints: + - type: not_null +``` + + + +### Expected DDL to enforce constraints: + ```sql diff --git a/website/docs/reference/resource-properties/freshness.md b/website/docs/reference/resource-properties/freshness.md index ae39a764cc1..f332f5a1b8f 100644 --- a/website/docs/reference/resource-properties/freshness.md +++ b/website/docs/reference/resource-properties/freshness.md @@ -88,13 +88,6 @@ This is particularly useful if: - You are using Snowflake, Databricks or Spark with large tables, and this results in a performance benefit - - -* `v0.15.0`: This property was introduced - - - - ## Examples ### Complete example diff --git a/website/docs/reference/resource-properties/quote.md b/website/docs/reference/resource-properties/quote.md index 3552d1d3d3a..50bf4c08c40 100644 --- a/website/docs/reference/resource-properties/quote.md +++ b/website/docs/reference/resource-properties/quote.md @@ -115,12 +115,6 @@ analyses: ## Definition The `quote` field can be used to enable or disable quoting for column names. - - -* `v0.16.0`: This configuration was added - - - ## Default The default quoting value is `false` diff --git a/website/docs/reference/resource-properties/tests.md b/website/docs/reference/resource-properties/tests.md index f25e5306542..6e2c02c6bc5 100644 --- a/website/docs/reference/resource-properties/tests.md +++ b/website/docs/reference/resource-properties/tests.md @@ -300,8 +300,6 @@ models: Check out the guide on writing a [custom generic test](/guides/best-practices/writing-custom-generic-tests) for more information. - - ### Custom test name By default, dbt will synthesize a name for your generic test by concatenating: @@ -438,10 +436,6 @@ $ dbt test **If using [`store_failures`](/reference/resource-configs/store_failures):** dbt uses each test's name as the name of the table in which to store any failing records. If you have defined a custom name for one test, that custom name will also be used for its table of failures. You may optionally configure an [`alias`](/reference/resource-configs/alias) for the test, to separately control both the name of the test (for metadata) and the name of its database table (for storing failures). - - - - ### Alternative format for defining tests When defining a generic test with several arguments and configurations, the YAML can look and feel unwieldy. If you find it easier, you can define the same test properties as top-level keys of a single dictionary, by providing the test name as `test_name` instead. It's totally up to you. 
@@ -470,5 +464,3 @@ models: ``` - - diff --git a/website/docs/reference/resource-properties/versions.md b/website/docs/reference/resource-properties/versions.md index 7e107ff31e3..86e9abf34a8 100644 --- a/website/docs/reference/resource-properties/versions.md +++ b/website/docs/reference/resource-properties/versions.md @@ -2,8 +2,12 @@ resource_types: [models] datatype: list required: no +keyword: governance, model version, model versioning, dbt model versioning --- +import VersionsCallout from '/snippets/_version-callout.md'; + + @@ -61,3 +65,62 @@ Note that the value of `defined_in` and the `alias` configuration of a model are - Follow a consistent naming convention for model versions and aliases. - Use `defined_in` and `alias` only if you have good reason. - Create a view that always points to the latest version of your model. You can automate this for all versioned models in your project with an `on-run-end` hook. For more details, read the full docs on ["Model versions"](/docs/collaborate/govern/model-versions#configuring-database-location-with-alias) + +### Detecting breaking changes + +When you use the `state:modified` selection method in Slim CI, dbt will detect changes to versioned model contracts, and raise an error if any of those changes could be breaking for downstream consumers. + +Breaking changes include: +- Removing an existing column +- Changing the `data_type` of an existing column +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) +- Changing unversioned, contracted models. + - dbt also warns if a model has or had a contract but isn't versioned + + + + + +``` + Breaking Change to Unversioned Contract for contracted_model (models/contracted_models/contracted_model.sql) + While comparing to previous project state, dbt detected a breaking change to an unversioned model. + - Contract enforcement was removed: Previously, this model's configuration included contract: {enforced: true}. It is no longer configured to enforce its contract, and this is a breaking change. + - Columns were removed: + - color + - date_day + - Enforced column level constraints were removed: + - id (ConstraintType.not_null) + - id (ConstraintType.primary_key) + - Enforced model level constraints were removed: + - ConstraintType.check -> ['id'] + - Materialization changed with enforced constraints: + - table -> view +``` + + + + +``` +Breaking Change to Contract Error in model sometable (models/sometable.sql) + While comparing to previous project state, dbt detected a breaking change to an enforced contract. + + The contract's enforcement has been disabled. + + Columns were removed: + - order_name + + Columns with data_type changes: + - order_id (number -> int) + + Consider making an additive (non-breaking) change instead, if possible. + Otherwise, create a new model version: https://docs.getdbt.com/docs/collaborate/govern/model-versions +``` + + + + + + +Additive changes are **not** considered breaking: +- Adding a new column to a contracted model +- Adding new `constraints` to an existing column in a contracted model diff --git a/website/docs/reference/seed-properties.md b/website/docs/reference/seed-properties.md index d8b72737646..85e7be21ae1 100644 --- a/website/docs/reference/seed-properties.md +++ b/website/docs/reference/seed-properties.md @@ -2,12 +2,6 @@ title: Seed properties --- - - - **v1.0.0:** The default path for [`seed-paths`](/reference/project-configs/seed-paths) (formerly `data-paths`) is now `seeds`. 
- - - Seed properties can be declared in `.yml` files under a `seed` key. We recommend that you put them in the `seeds/` directory. You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within that directory. @@ -42,9 +36,3 @@ seeds: - name: ... # declare properties of additional seeds ``` - - - -* `v0.16.0`: The ability to declare seed properties was introduced. Prior to this, you could declare seed properties under the `models:` key (confusing, right?). Support for declaring seed properties under a `models:` key will be removed in a future release. - - diff --git a/website/docs/reference/snapshot-properties.md b/website/docs/reference/snapshot-properties.md index 48c5328a400..301747e9325 100644 --- a/website/docs/reference/snapshot-properties.md +++ b/website/docs/reference/snapshot-properties.md @@ -40,9 +40,3 @@ snapshots: ``` - - - -* `v0.16.0`: The ability to declare snapshot properties was introduced. - - diff --git a/website/docs/reference/snowflake-permissions.md b/website/docs/reference/snowflake-permissions.md index 80dbec25cc8..6a469d12230 100644 --- a/website/docs/reference/snowflake-permissions.md +++ b/website/docs/reference/snowflake-permissions.md @@ -15,9 +15,11 @@ grant usage on schema database.an_existing_schema to role role_name; grant create table on schema database.an_existing_schema to role role_name; grant create view on schema database.an_existing_schema to role role_name; grant usage on future schemas in database database_name to role role_name; +grant monitor on future schemas in database database_name to role role_name; grant select on future tables in database database_name to role role_name; grant select on future views in database database_name to role role_name; grant usage on all schemas in database database_name to role role_name; +grant monitor on all schemas in database database_name to role role_name; grant select on all tables in database database_name to role role_name; grant select on all views in database database_name to role role_name; ``` diff --git a/website/docs/reference/source-configs.md b/website/docs/reference/source-configs.md index ef428f5934c..3f9a19e78ca 100644 --- a/website/docs/reference/source-configs.md +++ b/website/docs/reference/source-configs.md @@ -1,5 +1,5 @@ --- -title: "About source configurations" +title: Source configurations description: "Learn how to use source configurations in dbt." id: source-configs --- @@ -37,8 +37,6 @@ sources: - - ```yaml @@ -57,27 +55,14 @@ sources: - - ## Configuring sources - - Sources can be configured via a `config:` block within their `.yml` definitions, or from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](/docs/build/packages). You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. - - - - -Sources can be configured from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](package-management). You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. 
- -Unlike other resource types, sources do not yet support a `config` property. It is not possible to (re)define source configs hierarchically across multiple YAML files. - - ### Examples #### Disable all sources imported from a package @@ -97,8 +82,6 @@ sources: - - #### Conditionally enable a single source When defining a source, you can disable the entire source, or specific source tables, using the inline `config` property: @@ -138,8 +121,6 @@ sources: - - #### Disable a single source from a package To disable a specific source from another package, qualify the resource path for your configuration with both a package name and a source name. In this case, we're disabling the `clickstream` source from the `events` package. diff --git a/website/docs/terms/data-lineage.md b/website/docs/terms/data-lineage.md index 41779e77702..a03687eaba3 100644 --- a/website/docs/terms/data-lineage.md +++ b/website/docs/terms/data-lineage.md @@ -63,7 +63,7 @@ In the greater data world, you may often hear of data lineage systems based on t If you use a transformation tool such as dbt that automatically infers relationships between data sources and models, a DAG automatically populates to show you the lineage that exists for your [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). - + Your is used to visually show upstream dependencies, the nodes that must come before a current model, and downstream relationships, the work that is impacted by the current model. DAGs are also directional—they show a defined flow of movement and form non-cyclical loops. @@ -79,7 +79,7 @@ DAGs shouldn’t be dependent on manual updates. Instead, your DAG should be aut ### Third-party tooling -Data teams may also choose to use third-party tools  with  lineage capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), Metaphor, [Monte Carlo](https://docs.getmontecarlo.com/docs/dbt-cloud), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), or [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/). These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data lineage capabilities such as column-level or business logic-level lineage. +Data teams may also choose to use third-party tools with lineage capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Datafold](https://www.datafold.com/column-level-lineage), Metaphor, [Monte Carlo](https://docs.getmontecarlo.com/docs/dbt-cloud), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), or [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/). These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data lineage capabilities such as column-level or business logic-level lineage. 
## Data lineage challenges diff --git a/website/docs/terms/monotonically-increasing.md b/website/docs/terms/monotonically-increasing.md index 397e333942a..b4e3987995d 100644 --- a/website/docs/terms/monotonically-increasing.md +++ b/website/docs/terms/monotonically-increasing.md @@ -1,11 +1,11 @@ --- id: monotonically-increasing title: Monotonically increasing -description: Monotonicity means unchanging (think monotone). A monotonically-increasing value is a value which increases at a constant rate, for example the values 1, 2, 3, 4. +description: A monotonically increasing sequence is a sequence whose values never decrease (each element is greater than or equal to the one before it). For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. displayText: monotonically increasing -hoverSnippet: Monotonicity means unchanging (think monotone). A monotonically-increasing value is a value which increases at a constant rate, for example the values 1, 2, 3, 4. +hoverSnippet: A monotonically increasing sequence is a sequence whose values never decrease (each element is greater than or equal to the one before it). For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. --- -Monotonicity means unchanging (think monotone). A monotonically-increasing value is a value which increases at a constant rate, for example the values `[1, 2, 3, 4]`. +Monotonicity means unchanging (think monotone); a monotonic sequence is one whose values move in only one direction. A monotonically increasing sequence is a sequence whose values never decrease, meaning each element is greater than or equal to the one before it. For example, the sequences `[1, 6, 7, 11, 131]` or `[2, 5, 5, 5, 6, 10]`. -Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an idempotent data model. Instead, you should create a surrogate key which is derived from the unique component(s) of a row. \ No newline at end of file +Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an idempotent data model. Instead, you should create a surrogate key which is derived from the unique component(s) of a row. diff --git a/website/docs/terms/predicate-pushdown.md b/website/docs/terms/predicate-pushdown.md new file mode 100644 index 00000000000..8e9bad85b6b --- /dev/null +++ b/website/docs/terms/predicate-pushdown.md @@ -0,0 +1,10 @@ +--- +id: predicate-pushdown +title: Predicate pushdown +description: Predicate pushdown is a query optimization that applies filter conditions (predicates) as early as possible, so the database reads only the rows relevant to a query +displayText: Predicate pushdown +hoverSnippet: Predicate pushdown is a query optimization that applies filter conditions (predicates) as early as possible, so the database reads only the rows relevant to a query +--- + +A predicate is an expression, such as the condition in a `WHERE` clause, that determines which rows in a database apply to a particular query. For example, if you filter on a specific dimension value, the database evaluates that condition to decide which rows qualify. The optimization known as "predicate pushdown" applies this filtering as early as possible, pushing it down to the database or storage layer so that only the relevant rows are read, leading to faster query performance.
+ diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 111c3d1b4ae..0cc6299ed39 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -80,7 +80,7 @@ var siteSettings = { announcementBarLink: "https://www.getdbt.com/resources/dbt-cloud-demos-with-experts/?utm_medium=event&utm_source=docs&utm_campaign=q1-2024_cloud-demos-with-experts_awareness", // Set community spotlight member on homepage // This is the ID for a specific file under docs/community/spotlight - communitySpotlightMember: "david-effiong", + communitySpotlightMember: "faith-lierheimer", prism: { theme: (() => { var theme = require("prism-react-renderer/themes/nightOwl"); diff --git a/website/functions/image-cache-wrapper.js b/website/functions/image-cache-wrapper.js new file mode 100644 index 00000000000..aad2ffff200 --- /dev/null +++ b/website/functions/image-cache-wrapper.js @@ -0,0 +1,12 @@ +// This function is used to break the cache on images +// preventing stale or broken images from being served + +const CACHE_VERSION = '2' + +export default function imageCacheWrapper(src) { + const cacheParam = `?v=${CACHE_VERSION}` + + return ( + src + cacheParam + ) +} diff --git a/website/sidebars.js b/website/sidebars.js index 189a6c2b855..d287ebe2cf5 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -9,6 +9,8 @@ const sidebarSettings = { items: [ "docs/supported-data-platforms", "docs/connect-adapters", + "docs/verified-adapters", + "docs/trusted-adapters", "docs/community-adapters", "docs/contribute-core-adapters", ], @@ -139,10 +141,10 @@ const sidebarSettings = { "docs/cloud/secure/ip-restrictions", "docs/cloud/secure/about-privatelink", "docs/cloud/secure/snowflake-privatelink", - "docs/cloud/secure/redshift-privatelink", "docs/cloud/secure/databricks-privatelink", ], }, // PrivateLink + "docs/cloud/billing", ], }, { @@ -216,6 +218,7 @@ const sidebarSettings = { "docs/core/connect-data-platform/databend-setup", "docs/core/connect-data-platform/fal-setup", "docs/core/connect-data-platform/decodable-setup", + "docs/core/connect-data-platform/upsolver-setup", ], }, ], @@ -237,7 +240,6 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/build/sources" }, items: [ - "docs/build/sources", { type: "category", label: "Models", @@ -248,19 +250,24 @@ const sidebarSettings = { "docs/build/python-models", ], }, - "docs/build/seeds", "docs/build/snapshots", + "docs/build/seeds", + "docs/build/tests", + "docs/build/jinja-macros", + "docs/build/sources", "docs/build/exposures", "docs/build/metrics", "docs/build/groups", + "docs/build/analyses", ], }, { type: "category", label: "Build your metrics", - link: { type: "doc", id: "docs/build/build-metrics-intro"}, + link: { type: "doc", id: "docs/build/build-metrics-intro" }, collapsed: true, items: [ + "docs/build/sl-getting-started", { type: "category", label: "About MetricFlow", @@ -269,9 +276,9 @@ const sidebarSettings = { "docs/build/join-logic", "docs/build/validation", "docs/build/metricflow-time-spine", + "docs/build/metricflow-cli", ] }, - "docs/build/sl-getting-started", { type: "category", label: "Semantic models", @@ -285,7 +292,7 @@ const sidebarSettings = { { type: "category", label: "Metrics", - link: { type: "doc", id: "docs/build/metrics-overview"}, + link: { type: "doc", id: "docs/build/metrics-overview" }, items: [ "docs/build/cumulative", "docs/build/derived", @@ -301,7 +308,6 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/build/tests" }, items: [ - 
"docs/build/tests", "docs/build/materializations", "docs/build/incremental-models", ], @@ -312,11 +318,9 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/build/jinja-macros" }, items: [ - "docs/build/jinja-macros", "docs/build/project-variables", "docs/build/environment-variables", "docs/build/packages", - "docs/build/analyses", "docs/build/hooks-operations", ], }, @@ -344,22 +348,15 @@ const sidebarSettings = { items: [ "docs/deploy/job-scheduler", "docs/deploy/deploy-environments", + "docs/deploy/continuous-integration", { type: "category", - label: "dbt Cloud jobs", - link: { type: "doc", id: "docs/deploy/dbt-cloud-job" }, + label: "Jobs", + link: { type: "doc", id: "docs/deploy/jobs" }, items: [ - "docs/deploy/job-settings", + "docs/deploy/deploy-jobs", + "docs/deploy/ci-jobs", "docs/deploy/job-commands", - "docs/deploy/job-triggers", - ], - }, - { - type: "category", - label: "Continuous integration", - link: { type: "doc", id: "docs/deploy/continuous-integration" }, - items: [ - "docs/deploy/slim-ci-jobs", ], }, { @@ -382,6 +379,7 @@ const sidebarSettings = { type: "category", label: "Collaborate with others", items: [ + "docs/collaborate/explore-projects", { type: "category", label: "Git version control", @@ -427,10 +425,10 @@ const sidebarSettings = { collapsed: true, link: { type: "doc", id: "docs/use-dbt-semantic-layer/quickstart-semantic-layer" }, items: [ - "docs/use-dbt-semantic-layer/quickstart-semantic-layer", - "docs/use-dbt-semantic-layer/dbt-semantic-layer", - "docs/use-dbt-semantic-layer/setup-dbt-semantic-layer", + "docs/use-dbt-semantic-layer/quickstart-sl", + "docs/use-dbt-semantic-layer/setup-sl", "docs/use-dbt-semantic-layer/avail-sl-integrations", + "docs/use-dbt-semantic-layer/sl-architecture", ], }, { @@ -485,25 +483,65 @@ const sidebarSettings = { label: "Schema", link: { type: "doc", id: "docs/dbt-cloud-apis/discovery-schema-environment" }, items: [ - "docs/dbt-cloud-apis/discovery-schema-environment", - "docs/dbt-cloud-apis/discovery-schema-model", - "docs/dbt-cloud-apis/discovery-schema-models", - "docs/dbt-cloud-apis/discovery-schema-modelByEnv", - "docs/dbt-cloud-apis/discovery-schema-metric", - "docs/dbt-cloud-apis/discovery-schema-metrics", - "docs/dbt-cloud-apis/discovery-schema-source", - "docs/dbt-cloud-apis/discovery-schema-sources", - "docs/dbt-cloud-apis/discovery-schema-seed", - "docs/dbt-cloud-apis/discovery-schema-seeds", - "docs/dbt-cloud-apis/discovery-schema-snapshots", - "docs/dbt-cloud-apis/discovery-schema-test", - "docs/dbt-cloud-apis/discovery-schema-tests", - "docs/dbt-cloud-apis/discovery-schema-exposure", - "docs/dbt-cloud-apis/discovery-schema-exposures", + { + type: "category", + label: "Job", + link: { type: "doc", id: "docs/dbt-cloud-apis/discovery-schema-job" }, + items: [ + "docs/dbt-cloud-apis/discovery-schema-job-model", + "docs/dbt-cloud-apis/discovery-schema-job-models", + "docs/dbt-cloud-apis/discovery-schema-job-metric", + "docs/dbt-cloud-apis/discovery-schema-job-metrics", + "docs/dbt-cloud-apis/discovery-schema-job-source", + "docs/dbt-cloud-apis/discovery-schema-job-sources", + "docs/dbt-cloud-apis/discovery-schema-job-seed", + "docs/dbt-cloud-apis/discovery-schema-job-seeds", + // "docs/dbt-cloud-apis/discovery-schema-job-snapshot", + "docs/dbt-cloud-apis/discovery-schema-job-snapshots", + "docs/dbt-cloud-apis/discovery-schema-job-test", + "docs/dbt-cloud-apis/discovery-schema-job-tests", + "docs/dbt-cloud-apis/discovery-schema-job-exposure", + 
"docs/dbt-cloud-apis/discovery-schema-job-exposures", + // "docs/dbt-cloud-apis/discovery-schema-job-macro", + // "docs/dbt-cloud-apis/discovery-schema-job-macros", + ], + }, + { + type: "category", + label: "Environment", + link: { type: "doc", id: "docs/dbt-cloud-apis/discovery-schema-environment" }, + items: [ + { + type: "category", + label: "Applied", + items: [ + "docs/dbt-cloud-apis/discovery-schema-environment-applied-modelHistoricalRuns", + ], + }, + // Uncomment to add Definition subpage, but need to make items non-empty + // { + // type: "category", + // label: "Definition", + // items: [ + // // insert pages here + // ], + // }, + ], + }, ], }, ], }, + { + type: "category", + label: "Semantic Layer APIs", + link: { type: "doc", id: "docs/dbt-cloud-apis/sl-api-overview" }, + items: [ + "docs/dbt-cloud-apis/sl-jdbc", + "docs/dbt-cloud-apis/sl-graphql", + "docs/dbt-cloud-apis/sl-manifest", + ], + }, ], }, { @@ -606,6 +644,7 @@ const sidebarSettings = { "reference/resource-configs/doris-configs", "reference/resource-configs/fal-configs", "reference/resource-configs/oracle-configs", + "reference/resource-configs/upsolver-configs", ], }, { @@ -874,7 +913,8 @@ const sidebarSettings = { "guides/best-practices/how-we-structure/2-staging", "guides/best-practices/how-we-structure/3-intermediate", "guides/best-practices/how-we-structure/4-marts", - "guides/best-practices/how-we-structure/5-the-rest-of-the-project", + "guides/best-practices/how-we-structure/5-semantic-layer-marts", + "guides/best-practices/how-we-structure/6-the-rest-of-the-project", ], }, { @@ -895,30 +935,34 @@ const sidebarSettings = { }, { type: "category", - label: "Materializations best practices", + label: "How we build our metrics", link: { type: "doc", - id: "guides/best-practices/materializations/materializations-guide-1-guide-overview", + id: "guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro", }, items: [ - "guides/best-practices/materializations/materializations-guide-2-available-materializations", - "guides/best-practices/materializations/materializations-guide-3-configuring-materializations", - "guides/best-practices/materializations/materializations-guide-4-incremental-models", - "guides/best-practices/materializations/materializations-guide-5-best-practices", - "guides/best-practices/materializations/materializations-guide-6-examining-builds", - "guides/best-practices/materializations/materializations-guide-7-conclusion", + "guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup", + "guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models", + "guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics", + "guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart", + "guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics", + "guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion", ], }, { type: "category", - label: "dbt Cloud Environment best practices", + label: "Materializations best practices", link: { type: "doc", - id: "guides/best-practices/environment-setup/1-env-guide-overview", + id: "guides/best-practices/materializations/materializations-guide-1-guide-overview", }, items: [ - "guides/best-practices/environment-setup/2-one-deployment-environment", - "guides/best-practices/environment-setup/3-many-deployment-environments", + "guides/best-practices/materializations/materializations-guide-2-available-materializations", + 
"guides/best-practices/materializations/materializations-guide-3-configuring-materializations", + "guides/best-practices/materializations/materializations-guide-4-incremental-models", + "guides/best-practices/materializations/materializations-guide-5-best-practices", + "guides/best-practices/materializations/materializations-guide-6-examining-builds", + "guides/best-practices/materializations/materializations-guide-7-conclusion", ], }, "guides/best-practices/debugging-errors", @@ -951,13 +995,26 @@ const sidebarSettings = { }, { type: "category", - label: "Customizing CI/CD", + label: "Set up Continuous Integration", + link: { + type: "doc", + id: "guides/orchestration/set-up-ci/introduction", + }, + items: [ + "guides/orchestration/set-up-ci/quick-setup", + "guides/orchestration/set-up-ci/run-dbt-project-evaluator", + "guides/orchestration/set-up-ci/lint-on-push", + "guides/orchestration/set-up-ci/multiple-checks", + ], + }, + { + type: "category", + label: "Custom Continuous Deployment Workflows", link: { type: "doc", id: "guides/orchestration/custom-cicd-pipelines/1-cicd-background", }, items: [ - "guides/orchestration/custom-cicd-pipelines/2-lint-on-push", "guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge", "guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr", "guides/orchestration/custom-cicd-pipelines/5-something-to-consider", @@ -987,6 +1044,7 @@ const sidebarSettings = { type: "category", label: "Migration", items: [ + "guides/migration/sl-migration", { type: "category", label: "Versions", @@ -1060,6 +1118,7 @@ const sidebarSettings = { "guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", "guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter", "guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter", + "guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter", ], }, { @@ -1169,6 +1228,7 @@ const sidebarSettings = { "community/resources/oss-expectations", "community/resources/oss-projects", "community/resources/contributor-license-agreements", + "community/resources/jobs-terms-and-conditions", "community/resources/speaking-at-a-meetup", ], }, diff --git a/website/snippets/_adapters-trusted.md b/website/snippets/_adapters-trusted.md new file mode 100644 index 00000000000..10af0218e22 --- /dev/null +++ b/website/snippets/_adapters-trusted.md @@ -0,0 +1,8 @@ +
      diff --git a/website/snippets/_adapters-verified.md b/website/snippets/_adapters-verified.md new file mode 100644 index 00000000000..7caf099b7d1 --- /dev/null +++ b/website/snippets/_adapters-verified.md @@ -0,0 +1,62 @@ +
      + +* Install these adapters using the CLI as they're not currently supported in dbt Cloud.
      diff --git a/website/snippets/_cloud-environments-info.md b/website/snippets/_cloud-environments-info.md index d8ea7e3d799..5388379dc34 100644 --- a/website/snippets/_cloud-environments-info.md +++ b/website/snippets/_cloud-environments-info.md @@ -42,3 +42,36 @@ By default, all environments will use the default branch in your repository (usu - **Deployment:** determines the branch is cloned during job executions for each environment. For more info, check out this [FAQ page on this topic](/faqs/Environments/custom-branch-settings)! + + +### Extended attributes (Beta) + +:::important This feature is currently in beta + +Extended Attributes is currently in [beta](/docs/dbt-versions/product-lifecycles?) for select users and is subject to change. +::: + +Extended Attributes is a feature that allows users to set a flexible [profiles.yml](/docs/core/connect-data-platform/profiles.yml) snippet in their dbt Cloud Environment settings. It provides users with more control over environments (both deployment and development) and extends how dbt Cloud connects to the data platform within a given environment. + +Extended Attributes is a text box extension at the environment level that overrides connection or environment credentials, including any custom environment variables. You can set any YAML attributes that a dbt adapter accepts in its `profiles.yml`. + +Something to note, Extended Attributes doesn't mask secret values. We recommend avoiding setting secret values to prevent visibility in the text box and logs. + +
      + +If you're developing in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [orchestrating job runs](/docs/deploy/deployments), Extended Attributes parses through the provided YAML and extracts the `profiles.yml` attributes. For each individual attribute: + +- If the attribute exists in another source (such as your project settings), it will replace its value (like environment-level values) in the profile. It also overrides any custom environment variables. + +- If the attribute doesn't exist, it will add the attribute or value pair to the profile. + +Only the **top-level keys** are accepted in extended attributes. This means that if you want to change a specific sub-key value, you must provide the entire top-level key as a JSON block in your resulting YAML. For example, if you want to customize a particular field within a [service account JSON](/docs/core/connect-data-platform/bigquery-setup#service-account-json) for your BigQuery connection (like 'project_id' or 'client_email'), you need to provide an override for the entire top-level `keyfile_json` main key/attribute using extended attributes. Include the sub-fields as a nested JSON block. + +The following code is an example of the types of attributes you can add in the **Extended Attributes** text box: + +```yaml +dbname: jaffle_shop +schema: dbt_alice +threads: 4 +``` + diff --git a/website/snippets/_discovery_api_job_deprecation_notice.md b/website/snippets/_discovery_api_job_deprecation_notice.md new file mode 100644 index 00000000000..71e80a958b4 --- /dev/null +++ b/website/snippets/_discovery_api_job_deprecation_notice.md @@ -0,0 +1,7 @@ +:::caution +dbt Labs is making changes to the Discovery API. These changes will take effect on September 7, 2023. + +The data type `Int` for `id` is being deprecated and will be replaced with `BigInt`. Currently, both data types are supported. + +To perform job-based queries, you must do it within the `job` schema object, and move the `jobId` and `runId` arguments to `job(...)`. This is now supported so you can update your API calls accordingly. For details, refer to [Job object schema](/docs/dbt-cloud-apis/discovery-schema-job). +::: diff --git a/website/snippets/_enterprise-permissions-table.md b/website/snippets/_enterprise-permissions-table.md new file mode 100644 index 00000000000..779c5bcb479 --- /dev/null +++ b/website/snippets/_enterprise-permissions-table.md @@ -0,0 +1,90 @@ + +Key: + +* (W)rite — Create new or modify existing. Includes `send`, `create`, `delete`, `allocate`, `modify`, and `read`. +* (R)ead — Can view but can not create or change any fields. + +Permissions: + +* Account-level permissions — Permissions related to management of the dbt Cloud account. For example, billing and account settings. +* Project-level permissions — Permissions related to the projects in dbt Cloud. For example, repos and access to the IDE. + +### Account roles +Account roles enable you to manage the dbt Cloud account and manage the account settings (for example, generating service tokens, inviting users, configuring SSO). They also provide project-level permissions. The **Account Admin** role is the highest level of access you can assign. 
+ +#### Account permissions for account roles + +| Account-level permission| Account Admin | Billing admin | Project creator | Security admin | Viewer | +|:-------------------------|:-------------:|:-------------:|:---------------:|:--------------:|:------:| +| Account settings | W | | R | R | R | +| Audit logs | R | | | R | | +| Auth provider | W | | | W | R | +| Billing | W | W | | | R | +| Invitations | W | | W | W | R | +| IP restrictions | W | | | W | R | +| Members | W | | W | W | R | +| Project (create) | W | | W | | | +| Public models | R | R | R | R | R | +| Service tokens | W | | | R | | +| Webhooks | W | | | | | + +#### Project permissions for account roles + +|Project-level permission | Account Admin | Billing admin | Project creator | Security admin | Viewer | +|:-------------------------|:-------------:|:-------------:|:---------------:|:--------------:|:------:| +| Connections | W | | W | | R | +| Credentials | W | | W | | R | +| Custom env. variables | W | | W | | R | +| dbt adapters | W | | W | | R | +| Develop (IDE) | W | | W | | | +| Environments | W | | W | | R | +| Groups | W | | R | W | R | +| Jobs | W | | W | | R | +| Licenses | W | | W | W | R | +| Metadata | R | | R | | R | +| Permissions | W | | W | W | R | +| Profile | W | | W | | R | +| Projects | W | | W | R | R | +| Repositories | W | | W | | R | +| Runs | W | | W | | R | +| Semantic Layer Config | W | | W | | R | + + +### Project role permissions + +The project roles enable you to work within the projects in various capacities. They primarily provide access to project-level permissions such as repos and the IDE, but may also provide some account-level permissions. + +#### Account permissions for project roles + +| Account-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job viewer | Metadata | Semantic Layer | Stakeholder | Team admin | Webhook | +|--------------------------|:-----:|:-------:|:--------------:|:---------:|:---------:|:---------:|:-----------:|:--------:|:--------------:|:-----------:|:----------:|:------:| +| Account settings | R | | R | | R | | | | | | R | | +| Auth provider | | | | | | | | | | | | | +| Billing | | | | | | | | | | | | | +| Invitations | W | R | R | R | R | R | R | | | R | R | | +| Members | W | | R | R | R | | | | | R | R | | +| Project (create) | | | | | | | | | | | | | +| Public models | R | R | R | R | R | R | R | R | R | R | R | R | +| Service tokens | | | | | | | | | | | | | +| Webhooks | W | | | W | | | | | | | | W | + +#### Project permissions for project roles + +|Project-level permission | Admin | Analyst | Database admin | Developer | Git Admin | Job admin | Job viewer | Metadata | Semantic Layer | Stakeholder | Team admin | Webhook | +|--------------------------|:-----:|:-------:|:--------------:|:---------:|:---------:|:---------:|:-----------:|:--------:|:--------------:|:-----------:|:----------:|:------:| +| Connections | W | R | W | R | R | R | | | | R | R | | +| Credentials | W | W | W | W | R | W | | | | R | R | | +| Custom env. 
variables | W | W | W | W | W | W | R | | | R | W | | +| dbt adapters | W | W | W | W | R | W | | | | R | R | | +| Develop (IDE) | W | W | | W | | | | | | | | | +| Environments | W | R | R | R | R | W | R | | | R | R | | +| Groups | R | | R | R | R | | | | | R | R | | +| Jobs | W | R | R | W | R | W | R | | | R | R | | +| Licenses | W | R | R | R | R | R | R | | | | R | | +| Metadata | R | R | R | R | R | R | R | R | | R | R | | +| Permissions | W | | R | R | R | | | | | | W | | +| Profile | W | R | W | R | R | R | | | | R | R | | +| Projects | W | W | W | W | W | R | R | | | R | W | | +| Repositories | W | | R | R | W | | | | | R | R | | +| Runs | W | R | R | W | R | W | R | | | R | R | | +| Semantic Layer Config | W | R | W | R | R | R | | | W | R | R | | diff --git a/website/snippets/_explorer-beta-banner.md b/website/snippets/_explorer-beta-banner.md new file mode 100644 index 00000000000..ab501c7bd0f --- /dev/null +++ b/website/snippets/_explorer-beta-banner.md @@ -0,0 +1,3 @@ +:::info Beta +This feature is related to dbt Explorer and cross-project references [beta](/docs/dbt-versions/product-lifecycles#dbt-cloud) projects and subject to change. If you are interested in getting access to the beta, please [contact us](mailto:support@getdbt.com). +::: diff --git a/website/snippets/_explorer-beta-note.md b/website/snippets/_explorer-beta-note.md new file mode 100644 index 00000000000..3bdcd5bcf7a --- /dev/null +++ b/website/snippets/_explorer-beta-note.md @@ -0,0 +1 @@ +**Note:** Make sure to set the environment to "Production" so you can take advantage of features like dbt Explorer and cross-project references. Refer to [Set product environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) for details. diff --git a/website/snippets/_legacy-sl-callout.md b/website/snippets/_legacy-sl-callout.md new file mode 100644 index 00000000000..f45c6b68af3 --- /dev/null +++ b/website/snippets/_legacy-sl-callout.md @@ -0,0 +1,11 @@ +:::important Upgrade to access the new dbt Semantic Layer + +The dbt Semantic Layer has undergone a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), improving governance, introducing a new API, and making it more efficient to define and query metrics. The legacy Semantic Layer, available in dbt v1.5 or lower, is no longer supported and won't receive any code fixes. + +**Who does this affect?** Anyone who uses the legacy Semantic Layer. The new Semantic Layer is available to [Team or Enterprise](https://www.getdbt.com/pricing/) multi-tenant dbt Cloud plans [hosted in North America](/docs/cloud/about-cloud/regions-ip-addresses) (more regions coming soon). You must be on dbt v1.6 or higher to access it. Users on dbt Cloud Developer plans or dbt Core users can use MetricFlow to only define and test metrics locally. + +**What’s changed?** The dbt_metrics package has been [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. + +**What should you do?** If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. 
+ +::: diff --git a/website/snippets/_new-sl-changes.md b/website/snippets/_new-sl-changes.md new file mode 100644 index 00000000000..6eca327001a --- /dev/null +++ b/website/snippets/_new-sl-changes.md @@ -0,0 +1,8 @@ + +:::tip Introducing the new dbt Semantic Layer 🎉 + +The dbt Semantic Layer has been re-released with [significant improvements](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), making it more efficient to define and query metrics. + +The new version is available in [public beta](/docs/dbt-versions/release-notes/Aug-2023/sl-revamp-beta#public-beta) and introduces [MetricFlow](/docs/build/about-metricflow), an essential component. It also includes new semantic elements, better governance, improved efficiency, easier data access, and new dbt Semantic Layer APIs. + +::: diff --git a/website/snippets/_new-sl-setup.md b/website/snippets/_new-sl-setup.md new file mode 100644 index 00000000000..b802db9c5ae --- /dev/null +++ b/website/snippets/_new-sl-setup.md @@ -0,0 +1,39 @@ +You can set up the dbt Semantic Layer in dbt Cloud at the environment and project level. Before you begin: + +- You must have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment, hosted in North America. +- You must be part of the Owner group, and have the correct [license](/docs/cloud/manage-access/seats-and-users) and [permissions](/docs/cloud/manage-access/self-service-permissions) to configure the Semantic Layer: + * Enterprise plan — Developer license with Account Admin permissions. Or Owner with a Developer license, assigned Project Creator, Database Admin, or Admin permissions. + * Team plan — Owner with a Developer license. +- You must have a successful run in your new environment. + +:::tip +If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. +::: + +1. In dbt Cloud, create a new [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) or use an existing environment on dbt 1.6 or higher. + * Note — Deployment environment is currently supported (_development experience coming soon_) + +2. Navigate to **Account Settings** and select the specific project you want to enable the Semantic Layer for. + +3. In the **Project Details** page, navigate to the **Semantic Layer** section, and select **Configure Semantic Layer**. + + + +4. In the **Set Up Semantic Layer Configuration** page, enter the credentials you want the Semantic Layer to use specific to your data platform. We recommend credentials have the least privileges required because your Semantic Layer users will be querying it in downstream applications. At a minimum, the Semantic Layer needs to have read access to the schema(s) that contains the dbt models that you used to build your semantic models. + + + +5. Select the deployment environment you want for the Semantic Layer and click **Save**. + +6. After saving it, you'll be provided with the connection information that allows you to connect to downstream tools. If your tool supports JDBC, save the JDBC URL or individual components (like environment id and host). If it uses the GraphQL API, save the GraphQL API host information instead. + + + +7. Save and copy your environment ID, service token, and host, which you'll need to use downstream tools. 
For more info on how to integrate with partner integrations, refer to [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations). + +8. Return to the **Project Details** page, then select **Generate Service Token**. You will need Semantic Layer Only and Metadata Only [service token](/docs/dbt-cloud-apis/service-tokens) permissions. + + + +Great job, you've configured the Semantic Layer 🎉! + diff --git a/website/snippets/_onrunstart-onrunend-commands.md b/website/snippets/_onrunstart-onrunend-commands.md new file mode 100644 index 00000000000..68d693ce426 --- /dev/null +++ b/website/snippets/_onrunstart-onrunend-commands.md @@ -0,0 +1 @@ +dbt build, dbt compile, dbt docs generate, dbt run, dbt seed, dbt snapshot, or dbt test. diff --git a/website/snippets/_sl-configure-metricflow.md b/website/snippets/_sl-configure-metricflow.md new file mode 100644 index 00000000000..10f92161783 --- /dev/null +++ b/website/snippets/_sl-configure-metricflow.md @@ -0,0 +1 @@ +MetricFlow requires a time spine for certain metric types and join resolution patterns, like cumulative metrics. You will have to create this model in your dbt project. [This article](/docs/build/metricflow-time-spine) explains how to add the `metricflow_time_spine` model to your project. diff --git a/website/snippets/_sl-create-semanticmodel.md b/website/snippets/_sl-create-semanticmodel.md new file mode 100644 index 00000000000..bc4276efcb6 --- /dev/null +++ b/website/snippets/_sl-create-semanticmodel.md @@ -0,0 +1,150 @@ +The following steps will walk you through setting up semantic models, which you can do with the dbt Cloud IDE or the CLI. Semantic models consist of [entities](/docs/build/entities), [dimensions](/docs/build/dimensions), and [measures](/docs/build/measures). + +We highly recommend you read the overview of what a [semantic model](/docs/build/semantic-models) is before getting started. If you're working in the [Jaffle shop example](https://github.com/dbt-labs/jaffle-sl-template), delete the `orders.yml` config or delete the .yml extension so it's ignored during parsing. **We'll be rebuilding it step by step in this example.** + +If you're following the guide in your own project, pick a model that you want to build a semantic manifest from and fill in the config values accordingly. + +1. Create a new yml config file for the orders model, such as `orders.yml`. + +It's best practice to create semantic models in the `/models/semantic_models` directory in your project. Semantic models are nested under the `semantic_models` key. First, fill in the name and appropriate metadata, map it to a model in your dbt project, and specify model defaults. For now, `default_agg_time_dimension` is the only supported default. + +```yaml +semantic_models: + #The name of the semantic model. + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + #The name of the dbt model and schema + model: ref('orders') + ``` + +2. Define your entities. These are the keys in your table that MetricFlow will use to join other semantic models. These are usually columns like `customer_id`, `order_id`, and so on. + +```yaml + #Entities. These usually correspond to keys in the table. + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + ``` + +3. Define your dimensions and measures. 
Dimensions are properties of the records in your table that are non-aggregatable. They provide categorical or time-based context to enrich metrics. Measures are the building block for creating metrics. They are numerical columns that MetricFlow aggregates to create metrics. + +```yaml + #Measures. These are the aggregations on the columns in the table. + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum + - name: customers_with_orders + description: Distinct count of customers placing orders + agg: count_distinct + expr: customer_id + - name: locations_with_orders + description: Distinct count of locations with order + expr: location_id + agg: count_distinct + - name: order_cost + description: The cost for each order item. Cost is calculated as a sum of the supply cost for each order item. + agg: sum + #Dimensions. Either categorical or time. These add additional context to metrics. The typical querying pattern is Metric by Dimension. + dimensions: + - name: ordered_at + type: time + type_params: + time_granularity: day + - name: order_total_dim + type: categorical + expr: order_total + - name: is_food_order + type: categorical + - name: is_drink_order + type: categorical +``` + +Putting it all together, a complete semantic model configurations based on the order model would look like the following example: + +```yaml +semantic_models: + #The name of the semantic model. + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + #The name of the dbt model and schema + model: ref('orders') + #Entities. These usually corespond to keys in the table. + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + #Measures. These are the aggregations on the columns in the table. + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum + - name: customers_with_orders + description: Distinct count of customers placing orders + agg: count_distinct + expr: customer_id + - name: locations_with_orders + description: Distinct count of locations with order + expr: location_id + agg: count_distinct + - name: order_cost + description: The cost for each order item. Cost is calculated as a sum of the supply cost for each order item. + agg: sum + #Dimensions. Either categorical or time. These add additional context to metrics. The typical querying pattern is Metric by Dimension. + dimensions: + - name: ordered_at + type: time + type_params: + time_granularity: day + - name: order_total_dim + type: categorical + expr: order_total + - name: is_food_order + type: categorical + - name: is_drink_order + type: categorical +``` + +:::tip +If you're familiar with writing SQL, you can think of dimensions as the columns you would group by and measures as the columns you would aggregate. 
+ +```sql +select + metric_time_day, -- time + country, -- categorical dimension + sum(revenue_usd) -- measure +from + snowflake.fact_transactions -- sql table +group by metric_time_day, country -- dimensions + ``` + +::: diff --git a/website/snippets/_sl-define-metrics.md b/website/snippets/_sl-define-metrics.md new file mode 100644 index 00000000000..29af3f5b7c3 --- /dev/null +++ b/website/snippets/_sl-define-metrics.md @@ -0,0 +1,21 @@ +Now that you've created your first semantic model, it's time to define your first metric! You can define metrics with the dbt Cloud IDE or CLI. + +MetricFlow supports different metric types like [simple](/docs/build/simple), [ratio](/docs/build/ratio), [cumulative](/docs/build/cumulative), and [derived](/docs/build/derived). It's recommended that you read the [metrics overview docs](/docs/build/metrics-overview) before getting started. + +1. You can define metrics in the same YAML files as your semantic models or create a new file. If you want to create your metrics in a new file, create another directory called `/models/metrics`. The file structure for metrics can become more complex from here if you need to further organize your metrics, for example, by data source or business line. + +2. The example metric we'll create is a simple metric that refers directly to the `order_total` measure, which will be implemented as a `sum()` function in SQL. Again, if you're working in the Jaffle shop sandbox, we recommend deleting the original `orders.yml` file, or removing the .yml extension so it's ignored during parsing. We'll be rebuilding the `order_total` metric from scratch. If you're working in your own project, create a simple metric like the one below using one of the measures you created in the previous step. + +```yaml +metrics: + - name: order_total + description: Sum of total order amount. Includes tax + revenue. + type: simple + label: Order Total + type_params: + measure: order_total +``` + +3. Save your code, and in the next section, you'll validate your configs before committing them to your repository. + +To continue building out your metrics based on your organization's needs, refer to the [Build your metrics](/docs/build/build-metrics-intro) page for detailed info on how to define different metric types and semantic models. diff --git a/website/snippets/_sl-install-metricflow.md b/website/snippets/_sl-install-metricflow.md new file mode 100644 index 00000000000..73e60d34e85 --- /dev/null +++ b/website/snippets/_sl-install-metricflow.md @@ -0,0 +1,8 @@ +Install the [MetricFlow CLI](/docs/build/metricflow-cli) as an extension of a dbt adapter from PyPI. The MetricFlow CLI is compatible with Python versions 3.8, 3.9, 3.10, and 3.11. + +Use pip to install the MetricFlow CLI together with your [dbt adapter](/docs/supported-data-platforms): + +- Create or activate your virtual environment. `python -m venv venv` or `source your-venv/bin/activate` +- Run `pip install "dbt-metricflow[your_adapter_name]"` + - You must specify `[your_adapter_name]`. + - For example, run `pip install "dbt-metricflow[snowflake]"` if you use a Snowflake adapter. diff --git a/website/snippets/_sl-partner-links.md b/website/snippets/_sl-partner-links.md new file mode 100644 index 00000000000..e9cc6af3564 --- /dev/null +++ b/website/snippets/_sl-partner-links.md @@ -0,0 +1,11 @@ + +The dbt Semantic Layer integrations are capable of querying dbt metrics, importing definitions, surfacing the underlying data in partner tools, and more.
The following tools integrate with the dbt Semantic Layer: + +1. **Mode** — To learn more about integrating with Mode, check out their [documentation](https://mode.com/help/articles/supported-databases/#dbt-semantic-layer). +2. **Hex** — To learn more about integrating with Hex, check out their [documentation](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration#dbt-semantic-layer-integration). Additionally, refer to [dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. +3. **Google Sheets** — Google Sheets integration coming soon. +4. **Tools that allow you to write SQL** — They must meet one of the two criteria: + * Supports a generic JDBC driver option (such as DataGrip) or + * Supports Dremio and uses ArrowFlightSQL driver version 12.0.0 or higher. + +Before you connect to these tools, you'll first need to [set up the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) and [generate a service token](/docs/dbt-cloud-apis/service-tokens) to create a Semantic Layer Only and Metadata Only service token. diff --git a/website/snippets/_sl-plan-info.md b/website/snippets/_sl-plan-info.md new file mode 100644 index 00000000000..5fba18de6bb --- /dev/null +++ b/website/snippets/_sl-plan-info.md @@ -0,0 +1,2 @@ +To define and query metrics with the {props.product}, you must be on a {props.plan} multi-tenant plan, {props.instance} (Additional region support coming soon).

      The re-released dbt Semantic Layer is available on dbt v1.6 or higher. dbt Core users can use the MetricFlow CLI to define metrics in their local project, but won't be able to dynamically query them with integrated tools.


      + diff --git a/website/snippets/_sl-test-and-query-metrics.md b/website/snippets/_sl-test-and-query-metrics.md new file mode 100644 index 00000000000..b250fac4f31 --- /dev/null +++ b/website/snippets/_sl-test-and-query-metrics.md @@ -0,0 +1,31 @@ +:::important Testing and querying metrics in the dbt Cloud IDE not yet supported + +Support for testing or querying metrics in the dbt Cloud IDE is not available in the current beta but is coming soon. + +You can use the **Preview** or **Compile** buttons in the IDE to run semantic validations and make sure your metrics are defined. You can [dynamically query metrics](#connect-and-query-api) with integrated tools on a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) plan using the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview). + +Currently, you can define and test metrics using the MetricFlow CLI. dbt Cloud IDE support is coming soon. Alternatively, you can test using SQL client tools like DataGrip, DBeaver, or RazorSQL. + +::: + +This section will explain how you can test and query metrics using the MetricFlow CLI (dbt Cloud IDE support coming soon). + +Before you begin, you'll need to install the [MetricFlow CLI](/docs/build/metricflow-cli) package and make sure you run at least one model. +### Install MetricFlow + +import InstallMetricFlow from '/snippets/_sl-install-metricflow.md'; + + + +### Query and commit your metrics using the CLI + +MetricFlow needs a `semantic_manifest.json` in order to build a semantic graph. To generate a semantic_manifest.json artifact run `dbt parse`. This will create the file in your `/target` directory. If you're working from the Jaffle shop example, run `dbt seed && dbt run` before preceding to ensure the data exists in your warehouse. + +1. Make sure you have the MetricFlow CLI installed and up to date. +2. Run `mf --help` to confirm you have MetricFlow installed and view the available commands. +3. Run `mf query --metrics --group-by ` to query the metrics and dimensions. For example, `mf query --metrics order_total --group-by metric_time` +4. Verify that the metric values are what you expect. To further understand how the metric is being generated, you can view the generated SQL if you type `--explain` in the CLI. +5. Run `mf validate-configs` to run validation on your semantic models and metrics. +6. Commit and merge the code changes that contain the metric definitions. + +To streamline your metric querying process, you can connect to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) to access your metrics programmatically. For SQL syntax, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. diff --git a/website/snippets/_v2-sl-prerequisites.md b/website/snippets/_v2-sl-prerequisites.md new file mode 100644 index 00000000000..9fdc3b53143 --- /dev/null +++ b/website/snippets/_v2-sl-prerequisites.md @@ -0,0 +1,41 @@ + + + +To use the Semantic Layer, you must: + +- Have a dbt Cloud Team or Enterprise [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment, hosted in North America. +- Have both your production and development environments running dbt version 1.6 or higher. Refer to [upgrade in dbt Cloud](/docs/dbt-versions/upgrade-core-in-cloud) for more info. +- Use Snowflake, BigQuery, Databricks, or Redshift (dbt Cloud Postgres support coming soon). +- Create a successful run in the environment where you configure the Semantic Layer. 
+ - **Note:** Semantic Layer currently supports the Deployment environment for querying. (_development querying experience coming soon_) +- Set up the [Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) in the integrated tool to import metric definitions. + - **Note:** To access the API and query metrics in downstream tools, you must have a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. dbt Core or Developer accounts can define metrics with the [MetricFlow CLI](/docs/build/metricflow-cli) or [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) but won't be able to dynamically query them.
+- Understand the key concepts of [MetricFlow](/docs/build/about-metricflow), which powers the revamped dbt Semantic Layer. + +
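Pulling the CLI steps from `_sl-test-and-query-metrics.md` above into one place, a minimal command sequence looks roughly like this; it assumes the Jaffle Shop example project and the `order_total` metric from the snippet's own example.

```bash
# Build the example data and generate target/semantic_manifest.json
dbt seed && dbt run
dbt parse

# Confirm the MetricFlow CLI is installed and list the available commands
mf --help

# Query a metric grouped by a dimension, then inspect the SQL it generates
mf query --metrics order_total --group-by metric_time
mf query --metrics order_total --group-by metric_time --explain

# Validate semantic models and metrics before committing the definitions
mf validate-configs
```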
      + + + + +- Have a multi-tenant dbt Cloud instance, hosted in North America
      +- Have both your production and development environments running dbt version 1.3 or higher
      +- Use Snowflake data platform
      +- Install the dbt metrics package version >=1.3.0, <1.4.0 in your dbt project
      + * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. +- Set up the Discovery API in the integrated tool to import metric definitions + * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      + +
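The v1.3 prerequisites above ask you to install the dbt metrics package in `packages.yml`. A minimal sketch follows; the snippet itself doesn't name the package slug, so `dbt-labs/metrics` is an assumption based on the public package listing.

```yaml
# packages.yml: pin the dbt metrics package to the range required for dbt v1.3
packages:
  - package: dbt-labs/metrics          # assumed package slug
    version: [">=1.3.0", "<1.4.0"]
```

After updating `packages.yml`, run `dbt deps` to install the package and then run at least one model, as the note above requires.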
      + + + +- Have a multi-tenant dbt Cloud instance, hosted in North America
      +- Have both your production and development environments running dbt version 1.2
      +- Use Snowflake data platform
      +- Install the dbt metrics package version >=0.3.0, <0.4.0 in your dbt project
      + * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. +- Set up the Discovery API in the integrated tool to import metric definitions + * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      + +
diff --git a/website/snippets/_version-callout.md b/website/snippets/_version-callout.md new file mode 100644 index 00000000000..45c183c2c0d --- /dev/null +++ b/website/snippets/_version-callout.md @@ -0,0 +1,8 @@ + +:::info Model versions, dbt_project.yml versions, and .yml versions + +Take note that [model versions](/docs/collaborate/govern/model-versions) are different from [dbt_project.yml versions](/reference/project-configs/version#dbt_projectyml-versions) and [.yml property file versions](/reference/project-configs/version#yml-property-file-versions). + +Model versions are a _feature_ that enables better governance and data model management by allowing you to track changes and updates to models over time. dbt_project.yml versions refer to the compatibility of the dbt project with a specific version of dbt. Version numbers within .yml property files inform how dbt parses those YAML files. The latter two are completely optional starting from dbt v1.5. + +::: diff --git a/website/snippets/auth0-uri.md b/website/snippets/auth0-uri.md index c6e0f48d0df..1187902f2e4 100644 --- a/website/snippets/auth0-uri.md +++ b/website/snippets/auth0-uri.md @@ -3,8 +3,9 @@ The URI used for SSO connections on multi-tenant dbt Cloud instances will vary b | Region | dbt Cloud Access URL | Auth0 SSO URI | Auth0 Entity ID * | |--------|-----------------------|-------------------------------|----------------------------------------| -| US | cloud.getdbt.com | https://auth.cloud.getdbt.com/ | us-production-mt | -| EMEA | emea.dbt.com | https://auth.emea.dbt.com/ | emea-production-mt | -| APAC | au.dbt.com | https://auth.au.dbt.com/ | au-production-mt | +| US multi-tenant | cloud.getdbt.com | auth.cloud.getdbt.com | us-production-mt | +| US cell 1 | {account prefix}.us1.dbt.com | auth.cloud.getdbt.com | us-production-mt | +| EMEA | emea.dbt.com | auth.emea.dbt.com | emea-production-mt | +| APAC | au.dbt.com | auth.au.dbt.com | au-production-mt | -*Only applicable to SAML and Okta configurations. \ No newline at end of file +*Only applicable to SAML and Okta configurations. diff --git a/website/snippets/available-enterprise-tier-only.md b/website/snippets/available-enterprise-tier-only.md index 0d75b72287e..184b8da6c34 100644 --- a/website/snippets/available-enterprise-tier-only.md +++ b/website/snippets/available-enterprise-tier-only.md @@ -2,6 +2,6 @@ Connecting an Azure DevOps cloud account is available for organizations using the dbt Cloud Enterprise tier. -Azure DevOps on-premise instances are not supported in dbt Cloud. +dbt Cloud's native Azure DevOps integration does not support Azure DevOps Server (on-premise). Instead, you can [import a project by git URL](/docs/cloud/git/import-a-project-by-git-url) to connect to an Azure DevOps Server.
::: diff --git a/website/snippets/cloud-feature-parity.md b/website/snippets/cloud-feature-parity.md index bcaa2ef3784..7bc6c91e9ba 100644 --- a/website/snippets/cloud-feature-parity.md +++ b/website/snippets/cloud-feature-parity.md @@ -5,10 +5,10 @@ The following table outlines which dbt Cloud features are supported on the diffe | Scheduler | ✅ | ✅ | ✅ | | Cloud IDE | ✅ | ✅ | ✅ | | Audit logs | ✅ | ✅ | ✅ | -| Discovery API | ✅ | ✅ (select customers) | ❌ | -| Webhooks (Outbound) | ✅ | ❌ | ❌ | -| Continuous Integration, including Slim CI | ✅ | ✅ | ✅ | +| Discovery API | ✅ | ✅ | ❌ | +| Webhooks (Outbound) | ✅ | ✅ | ❌ | +| Continuous Integration, including CI jobs | ✅ | ✅ | ✅ | | Semantic Layer | ✅ (North America Only) | ❌ | ❌ | | IP Restrictions | ✅ | ✅ | ✅ | -| PrivateLink egress | ✅ | ✅ | ✅ | +| PrivateLink egress | ✅ (AWS only)| ✅ | ✅ | | PrivateLink ingress | ❌ | ✅ | ✅ | diff --git a/website/snippets/core-versions-table.md b/website/snippets/core-versions-table.md index 6997353545b..431e1f08b4c 100644 --- a/website/snippets/core-versions-table.md +++ b/website/snippets/core-versions-table.md @@ -1,14 +1,16 @@ ### Latest Releases | dbt Core | Initial Release | Support Level | Critical Support Until | -|------------------------------------------------------------|-----------------|---------------|-------------------------| -| [**v1.5**](/guides/migration/versions/upgrading-to-v1.5) | Apr 27, 2023 | Active | Apr 27, 2024 | -| [**v1.4**](/guides/migration/versions/upgrading-to-v1.4) | Jan 25, 2023 | Critical | Jan 25, 2024 | -| [**v1.3**](/guides/migration/versions/upgrading-to-v1.3) | Oct 12, 2022 | Critical | Oct 12, 2023 | -| [**v1.2**](/guides/migration/versions/upgrading-to-v1.2) | Jul 26, 2022 | Critical | Jul 26, 2023 | -| [**v1.1**](/guides/migration/versions/upgrading-to-v1.1) ⚠️ | Apr 28, 2022 | End of Life* ⚠️ | Apr 28, 2023 | -| [**v1.0**](/guides/migration/versions/upgrading-to-v1.0) ⚠️ | Dec 3, 2021 | End of Life* ⚠️ | Dec 3, 2022 ⚠️ | -| **v0.X** ⛔️ | (Various dates) | Deprecated ⛔️ | Deprecated ⛔️ | +|------------------------------------------------------------|-----------------|----------------|-------------------------| +| [**v1.7**](/guides/migration/versions/upgrading-to-v1.7) (beta)| Oct 26, 2023 | - | - | +| [**v1.6**](/guides/migration/versions/upgrading-to-v1.6) | Jul 31, 2023 | Active | Jul 30, 2024 | +| [**v1.5**](/guides/migration/versions/upgrading-to-v1.5) | Apr 27, 2023 | Critical | Apr 27, 2024 | +| [**v1.4**](/guides/migration/versions/upgrading-to-v1.4) | Jan 25, 2023 | Critical | Jan 25, 2024 | +| [**v1.3**](/guides/migration/versions/upgrading-to-v1.3) | Oct 12, 2022 | Critical | Oct 12, 2023 | +| [**v1.2**](/guides/migration/versions/upgrading-to-v1.2) | Jul 26, 2022 | End of Life* ⚠️ | Jul 26, 2023 | +| [**v1.1**](/guides/migration/versions/upgrading-to-v1.1) ⚠️ | Apr 28, 2022 | Deprecated ⛔️ | Deprecated ⛔️ | +| [**v1.0**](/guides/migration/versions/upgrading-to-v1.0) ⚠️ | Dec 3, 2021 | Deprecated ⛔️ | Deprecated ⛔️ | +| **v0.X** ⛔️ | (Various dates) | Deprecated ⛔️ | Deprecated ⛔️ | _*All versions of dbt Core since v1.0 are available in dbt Cloud until further notice. Versions that are EOL do not receive any fixes. 
For the best support, we recommend upgrading to a version released within the past 12 months._ ### Planned future releases @@ -16,7 +18,6 @@ _Future release dates are tentative and subject to change._ | dbt Core | Planned Release | Critical & dbt Cloud Support Until | |----------|-----------------|-------------------------------------| -| **v1.6** | _July 2023_ | _July 2024_ | | **v1.7** | _Oct 2023_ | _Oct 2024_ | | **v1.8** | _Jan 2024_ | _Jan 2025_ | | **v1.9** | _Apr 2024_ | _Apr 2025_ | diff --git a/website/snippets/quickstarts/change-way-model-materialized.md b/website/snippets/quickstarts/change-way-model-materialized.md index 08d1b5ca0d8..d8468bcf3bf 100644 --- a/website/snippets/quickstarts/change-way-model-materialized.md +++ b/website/snippets/quickstarts/change-way-model-materialized.md @@ -1,6 +1,6 @@ One of the most powerful features of dbt is that you can change the way a model is materialized in your warehouse, simply by changing a configuration value. You can change things between tables and views by changing a keyword rather than writing the data definition language (DDL) to do this behind the scenes. -By default, everything gets created as a view. You can override that by materializing everything in jaffle_shop as a table. Everything in the example project will still be materialized as a view. +By default, everything gets created as a view. You can override that at the directory level so that everything in a given directory uses a different materialization. 1. Edit your `dbt_project.yml` file. - Update your project `name` to: @@ -11,7 +11,7 @@ By default, everything gets created as a view. You can override that by material ``` - - Update your `models` config block to: + - Configure `jaffle_shop` so everything in it will be materialized as a table, and configure `example` so everything in it will be materialized as a view. Update your `models` config block to: diff --git a/website/snippets/sl-prerequisites.md b/website/snippets/sl-prerequisites.md index 09ede745431..0c100c299b0 100644 --- a/website/snippets/sl-prerequisites.md +++ b/website/snippets/sl-prerequisites.md @@ -7,7 +7,7 @@ * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. - Set up the Discovery API in the integrated tool to import metric definitions * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
      +- Recommended - Review the dbt metrics page
      @@ -20,7 +20,7 @@ * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. - Set up the Discovery API in the integrated tool to import metric definitions * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
      +- Recommended - Review the dbt metrics page
      @@ -33,6 +33,6 @@ * **Note** — After installing the dbt metrics package and updating the `packages.yml` file, make sure you run at least one model. - Set up the Discovery API in the integrated tool to import metric definitions * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API
      -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
      +- Recommended - Review the dbt metrics page
      diff --git a/website/snippets/slack-notifications-config-steps.md b/website/snippets/slack-notifications-config-steps.md index e643d4c5644..da63f7afcc9 100644 --- a/website/snippets/slack-notifications-config-steps.md +++ b/website/snippets/slack-notifications-config-steps.md @@ -9,17 +9,17 @@ If there have been changes to the user roles and you need to move ownership, ple 1. Click the gear in the top right and select **Profile**. 2. Click **Integrations** to the left. - + 3. Click **Link your Slack profile** - + 4. Allow dbt Labs to access the Slack workspace. If you are a member of multiple, you can select the appropriate workspace from the dropdown menu in the top right corner. - + ### Configure the notifications 1. Click the gear in the top right and select **Account Settings**. 2. Click **Slack Notifications** to the left and click **Edit** to the right. - + 3. You can find the Slack notification settings at the bottom of the page. ### Disabling the Slack integration diff --git a/website/snippets/tutorial-document-your-models.md b/website/snippets/tutorial-document-your-models.md index dd9e1592145..9913dbcd1d7 100644 --- a/website/snippets/tutorial-document-your-models.md +++ b/website/snippets/tutorial-document-your-models.md @@ -40,7 +40,12 @@ Adding [documentation](/docs/collaborate/documentation) to your project allows y tests: - accepted_values: values: ['placed', 'shipped', 'completed', 'return_pending', 'returned'] - + - name: customer_id + tests: + - not_null + - relationships: + to: ref('stg_customers') + field: customer_id ```
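Because the flattened diff above hides the YAML indentation, here is a sketch of how the newly added `customer_id` tests sit next to the existing `status` tests in the tutorial's properties file; the `orders` model name and `models/schema.yml` path are assumptions based on the Jaffle Shop example.

```yaml
# models/schema.yml (path and model name assumed)
version: 2

models:
  - name: orders
    columns:
      - name: status
        tests:
          - accepted_values:
              values: ['placed', 'shipped', 'completed', 'return_pending', 'returned']
      - name: customer_id
        tests:
          - not_null
          - relationships:
              to: ref('stg_customers')
              field: customer_id
```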
      diff --git a/website/src/components/author/index.js b/website/src/components/author/index.js index a8b7ad7c0ef..6b49295936d 100644 --- a/website/src/components/author/index.js +++ b/website/src/components/author/index.js @@ -4,6 +4,7 @@ import Link from '@docusaurus/Link'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; import BlogLayout from '@theme/BlogLayout'; import getAllPosts from '../../utils/get-all-posts'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; function Author(props) { const { authorData } = props @@ -38,7 +39,7 @@ function Author(props) { itemType="http://schema.org/Person">
      - {name} + {name}

      {name}

      diff --git a/website/src/components/communitySpotlightCard/index.js b/website/src/components/communitySpotlightCard/index.js index 5be1179d620..08707a93dd4 100644 --- a/website/src/components/communitySpotlightCard/index.js +++ b/website/src/components/communitySpotlightCard/index.js @@ -1,6 +1,7 @@ import React from 'react' import Link from '@docusaurus/Link'; import styles from './styles.module.css'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; const SpotlightWrapper = ({ isSpotlightMember, frontMatter, children }) => { return isSpotlightMember ? ( @@ -55,13 +56,13 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) {
      {id && isSpotlightMember ? ( {title} ) : ( {title} @@ -100,7 +101,7 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) {
      )} {description && !isSpotlightMember && ( -

      {truncateText(description)}

      +

      )} {socialLinks && isSpotlightMember && socialLinks?.length > 0 && (

      @@ -137,9 +138,27 @@ function CommunitySpotlightCard({ frontMatter, isSpotlightMember = false }) { // Truncate text function truncateText(str) { // Max length of string - const maxLength = 300 + let maxLength = 300 + + // Check if anchor link starts within first 300 characters + let hasLinks = false + if(str.substring(0, maxLength - 3).match(/(?:)/g) + if(linkText?.length && linkText[0]?.length) { + maxLength += linkText[0]?.length + } + } + + const substring = str.substring(0, maxLength - 3) + return str.length > maxLength - ? `${str.substring(0, maxLength - 3)}...` + ? `${substring}...` : str } diff --git a/website/src/components/communitySpotlightCard/styles.module.css b/website/src/components/communitySpotlightCard/styles.module.css index e28c3fe7b41..253a561ebea 100644 --- a/website/src/components/communitySpotlightCard/styles.module.css +++ b/website/src/components/communitySpotlightCard/styles.module.css @@ -77,8 +77,9 @@ div.spotlightMemberCard { .spotlightMemberCard { padding: 3rem 0; } - div.spotlightMemberCard { + :global(#spotlight-members-section) div.spotlightMemberCard { margin-bottom: 0; + padding-left: 0; } .spotlightMemberCard .spotlightMemberImgContainer { flex: 0 0 346px; diff --git a/website/src/components/communitySpotlightList/index.js b/website/src/components/communitySpotlightList/index.js index d91c257122d..b72d640b74d 100644 --- a/website/src/components/communitySpotlightList/index.js +++ b/website/src/components/communitySpotlightList/index.js @@ -8,12 +8,29 @@ import CommunitySpotlightCard from '../communitySpotlightCard' const communityTitle = 'Community spotlight' const communityDescription = "The dbt Community is where analytics engineering lives and grows, and you're a part of it! Every quarter we'll be highlighting community members in the dbt Community Spotlight. These are individuals who have gone above and beyond to contribute to the community in a variety of ways. We all see you. We appreciate you. You are awesome." +// This date determines where the 'Previously on the Spotlight" text will show. +// Any spotlight members with a 'dateCreated' field before this date +// will be under the 'Previously..' header. +const currentSpotlightDate = new Date('2023-06-01') + function CommunitySpotlightList({ spotlightData }) { const { siteConfig } = useDocusaurusContext() // Build meta title from communityTitle and docusaurus config site title const metaTitle = `${communityTitle}${siteConfig?.title ? ` | ${siteConfig.title}` : ''}` + // Split spotlight members into current and previous + let currentSpotlightMembers = [] + let previousSpotlightMembers = [] + + spotlightData?.map(member => { + if(currentSpotlightDate > new Date(member?.data?.dateCreated)) { + previousSpotlightMembers.push(member) + } else { + currentSpotlightMembers.push(member) + } + }) + return ( @@ -31,11 +48,19 @@ function CommunitySpotlightList({ spotlightData }) { />
      - {spotlightData && spotlightData.length > 0 ? ( + {currentSpotlightMembers?.length || previousSpotlightMembers?.length ? ( <> - {spotlightData.map((member, i) => ( + {currentSpotlightMembers?.map((member, i) => ( ))} + {previousSpotlightMembers?.length ? ( + <> +

      Previously on the Spotlight

      + {previousSpotlightMembers.map((member, i) => ( + + ))} + + ) : ''} ) :

      No community spotlight members are available at this time. 😕

      diff --git a/website/src/components/discourse/index.js b/website/src/components/discourse/index.js index 18e4d3e7254..97ef08a5272 100644 --- a/website/src/components/discourse/index.js +++ b/website/src/components/discourse/index.js @@ -38,10 +38,8 @@ export const DiscourseFeed = ({ setLoading(true) setIsError(false) - // Build Netlify Function endpoint - const endpoint = window?.location?.hostname?.includes('localhost') - ? 'http://localhost:8888/.netlify/functions/get-discourse-topics' - : '/.netlify/functions/get-discourse-topics' + // Build function endpoint + const endpoint = `/api/get-discourse-topics` // If 'after' prop not passed in, set relative after date let afterDate = after diff --git a/website/src/components/discourseBlogComments/index.js b/website/src/components/discourseBlogComments/index.js index 63279285f2a..7684269f92a 100644 --- a/website/src/components/discourseBlogComments/index.js +++ b/website/src/components/discourseBlogComments/index.js @@ -28,10 +28,8 @@ export const DiscourseBlogComments = ({title,slug}) => { const fetchData = async () => { try { - const endpoint = window?.location?.hostname?.includes('localhost') - ? `http://localhost:8888/.netlify/functions/get-discourse-comments?title=${title}&slug=${slug}` - : `/.netlify/functions/get-discourse-comments?title=${title}&slug=${slug}` - + const endpoint = `/api/get-discourse-comments?title=${title}&slug=${slug}` + const { data } = await axios.get(endpoint) // Set error state if data not available diff --git a/website/src/components/lightbox/index.js b/website/src/components/lightbox/index.js index b4c2da3c905..1c748bbb04f 100644 --- a/website/src/components/lightbox/index.js +++ b/website/src/components/lightbox/index.js @@ -1,5 +1,6 @@ import React from 'react'; import styles from './styles.module.css'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; function Lightbox({ src, @@ -35,7 +36,7 @@ function Lightbox({ data-toggle="lightbox" alt={alt ? alt : title ? title : ''} title={title ? title : ''} - src={src} + src={imageCacheWrapper(src)} /> diff --git a/website/src/components/lineage/index.js b/website/src/components/lineage/index.js index eb59178369d..6c22e2bae99 100644 --- a/website/src/components/lineage/index.js +++ b/website/src/components/lineage/index.js @@ -5,11 +5,11 @@ let Dag = null; try { /** As a private package, not every developer will have access to this repo. */ - const DagImport = require('@dbt-labs/react-dbt-dag'); - require('@dbt-labs/react-dbt-dag/dag.css'); - require('@dbt-labs/react-dbt-dag/dag.standalone.css'); + // const DagImport = require('@dbt-labs/react-dbt-dag'); + // require('@dbt-labs/react-dbt-dag/dag.css'); + // require('@dbt-labs/react-dbt-dag/dag.standalone.css'); - Dag = DagImport.Dag; + // Dag = DagImport.Dag; } catch (err) { /** * react-dbt-dag is a private repo. 
Not all developers of the diff --git a/website/src/components/quickstartTOC/index.js b/website/src/components/quickstartTOC/index.js index 49209273964..8c9b8fba910 100644 --- a/website/src/components/quickstartTOC/index.js +++ b/website/src/components/quickstartTOC/index.js @@ -26,16 +26,6 @@ function QuickstartTOC() { const steps = quickstartContainer.querySelectorAll("h2"); const snippetContainer = document.querySelectorAll(".snippet"); - // Add snippet container to its parent step - snippetContainer.forEach((snippet) => { - const parent = snippet?.parentNode; - while (snippet?.firstChild && parent.className) { - if (parent) { - parent.insertBefore(snippet.firstChild, snippet); - } - } - }); - // Create an array of objects with the id and title of each step const data = Array.from(steps).map((step, index) => ({ id: step.id, @@ -49,6 +39,16 @@ function QuickstartTOC() { // Wrap all h2 (steps), along with all of their direct siblings, in a div until the next h2 if (mounted) { + // Add snippet container to its parent step + snippetContainer.forEach((snippet) => { + const parent = snippet?.parentNode; + while (snippet?.firstChild && parent.className) { + if (parent) { + parent.insertBefore(snippet.firstChild, snippet); + } + } + }); + steps.forEach((step, index) => { const wrapper = document.createElement("div"); wrapper.classList.add(style.stepWrapper); diff --git a/website/src/css/custom.css b/website/src/css/custom.css index c8047407450..3181738406d 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -102,6 +102,14 @@ html[data-theme="dark"] { --ifm-table-cell-color: var(--color-green-blue); } +/* Linked `code` tags visibility adjustment */ +html[data-theme=dark] a code { + color: var(--ifm-link-color); +} +html[data-theme=dark] a code:hover { + color: var(--ifm-link-hover-color);; +} + /* For /dbt-cloud/api REDOC Page */ html[data-theme="dark"] .api-content h2, html[data-theme="dark"] .api-content h3, @@ -228,10 +236,6 @@ code { color: var(--ifm-color-emphasis-900); } -html[data-theme="dark"] a code { - color: var(--color-white); -} - .main-wrapper .home .col>p { font-size: 1.25rem; } diff --git a/website/src/theme/BlogPostItem/Header/Author/index.js b/website/src/theme/BlogPostItem/Header/Author/index.js index a37d9e9985a..f82428df789 100644 --- a/website/src/theme/BlogPostItem/Header/Author/index.js +++ b/website/src/theme/BlogPostItem/Header/Author/index.js @@ -1,6 +1,7 @@ import React from 'react'; import clsx from 'clsx'; import Link from '@docusaurus/Link'; +import imageCacheWrapper from '../../../../../functions/image-cache-wrapper'; function MaybeLink(props) { if (props.href || props.slug) { return ; @@ -21,7 +22,7 @@ export default function BlogPostItemHeaderAuthor({author, className}) {
      {imageURL && ( - {name} + {name} )} diff --git a/website/static/_headers b/website/static/_headers deleted file mode 100644 index f6b636c5158..00000000000 --- a/website/static/_headers +++ /dev/null @@ -1,5 +0,0 @@ -/* - Strict-Transport-Security: max-age=63072000; includeSubDomains; preload - Content-Security-Policy: object-src 'none'; frame-ancestors 'none'; - X-Content-Type-Options: nosniff - X-XSS-Protection: 1; mode=block diff --git a/website/static/assets/beta-tc.pdf b/website/static/assets/beta-tc.pdf new file mode 100644 index 00000000000..f285cf95a55 Binary files /dev/null and b/website/static/assets/beta-tc.pdf differ diff --git a/website/static/img/blog/2023-05-02-modeling-ragged-time-varying-hierarchies/hierarchy.png b/website/static/img/blog/2023-05-02-modeling-ragged-time-varying-hierarchies/hierarchy.png new file mode 100644 index 00000000000..88316f363f2 Binary files /dev/null and b/website/static/img/blog/2023-05-02-modeling-ragged-time-varying-hierarchies/hierarchy.png differ diff --git a/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image1.png b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image1.png new file mode 100644 index 00000000000..687bdef7568 Binary files /dev/null and b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image1.png differ diff --git a/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image2.png b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image2.png new file mode 100644 index 00000000000..658e4c0cfb5 Binary files /dev/null and b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image2.png differ diff --git a/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image3.png b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image3.png new file mode 100644 index 00000000000..fa4b837a82f Binary files /dev/null and b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image3.png differ diff --git a/website/static/img/blog/2023-08-01-announcing-materialized-views/materialized-incremental-twins.jpg b/website/static/img/blog/2023-08-01-announcing-materialized-views/materialized-incremental-twins.jpg new file mode 100644 index 00000000000..bd72dba3663 Binary files /dev/null and b/website/static/img/blog/2023-08-01-announcing-materialized-views/materialized-incremental-twins.jpg differ diff --git a/website/static/img/blog/2023-08-01-announcing-materialized-views/streaming-pipeline.png b/website/static/img/blog/2023-08-01-announcing-materialized-views/streaming-pipeline.png new file mode 100644 index 00000000000..70baf10473e Binary files /dev/null and b/website/static/img/blog/2023-08-01-announcing-materialized-views/streaming-pipeline.png differ diff --git a/website/static/img/blog/authors/pedro_brito.jpeg b/website/static/img/blog/authors/pedro_brito.jpeg new file mode 100644 index 00000000000..9f163a431f3 Binary files /dev/null and b/website/static/img/blog/authors/pedro_brito.jpeg differ diff --git a/website/static/img/blog/authors/sterling-paramore.png b/website/static/img/blog/authors/sterling-paramore.png new file mode 100644 index 00000000000..488bade8abd Binary files /dev/null and b/website/static/img/blog/authors/sterling-paramore.png differ diff --git a/website/static/img/community/spotlight/alan-cruickshank.jpg b/website/static/img/community/spotlight/alan-cruickshank.jpg new file mode 100644 index 00000000000..bafc53aeb9d Binary files /dev/null and b/website/static/img/community/spotlight/alan-cruickshank.jpg differ diff --git a/website/static/img/community/spotlight/fabiyi-opeyemi.jpg 
b/website/static/img/community/spotlight/fabiyi-opeyemi.jpg new file mode 100644 index 00000000000..f1ac40dfa6d Binary files /dev/null and b/website/static/img/community/spotlight/fabiyi-opeyemi.jpg differ diff --git a/website/static/img/community/spotlight/faith-lierheimer.jpg b/website/static/img/community/spotlight/faith-lierheimer.jpg new file mode 100644 index 00000000000..5ec1dc39719 Binary files /dev/null and b/website/static/img/community/spotlight/faith-lierheimer.jpg differ diff --git a/website/static/img/community/spotlight/jing-lim.jpg b/website/static/img/community/spotlight/jing-lim.jpg new file mode 100644 index 00000000000..7f7964d3bc6 Binary files /dev/null and b/website/static/img/community/spotlight/jing-lim.jpg differ diff --git a/website/static/img/community/spotlight/josh-devlin.jpg b/website/static/img/community/spotlight/josh-devlin.jpg new file mode 100644 index 00000000000..58e2b4da854 Binary files /dev/null and b/website/static/img/community/spotlight/josh-devlin.jpg differ diff --git a/website/static/img/community/spotlight/owen-prough.jpg b/website/static/img/community/spotlight/owen-prough.jpg new file mode 100644 index 00000000000..83e8c82cef5 Binary files /dev/null and b/website/static/img/community/spotlight/owen-prough.jpg differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/catalog-sidebar-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/catalog-sidebar-v1.gif new file mode 100644 index 00000000000..458aa8e874d Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/catalog-sidebar-v1.gif differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/lineage-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/lineage-v1.gif new file mode 100644 index 00000000000..2772eaa9619 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/lineage-v1.gif differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/model-resource-details-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/model-resource-details-v1.gif new file mode 100644 index 00000000000..24c8312af11 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/model-resource-details-v1.gif differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/search-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/search-v1.gif new file mode 100644 index 00000000000..1343f58171d Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/search-v1.gif differ diff --git a/website/static/img/docs/dbt-cloud/deployment/run-overview.jpg b/website/static/img/docs/dbt-cloud/deployment/run-overview.jpg new file mode 100644 index 00000000000..8ab14b8ce2b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/deployment/run-overview.jpg differ diff --git a/website/static/img/docs/dbt-cloud/disconnect-repo.gif b/website/static/img/docs/dbt-cloud/disconnect-repo.gif new file mode 100644 index 00000000000..135ae789fa8 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/disconnect-repo.gif differ diff --git a/website/static/img/docs/dbt-cloud/on-premises/disconnect-repo.gif b/website/static/img/docs/dbt-cloud/on-premises/disconnect-repo.gif new file mode 100644 index 00000000000..135ae789fa8 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/on-premises/disconnect-repo.gif differ diff --git a/website/static/img/docs/dbt-cloud/on-premises/self-signed-cert.png b/website/static/img/docs/dbt-cloud/on-premises/self-signed-cert.png deleted file mode 
100644 index 08ea839b002..00000000000 Binary files a/website/static/img/docs/dbt-cloud/on-premises/self-signed-cert.png and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/redshiftserverless.png b/website/static/img/docs/dbt-cloud/redshiftserverless.png new file mode 100644 index 00000000000..57b1dbb3cf7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftserverless.png differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/new-sl-configure.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/new-sl-configure.jpg new file mode 100644 index 00000000000..9e624693aa0 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/new-sl-configure.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-architecture.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-architecture.jpg new file mode 100644 index 00000000000..b6801e88bce Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-architecture.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-example.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-example.jpg new file mode 100644 index 00000000000..d73b6167dba Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-example.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg new file mode 100644 index 00000000000..41fe17c7654 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-job-adv-settings.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-job-adv-settings.png new file mode 100644 index 00000000000..1ef43a9588e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-job-adv-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png new file mode 100644 index 00000000000..7da23bd1dc9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-deploy-job.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-deploy-job.png new file mode 100644 index 00000000000..88b8047fef5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-deploy-job.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/deploy-job-adv-settings.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/deploy-job-adv-settings.png new file mode 100644 index 00000000000..8ed834a23bc Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/deploy-job-adv-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/extended-attributes.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/extended-attributes.jpg new file mode 100644 index 00000000000..3b5929c3141 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/extended-attributes.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.jpg new file mode 100644 index 00000000000..6eb689b0cdd Binary files /dev/null and 
b/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.jpg differ diff --git a/website/static/img/docs/release-notes/ci-job-setup.gif b/website/static/img/docs/release-notes/ci-job-setup.gif new file mode 100644 index 00000000000..56beea4ab88 Binary files /dev/null and b/website/static/img/docs/release-notes/ci-job-setup.gif differ diff --git a/website/static/img/docs/release-notes/ci-job-tag.png b/website/static/img/docs/release-notes/ci-job-tag.png new file mode 100644 index 00000000000..02f2cdc895c Binary files /dev/null and b/website/static/img/docs/release-notes/ci-job-tag.png differ diff --git a/website/static/img/docs/release-notes/dbt-cloud-versions.png b/website/static/img/docs/release-notes/dbt-cloud-versions.png new file mode 100644 index 00000000000..26c9f5fa0a2 Binary files /dev/null and b/website/static/img/docs/release-notes/dbt-cloud-versions.png differ diff --git a/website/static/img/docs/terms/data-lineage/dag_example.jpg b/website/static/img/docs/terms/data-lineage/dag_example.jpg new file mode 100644 index 00000000000..3d1e4153590 Binary files /dev/null and b/website/static/img/docs/terms/data-lineage/dag_example.jpg differ diff --git a/website/static/img/guides/best-practices/semantic-layer/orders_erd.png b/website/static/img/guides/best-practices/semantic-layer/orders_erd.png new file mode 100644 index 00000000000..56e35256d83 Binary files /dev/null and b/website/static/img/guides/best-practices/semantic-layer/orders_erd.png differ diff --git a/website/static/img/icons/athena.svg b/website/static/img/icons/athena.svg new file mode 100644 index 00000000000..c2c6a81dd64 --- /dev/null +++ b/website/static/img/icons/athena.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/teradata.svg b/website/static/img/icons/teradata.svg new file mode 100644 index 00000000000..cbbfab92d66 --- /dev/null +++ b/website/static/img/icons/teradata.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/athena.svg b/website/static/img/icons/white/athena.svg new file mode 100644 index 00000000000..c2c6a81dd64 --- /dev/null +++ b/website/static/img/icons/white/athena.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/teradata.svg b/website/static/img/icons/white/teradata.svg new file mode 100644 index 00000000000..cbbfab92d66 --- /dev/null +++ b/website/static/img/icons/white/teradata.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/vercel.json b/website/vercel.json new file mode 100644 index 00000000000..c5fb0638fba --- /dev/null +++ b/website/vercel.json @@ -0,0 +1,3996 @@ +{ + "cleanUrls": true, + "trailingSlash": false, + "redirects": [ + { + "source": "/docs/deploy/job-triggers", + "destination": "/docs/deploy/deploy-jobs", + "permanent": true + }, + { + "source": "/docs/deploy/job-settings", + "destination": "/docs/deploy/deploy-jobs", + "permanent": true + }, + { + "source": "/docs/deploy/dbt-cloud-job", + "destination": "/docs/deploy/deploy-jobs", + "permanent": true + }, + { + "source": "/faqs/environments/beta-release", + "destination": "/docs/dbt-versions/product-lifecycles", + "permanent": true + }, + { + "source": "/docs/deploy/slim-ci-jobs", + "destination": "/docs/deploy/ci-jobs", + "permanent": true + }, + { + "source": "/guides/dbt-ecosystem/sl-partner-integration-guide", + "destination": "/docs/use-dbt-semantic-layer/avail-sl-integrations", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/dbt-semantic-layer", + 
"destination": "/docs/use-dbt-semantic-layer/dbt-sl", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/set-up-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/setup-sl", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/setup-dbt-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/setup-sl", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/quickstart-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/quickstart-sl", + "permanent": true + }, + { + "source": "/docs/collaborate/environments/environments-in-dbt", + "destination": "/docs/environments-in-dbt", + "permanent": true + }, + { + "source": "/docs/collaborate/environments/dbt-cloud-environments", + "destination": "/docs/deploy/dbt-cloud-environments", + "permanent": true + }, + { + "source": "/docs/collaborate/environments/dbt-core-environments", + "destination": "/docs/core/dbt-core-environments", + "permanent": true + }, + { + "source": "/docs/cloud/manage-access/licenses-and-groups", + "destination": "/docs/cloud/manage-access/about-user-access", + "permanent": true + }, + { + "source": "/docs/deploy/cloud-ci-job", + "destination": "/docs/deploy/continuous-integration", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/bigquery", + "destination": "/quickstarts/bigquery", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/databricks", + "destination": "/quickstarts/databricks", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/redshift", + "destination": "/quickstarts/redshift", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/snowflake", + "destination": "/quickstarts/snowflake", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/starburst-galaxy", + "destination": "/quickstarts/starburst-galaxy", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-core/codespace", + "destination": "/quickstarts/codespace", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-core/manual-install", + "destination": "/quickstarts/manual-install", + "permanent": true + }, + { + "source": "/docs/deploy/project-state", + "destination": "/reference/node-selection/syntax", + "permanent": true + }, + { + "source": "/reference/global-configs", + "destination": "/reference/global-configs/about-global-configs", + "permanent": true + }, + { + "source": "/docs/quickstarts/overview", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#verified-adapters", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#community-adapters", + "destination": "/docs/community-adapters", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#adapter-installation", + "destination": "/docs/connect-adapters", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#adapter-taxonomy", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#verified-by-dbt-labs", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#maintainers", + "destination": "/docs/connect-adapters#maintainers", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#contributing-to-dbt-core-adapters", + "destination": "/docs/contribute-core-adapters", + "permanent": true + }, + { + 
"source": "/docs/supported-data-platforms#contributing-to-a-pre-existing-adapter", + "destination": "/docs/contribute-core-adapters#contribute-to-a-pre-existing-adapter", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#creating-a-new-adapter", + "destination": "/docs/contribute-core-adapters#create-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/core/connection-profiles", + "destination": "/docs/core/connect-data-platform/connection-profiles", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/bigquery-setup", + "destination": "/docs/core/connect-data-platform/bigquery-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/postgres-setup", + "destination": "/docs/core/connect-data-platform/postgres-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/redshift-setup", + "destination": "/docs/core/connect-data-platform/redshift-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/snowflake-setup", + "destination": "/docs/core/connect-data-platform/snowflake-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/mssql-setup", + "destination": "/docs/core/connect-data-platform/mssql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/trino-setup", + "destination": "/docs/core/connect-data-platform/trino-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/singlestore-setup", + "destination": "/docs/core/connect-data-platform/singlestore-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/spark-setup", + "destination": "/docs/core/connect-data-platform/spark-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/databricks-setup", + "destination": "/docs/core/connect-data-platform/databricks-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/hive-setup", + "destination": "/docs/core/connect-data-platform/hive-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/exasol-setup", + "destination": "/docs/core/connect-data-platform/exasol-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/oracle-setup", + "destination": "/docs/core/connect-data-platform/oracle-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/azuresynapse-setup", + "destination": "/docs/core/connect-data-platform/azuresynapse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/dremio-setup", + "destination": "/docs/core/connect-data-platform/dremio-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/clickhouse-setup", + "destination": "/docs/core/connect-data-platform/clickhouse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/materialize-setup", + "destination": "/docs/core/connect-data-platform/materialize-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/rockset-setup", + "destination": "/docs/core/connect-data-platform/rockset-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/firebolt-setup", + "destination": "/docs/core/connect-data-platform/firebolt-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/teradata-setup", + "destination": "/docs/core/connect-data-platform/teradata-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/athena-setup", + "destination": 
"/docs/core/connect-data-platform/athena-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/vertica-setup", + "destination": "/docs/core/connect-data-platform/vertica-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/tidb-setup", + "destination": "/docs/core/connect-data-platform/tidb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/glue-setup", + "destination": "/docs/core/connect-data-platform/glue-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/mindsdb-setup", + "destination": "/docs/core/connect-data-platform/mindsdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/greenplum-setup", + "destination": "/docs/core/connect-data-platform/greenplum-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/impala-setup", + "destination": "/docs/core/connect-data-platform/impala-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/layer-setup", + "destination": "/docs/core/connect-data-platform/layer-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/iomete-setup", + "destination": "/docs/core/connect-data-platform/iomete-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/duckdb-setup", + "destination": "/docs/core/connect-data-platform/duckdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/sqlite-setup", + "destination": "/docs/core/connect-data-platform/sqlite-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/mysql-setup", + "destination": "/docs/core/connect-data-platform/mysql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/ibmdb2-setup", + "destination": "/docs/core/connect-data-platform/ibmdb2-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/alloydb-setup", + "destination": "/docs/core/connect-data-platform/alloydb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/doris-setup", + "destination": "/docs/core/connect-data-platform/doris-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/infer-setup", + "destination": "/docs/core/connect-data-platform/infer-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/databend-setup", + "destination": "/docs/core/connect-data-platform/databend-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/fal-setup", + "destination": "/docs/core/connect-data-platform/fal-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/decodable-setup", + "destination": "/docs/core/connect-data-platform/decodable-setup", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-source", + "destination": "/docs/dbt-cloud-apis/discovery-schema-source", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-sources", + "destination": "/docs/dbt-cloud-apis/discovery-schema-sources", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-test", + "destination": "/docs/dbt-cloud-apis/discovery-schema-test", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-tests", + "destination": "/docs/dbt-cloud-apis/discovery-schema-tests", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-seed", + "destination": "/docs/dbt-cloud-apis/discovery-schema-seed", + "permanent": true + }, + { + 
"source": "/docs/dbt-cloud-apis/metadata-schema-seeds", + "destination": "/docs/dbt-cloud-apis/discovery-schema-seeds", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-snapshots", + "destination": "/docs/dbt-cloud-apis/discovery-schema-snapshots", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-model", + "destination": "/docs/dbt-cloud-apis/discovery-schema-model", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-models", + "destination": "/docs/dbt-cloud-apis/discovery-schema-models", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-modelByEnv", + "destination": "/docs/dbt-cloud-apis/discovery-schema-modelByEnv", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-metrics", + "destination": "/docs/dbt-cloud-apis/discovery-schema-metrics", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-metric", + "destination": "/docs/dbt-cloud-apis/discovery-schema-metric", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-exposures", + "destination": "/docs/dbt-cloud-apis/discovery-schema-exposures", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-exposure", + "destination": "/docs/dbt-cloud-apis/discovery-schema-exposure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-use-case-guides", + "destination": "/docs/dbt-cloud-apis/discovery-use-cases-and-examples", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-api", + "destination": "/docs/dbt-cloud-apis/discovery-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-querying", + "destination": "/docs/dbt-cloud-apis/discovery-querying", + "permanent": true + }, + { + "source": "/docs/core/connection-profiles#understanding-threads", + "destination": "/docs/running-a-dbt-project/using-threads", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/about-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/snowflake-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/redshift-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/databricks-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/ip-restrictions", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/deploy/dbt-cloud-job#create-and-schedule-jobs", + "destination": "/docs/deploy/dbt-cloud-job#create-and-schedule-jobs", + "permanent": true + }, + { + "source": "/docs/cloud/dbt-cloud-tips", + "destination": "/docs/cloud/dbt-cloud-ide/dbt-cloud-tips", + "permanent": true + }, + { + "source": "/docs/cloud/develop-in-the-cloud", + "destination": "/docs/cloud/dbt-cloud-ide/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab", + "destination": "/docs/deploy/dbt-cloud-job#model-timing", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-core/quickstart", + "destination": "/quickstarts/manual-install", + "permanent": true + }, + { + "source": "/docs/dbt-versions/release-notes/January-2022/model-timing-more", + "destination": 
"/docs/deploy/dbt-cloud-job#model-timing", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#dbt-cloud", + "destination": "/docs/deploy/dbt-cloud-job", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#airflow", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#prefect", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#run-dbt-in-production", + "destination": "/docs/deploy/deployments", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#on-prefect-2", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#on-prefect-1", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#dagster", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#automation-servers", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#cron", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/enterprise-permissions#permission-sets", + "destination": "/docs/cloud/manage-access/enterprise-permissions#permission-sets", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/about-privatelink", + "destination": "/docs/cloud/privatelink/about-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/snowflake-privatelink", + "destination": "/docs/cloud/privatelink/snowflake-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/redshift-privatelink", + "destination": "/docs/cloud/privatelink/redshift-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/databricks-privatelink", + "destination": "/docs/cloud/privatelink/databricks-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/dbt-cloud-features", + "destination": "/docs/cloud/about-cloud/dbt-cloud-features", + "permanent": true + }, + { + "source": "/docs/deploy/regions-ip-addresses", + "destination": "/docs/cloud/about-cloud/regions-ip-addresses", + "permanent": true + }, + { + "source": "/docs/deploy/architecture", + "destination": "/docs/cloud/about-cloud/architecture", + "permanent": true + }, + { + "source": "/docs/deploy/single-tenant", + "destination": "/docs/cloud/about-cloud/tenancy", + "permanent": true + }, + { + "source": "/docs/deploy/multi-tenant", + "destination": "/docs/cloud/about-cloud/tenancy", + "permanent": true + }, + { + "source": "/docs/cloud/manage-access/about-access", + "destination": "/docs/cloud/manage-access/about-user-access", + "permanent": true + }, + { + "source": "/docs/collaborate/git/connect-github", + "destination": "/docs/cloud/git/connect-github", + "permanent": true + }, + { + "source": "/docs/collaborate/git/connect-gitlab", + "destination": "/docs/cloud/git/connect-gitlab", + "permanent": true + }, + { + "source": "/docs/collaborate/git/connect-azure-devops", + "destination": "/docs/cloud/git/connect-azure-devops", + "permanent": true + }, + { + "source": "/docs/collaborate/git/setup-azure", + "destination": "/docs/cloud/git/setup-azure", + "permanent": true + }, + { + "source": "/docs/collaborate/git/authenticate-azure", + "destination": "/docs/cloud/git/authenticate-azure", + "permanent": true + 
}, + { + "source": "/docs/collaborate/git/import-a-project-by-git-url", + "destination": "/docs/cloud/git/import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/about-publishing-models", + "destination": "/docs/collaborate/govern/about-model-governance", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/model-contracts", + "destination": "/docs/collaborate/govern/model-contracts", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/model-access", + "destination": "/docs/collaborate/govern/model-access", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/model-versions", + "destination": "/docs/collaborate/govern/model-versions", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/about-access", + "destination": "/docs/cloud/manage-access/about-user-access", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/seats-and-users", + "destination": "/docs/cloud/manage-access/seats-and-users", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/self-service-permissions", + "destination": "/docs/cloud/manage-access/self-service-permissions", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/enterprise-permissions", + "destination": "/docs/cloud/manage-access/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/sso-overview", + "destination": "/docs/cloud/manage-access/sso-overview", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-saml-2.0", + "destination": "/docs/cloud/manage-access/set-up-sso-saml-2.0", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-okta", + "destination": "/docs/cloud/manage-access/set-up-sso-okta", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-google-workspace", + "destination": "/docs/cloud/manage-access/set-up-sso-google-workspace", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-azure-active-directory", + "destination": "/docs/cloud/manage-access/set-up-sso-azure-active-directory", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-snowflake-oauth", + "destination": "/docs/cloud/manage-access/set-up-snowflake-oauth", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-bigquery-oauth", + "destination": "/docs/cloud/manage-access/set-up-bigquery-oauth", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/audit-log", + "destination": "/docs/cloud/manage-access/audit-log", + "permanent": true + }, + { + "source": "/docs/get-started/develop-in-the-cloud", + "destination": "/docs/cloud/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/get-started/dbt-cloud-tips", + "destination": "/docs/cloud/dbt-cloud-tips", + "permanent": true + }, + { + "source": "/docs/get-started/installation", + "destination": "/docs/core/installation", + "permanent": true + }, + { + "source": "/docs/get-started/about-the-cli", + "destination": "/docs/core/about-the-cli", + "permanent": true + }, + { + "source": "/docs/get-started/homebrew-install", + "destination": "/docs/core/homebrew-install", + "permanent": true + }, + { + "source": "/docs/get-started/pip-install", + "destination": "/docs/core/pip-install", + "permanent": true + }, + { + "source": "/docs/get-started/docker-install", + "destination": "/docs/core/docker-install", + 
"permanent": true + }, + { + "source": "/docs/get-started/source-install", + "destination": "/docs/core/source-install", + "permanent": true + }, + { + "source": "/docs/get-started/connection-profiles", + "destination": "/docs/core/connection-profiles", + "permanent": true + }, + { + "source": "/docs/get-started/run-your-dbt-projects", + "destination": "/docs/running-a-dbt-project/run-your-dbt-projects", + "permanent": true + }, + { + "source": "/docs/get-started/learning-more/refactoring-legacy-sql", + "destination": "/guides/migration/tools/refactoring-legacy-sql", + "permanent": true + }, + { + "source": "/docs/get-started/learning-more/using-jinja", + "destination": "/guides/advanced/using-jinja", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/set-up-dbt-cloud", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-snowflake", + "destination": "/docs/quickstarts/dbt-cloud/snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-redshift", + "destination": "/docs/quickstarts/dbt-cloud/redshift", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-databricks", + "destination": "/quickstarts/databricks", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-bigquery", + "destination": "/docs/quickstarts/dbt-cloud/bigquery", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-databricks", + "destination": "/quickstarts/databricks", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-redshift", + "destination": "/docs/quickstarts/dbt-cloud/redshift", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-snowflake", + "destination": "/docs/quickstarts/dbt-cloud/snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project/schedule-a-job", + "destination": "/quickstarts/bigquery", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project", + "destination": "/docs/quickstarts/dbt-cloud/bigquery#add-tests-to-your-models", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "destination": "/quickstarts/bigquery?step=8", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/overview", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started-dbt-core", + "destination": "/docs/quickstarts/dbt-core/quickstart", + "permanent": true + }, + { + "source": "/docs/get-started/develop-in-the-cloud#set-up-environments", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/get-started/develop-in-the-cloud#developer-credentials", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/getting-started/develop-in-the-cloud#setting-up-developer-credentials", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database#connecting-to-redshift-and-postgres", + "destination": 
"/docs/get-started/connect-your-database#connecting-to-postgres-redshift-and-alloydb", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database#connecting-to-snowflake", + "destination": "/docs/get-started/connect-your-database#connecting-to-snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/connect-your-database#connecting-to-snowflake", + "destination": "/docs/cloud/connect-data-platform/connect-snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/connect-your-database#connecting-to-postgres-redshift-and-alloydb", + "destination": "/cloud/connect-data-platform/connect-redshift-postgresql-alloydb", + "permanent": true + }, + { + "source": "/docs/cloud/connect-data-platform/connect-your-database", + "destination": "/docs/cloud/connect-data-platform/about-connections", + "permanent": true + }, + { + "source": "/faqs/connecting-to-two-dbs-not-allowed", + "destination": "/faqs/warehouse/connecting-to-two-dbs-not-allowed", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/ide-beta", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/bigquery", + "destination": "/quickstarts/bigquery", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/redshift", + "destination": "/quickstarts/redshift", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/snowflake", + "destination": "/quickstarts/snowflake", + "permanent": true + }, + { + "source": "/quickstarts/starburst-galaxy", + "destination": "/quickstarts/starburst-galaxy", + "permanent": true + }, + { + "source": "/quickstarts/codespace", + "destination": "/quickstarts/codespace", + "permanent": true + }, + { + "source": "/quickstarts/manual-install", + "destination": "/quickstarts/manual-install", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab", + "destination": "/docs/get-started/dbt-cloud-features#model-timing-dashboard", + "permanent": true + }, + { + "source": "/docs/dbt-cloud", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "destination": "/docs/dbt-versions/upgrade-core-in-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-overview", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/artifacts", + "destination": "/docs/deploy/artifacts", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/python-models", + "destination": "/docs/build/python-models", + "permanent": true + }, + { + "source": "/docs/deploy/regions", + "destination": "/docs/deploy/regions-ip-addresses", + "permanent": true + }, + { + "source": "/advanced/adapter-development/1-what-are-adapters", + "destination": "/guides/dbt-ecosystem/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/3-building-a-new-adapter", + 
"destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/4-testing-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/5-documenting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/6-promoting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/7-verifying-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/1-what-are-adapters", + "destination": "/guides/dbt-ecosystem/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/4-testing-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/5-documenting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/6-promoting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/7-verifying-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/legacy/debugging-errors", + "destination": "/guides/best-practices/debugging-errors", + "permanent": true + }, + { + "source": "/guides/legacy/writing-custom-generic-tests", + "destination": "/guides/best-practices/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/guides/legacy/creating-new-materializations", + "destination": "/guides/advanced/creating-new-materializations", + "permanent": true + }, + { + "source": "/guides/getting-started", + "destination": "/docs/get-started/getting-started/overview", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/create-a-project", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project/build-your-first-models", + "destination": 
"/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project/schedule-a-job", + "destination": "/docs/get-started/getting-started/building-your-first-project/schedule-a-job", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project/test-and-document-your-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project", + "permanent": true + }, + { + "source": "/guides/getting-started/create-a-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models301", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-bigquery", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-bigquery", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-databricks", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-databricks", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-redshift", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-redshift", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-snowflake", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-snowflake", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-started", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more", + "destination": "/docs/get-started/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more/getting-started-dbt-core", + "destination": "/docs/get-started/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more/refactoring-legacy-sql", + "destination": "/docs/get-started/learning-more/refactoring-legacy-sql", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more/using-jinja", + "destination": "/docs/get-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-quickstart", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/cloud-quickstart", + "destination": "/docs/dbt-cloud/cloud-quickstart", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database", + "destination": "/docs/cloud/connect-data-platform/about-connections", + "permanent": true + }, + { + "source": "/docs/get-started/connect-your-database", + "destination": "/docs/cloud/connect-data-platform/about-connections", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/profile", + "destination": "/docs/get-started/connection-profiles", + "permanent": true + }, + { + "source": "/guides/best-practices/materializations/guides/best-practices/materializations/1-overview", + "destination": 
"/guides/best-practices/materializations/1-guide-overview", + "permanent": true + }, + { + "source": "/docs/deploy/understanding-state", + "destination": "/docs/deploy/about-state", + "permanent": true + }, + { + "source": "/guides/legacy/understanding-state", + "destination": "/docs/deploy/about-state", + "permanent": true + }, + { + "source": "/guides/migration/versions/Older%20versions/understanding-state", + "destination": "/docs/deploy/about-state", + "permanent": true + }, + { + "source": "/docs/collaborate/git/resolve-merge-conflicts", + "destination": "/docs/collaborate/git/merge-conflicts", + "permanent": true + }, + { + "source": "/docs/collaborate/environments", + "destination": "/docs/collaborate/environments/environments-in-dbt", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/running-dbt-in-production", + "destination": "/docs/deploy/deployments", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications", + "destination": "/docs/deploy/job-notifications", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud", + "destination": "/docs/develop/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/january-2020-pricing-updates", + "destination": "https://www.getdbt.com/pricing/", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise", + "destination": "https://www.getdbt.com/pricing/", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/archival", + "destination": "/docs/build/snapshots", + "permanent": true + }, + { + "source": "/docs/about/license", + "destination": "/community/resources/contributor-license-agreements", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository", + "destination": "/docs/collaborate/git/managed-repository", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/release-notes", + "destination": "/docs/dbt-versions/dbt-cloud-release-notes", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/audit-log", + "destination": "/docs/collaborate/manage-access/audit-log", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth", + "destination": "/docs/collaborate/manage-access/set-up-bigquery-oauth", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-snowflake-oauth", + "destination": "/docs/collaborate/manage-access/set-up-snowflake-oauth", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-okta", + "destination": "/docs/collaborate/manage-access/set-up-sso-okta", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-sso-with-azure-active-directory", + "destination": "/docs/collaborate/manage-access/set-up-sso-azure-active-directory", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-google-gsuite", + "destination": "/docs/collaborate/manage-access/set-up-sso-google-workspace", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-saml-2.0", + "destination": "/docs/collaborate/manage-access/set-up-sso-saml-2.0", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/sso-overview", + "destination": "/docs/collaborate/manage-access/sso-overview", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud/access-control/enterprise-permissions", + "destination": "/docs/collaborate/manage-access/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/self-service-permissions", + "destination": "/docs/collaborate/manage-access/self-service-permissions", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/cloud-seats-and-users", + "destination": "/docs/collaborate/manage-access/seats-and-users", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/access-control-overview", + "destination": "/docs/collaborate/manage-access/about-access", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-generating-documentation", + "destination": "/docs/collaborate/build-and-view-your-docs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/documentation", + "destination": "/docs/collaborate/documentation", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/managing-environments", + "destination": "/docs/collaborate/environments/environments-in-dbt", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-import-a-project-by-git-url", + "destination": "/docs/collaborate/git/import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/authenticate-azure", + "destination": "/docs/collaborate/git/authenticate-azure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/setup-azure", + "destination": "/docs/collaborate/git/setup-azure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-azure-devops", + "destination": "/docs/collaborate/git/connect-azure-devops", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-gitlab", + "destination": "/docs/collaborate/git/connect-gitlab", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-installing-the-github-application", + "destination": "/docs/collaborate/git/connect-github", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/setting-up", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/handling-merge-conflicts", + "destination": "/docs/collaborate/git/resolve-merge-conflicts", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", + "destination": "/docs/collaborate/cloud-build-and-view-your-docs", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-configuring-repositories", + "destination": "/docs/collaborate/git/pr-template", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration", + "destination": "/docs/deploy/cloud-ci-job", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-dashboard-status-tiles", + "destination": "/docs/deploy/dashboard-status-tiles", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness", + "destination": "/docs/deploy/source-freshness", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-notifications", + "destination": "/docs/deploy/job-notifications", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-using-a-custom-cron-schedule", + "destination": "/docs/deploy/job-triggers", 
+ "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/airgapped-deployment", + "destination": "/docs/deploy/airgapped-deployment", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/single-tenant-deployment", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/multi-tenant-deployment", + "destination": "/docs/deploy/multi-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/deployment-architecture", + "destination": "/docs/deploy/architecture", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/deployment-overview", + "destination": "/docs/deploy/deployments", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-setting-a-custom-target-name", + "destination": "/docs/build/custom-target-names", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-aliases", + "destination": "/docs/build/custom-aliases", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-databases", + "destination": "/docs/build/custom-databases", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-schemas", + "destination": "/docs/build/custom-schemas", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-exposures", + "destination": "/docs/dbt-cloud-apis/metadata-schema-exposures", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-exposure", + "destination": "/docs/dbt-cloud-apis/metadata-schema-exposure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-tests", + "destination": "/docs/dbt-cloud-apis/metadata-schema-tests", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-test", + "destination": "/docs/dbt-cloud-apis/metadata-schema-test", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-snapshots", + "destination": "/docs/dbt-cloud-apis/metadata-schema-snapshots", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-seeds", + "destination": "/docs/dbt-cloud-apis/metadata-schema-seeds", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-seed", + "destination": "/docs/dbt-cloud-apis/metadata-schema-seed", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-sources", + "destination": "/docs/dbt-cloud-apis/metadata-schema-sources", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-source", + "destination": "/docs/dbt-cloud-apis/metadata-schema-source", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-metrics", + "destination": "/docs/dbt-cloud-apis/metadata-schema-metrics", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-metric", + "destination": "/docs/dbt-cloud-apis/metadata-schema-metric", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-modelByEnv", + "destination": "/docs/dbt-cloud-apis/metadata-schema-modelByEnv", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-models", + "destination": "/docs/dbt-cloud-apis/metadata-schema-models", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-model", + "destination": "/docs/dbt-cloud-apis/metadata-schema-model", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/metadata-querying", + "destination": "/docs/dbt-cloud-apis/metadata-querying", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/metadata-overview", + "destination": "/docs/dbt-cloud-apis/metadata-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/admin-cloud-api", + "destination": "/docs/dbt-cloud-apis/admin-cloud-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/service-tokens", + "destination": "/docs/dbt-cloud-apis/service-tokens", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/user-tokens", + "destination": "/docs/dbt-cloud-apis/user-tokens", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/cloud-apis", + "destination": "/docs/dbt-cloud-apis/overview", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/hooks-operations", + "destination": "/docs/build/hooks-operations", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/analyses", + "destination": "/docs/build/analyses", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/package-management", + "destination": "/docs/build/packages", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-environment-variables", + "destination": "/docs/build/environment-variables", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-variables", + "destination": "/docs/build/project-variables", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/jinja-macros", + "destination": "/docs/build/jinja-macros", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/configuring-incremental-models", + "destination": "/docs/build/incremental-models", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/materializations", + "destination": "/docs/build/materializations", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/tests", + "destination": "/docs/build/tests", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/metrics", + "destination": "/docs/build/metrics", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/exposures", + "destination": "/docs/build/exposures", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/snapshots", + "destination": "/docs/build/snapshots", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/seeds", + "destination": "/docs/build/seeds", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models", + "destination": "/docs/build/sql-models", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/using-sources", + "destination": "/docs/build/sources", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/projects", + "destination": "/docs/build/projects", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/python-models", + "destination": "/docs/build/python-models", + "permanent": true + }, + { + "source": 
"/docs/building-a-dbt-project/macros", + "destination": "/docs/guides/building-packages", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/setting-up", + "destination": "/docs/guides/building-packages", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-jinja-functions", + "destination": "/docs/guides/building-packages", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions", + "destination": "/docs/dbt-versions/upgrade-core-in-cloud", + "permanent": true + }, + { + "source": "/docs/core-versions", + "destination": "/docs/dbt-versions/core", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-dbt-cloud-support", + "destination": "/docs/dbt-support", + "permanent": true + }, + { + "source": "/docs/about/viewpoint", + "destination": "/community/resources/viewpoint", + "permanent": true + }, + { + "source": "/docs/viewpoint", + "destination": "/community/resources/viewpoint", + "permanent": true + }, + { + "source": "/dbt-cli/configure-your-profile", + "destination": "/docs/get-started/connection-profiles", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-cli", + "destination": "/docs/get-started/about-the-cli", + "permanent": true + }, + { + "source": "/dbt-cli/install/from-source", + "destination": "/docs/get-started/source-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/docker", + "destination": "/docs/get-started/docker-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/pip", + "destination": "/docs/get-started/pip-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/homebrew", + "destination": "/docs/get-started/homebrew-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/overview", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/the-dbt-ide", + "destination": "/docs/get-started/dbt-cloud-features", + "permanent": true + }, + { + "source": "/((?!useful).*components)", + "destination": "https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/adding-page-components.md", + "permanent": true + }, + { + "source": "/guides/legacy/managing-environments", + "destination": "/docs/building-a-dbt-project/managing-environments", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/dbt-api", + "destination": "/docs/introduction", + "permanent": true + }, + { + "source": "/img/docs/dbt-cloud/dbt-cloud-enterprise/icon.png", + "destination": "https://www.getdbt.com/ui/img/dbt-icon.png", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/centos", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/centos", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/install-from-source", + "destination": "/dbt-cli/install/from-source", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/macos", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/ubuntu-debian", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/windows", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation", + "destination": 
"/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/docs", + "destination": "/docs/introduction", + "permanent": true + }, + { + "source": "/docs/adapter", + "destination": "/docs/writing-code-in-dbt/jinja-context/adapter", + "permanent": true + }, + { + "source": "/docs/analyses", + "destination": "/docs/building-a-dbt-project/analyses", + "permanent": true + }, + { + "source": "/docs/api-variable", + "destination": "/docs/writing-code-in-dbt/api-variable", + "permanent": true + }, + { + "source": "/docs/archival", + "destination": "/docs/building-a-dbt-project/archival", + "permanent": true + }, + { + "source": "/docs/artifacts", + "destination": "/docs/dbt-cloud/using-dbt-cloud/artifacts", + "permanent": true + }, + { + "source": "/docs/bigquery-configs", + "destination": "/reference/resource-configs/bigquery-configs", + "permanent": true + }, + { + "source": "/reference/resource-properties/docs", + "destination": "/reference/resource-configs/docs", + "permanent": true + }, + { + "source": "/reference/resource-properties/latest-version", + "destination": "/reference/resource-properties/latest_version", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/bigquery-configs", + "destination": "/reference/resource-configs/bigquery-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/configuring-models", + "destination": "/reference/model-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/enable-and-disable-models", + "destination": "/reference/resource-configs/enabled", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/redshift-configs", + "destination": "/reference/resource-configs/redshift-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/snowflake-configs", + "destination": "/reference/resource-configs/snowflake-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/spark-configs", + "destination": "/reference/resource-configs/spark-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/tags", + "destination": "/reference/resource-configs/tags", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-sql-headers", + "destination": "/reference/resource-configs/sql_header", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects", + "destination": "/docs/building-a-dbt-project/projects", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/configuring-query-comments", + "destination": "/reference/project-configs/query-comment", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/configuring-quoting", + "destination": "/reference/project-configs/quoting", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/creating-a-project", + "destination": "/docs/building-a-dbt-project/projects#creating-a-dbt-project", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/requiring-specific-dbt-versions", + "destination": "/reference/project-configs/require-dbt-version", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/use-an-existing-project", + 
"destination": "/docs/building-a-dbt-project/projects#using-an-existing-project", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/hooks", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/documentation", + "destination": "/docs/building-a-dbt-project/testing-and-documentation/documentation", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/documentation-website", + "destination": "/docs/building-a-dbt-project/testing-and-documentation/documentation", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/schemayml-files", + "destination": "/reference/declaring-properties", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/testing", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/using-operations", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/building-models", + "destination": "/docs/building-a-dbt-project/building-models", + "permanent": true + }, + { + "source": "/docs/building-packages", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/centos", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/clean", + "destination": "/reference/commands/clean", + "permanent": true + }, + { + "source": "/docs/cloud-choosing-a-dbt-version", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "permanent": true + }, + { + "source": "/docs/cloud-configuring-dbt-cloud", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/cloud-enabling-continuous-integration-with-github", + "destination": "/docs/deploy/cloud-ci-job", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration", + "permanent": true + }, + { + "source": "/docs/cloud-generating-documentation", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-generating-documentation", + "permanent": true + }, + { + "source": "/docs/cloud-import-a-project-by-git-url", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/docs/cloud-installing-the-github-application", + "destination": "/docs/cloud/git/connect-github", + "permanent": true + }, + { + "source": "/docs/cloud-managing-permissions", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-managing-permissions", + "permanent": true + }, + { + "source": "/docs/cloud-overview", + "destination": "/docs/dbt-cloud/cloud-overview", + "permanent": true + }, + { + "source": "/docs/cloud-seats-and-users", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-seats-and-users", + 
"permanent": true + }, + { + "source": "/docs/cloud-setting-a-custom-target-name", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-setting-a-custom-target-name", + "permanent": true + }, + { + "source": "/docs/cloud-snapshotting-source-freshness", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness", + "permanent": true + }, + { + "source": "/docs/cloud-supported-dbt-versions", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "permanent": true + }, + { + "source": "/docs/cloud-using-a-custom-cron-schedule", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-using-a-custom-cron-schedule", + "permanent": true + }, + { + "source": "/docs/cloud-using-a-managed-repository", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository", + "permanent": true + }, + { + "source": "/docs/cmd-docs", + "destination": "/reference/commands/cmd-docs", + "permanent": true + }, + { + "source": "/docs/command-line-interface", + "destination": "/reference/dbt-commands", + "permanent": true + }, + { + "source": "/docs/compile", + "destination": "/reference/commands/compile", + "permanent": true + }, + { + "source": "/docs/config", + "destination": "/docs/writing-code-in-dbt/jinja-context/config", + "permanent": true + }, + { + "source": "/docs/configure-your-profile", + "destination": "/dbt-cli/configure-your-profile", + "permanent": true + }, + { + "source": "/docs/configuring-incremental-models", + "destination": "/docs/building-a-dbt-project/building-models/configuring-incremental-models", + "permanent": true + }, + { + "source": "/docs/configuring-models", + "destination": "/reference/model-configs", + "permanent": true + }, + { + "source": "/docs/configuring-query-comments", + "destination": "/docs/building-a-dbt-project/dbt-projects/configuring-query-comments", + "permanent": true + }, + { + "source": "/docs/configuring-quoting", + "destination": "/docs/building-a-dbt-project/dbt-projects/configuring-quoting", + "permanent": true + }, + { + "source": "/docs/configuring-resources-from-the-project-file", + "destination": "/docs/building-a-dbt-project/dbt-projects/configuring-resources-from-the-project-file", + "permanent": true + }, + { + "source": "/docs/connecting-your-database", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database", + "permanent": true + }, + { + "source": "/docs/contributor-license-agreements", + "destination": "/docs/contributing/contributor-license-agreements", + "permanent": true + }, + { + "source": "/docs/creating-a-project", + "destination": "/docs/building-a-dbt-project/dbt-projects/creating-a-project", + "permanent": true + }, + { + "source": "/docs/creating-new-materializations", + "destination": "/guides/legacy/creating-new-materializations", + "permanent": true + }, + { + "source": "/docs/creating-date-partitioned-tables", + "destination": "/docs/guides/database-specific-guides/creating-date-partitioned-tables", + "permanent": true + }, + { + "source": "/docs/custom-schema-tests", + "destination": "/guides/legacy/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/docs/database-specific-guides", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/dbt-api", + "destination": "/docs/running-a-dbt-project/dbt-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-enterprise", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud/cloud-configuring-repositories", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-configuring-repositories", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "destination": "/docs/dbt-versions/upgrade-core-in-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permissions", + "destination": "/docs/dbt-cloud/access-control/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/architecture", + "destination": "/dbt-cloud/on-premises/dependencies", + "permanent": true + }, + { + "source": "/docs/dbt-projects", + "destination": "/docs/building-a-dbt-project/dbt-projects", + "permanent": true + }, + { + "source": "/docs/dbt_projectyml-file", + "destination": "/docs/building-a-dbt-project/dbt-projects/dbt_projectyml-file", + "permanent": true + }, + { + "source": "/docs/debug", + "destination": "/reference/commands/debug", + "permanent": true + }, + { + "source": "/docs/debug-method", + "destination": "/docs/writing-code-in-dbt/jinja-context/debug-method", + "permanent": true + }, + { + "source": "/docs/deps", + "destination": "/reference/commands/deps", + "permanent": true + }, + { + "source": "/docs/doc", + "destination": "/docs/writing-code-in-dbt/jinja-context/doc", + "permanent": true + }, + { + "source": "/docs/documentation", + "destination": "/docs/building-a-dbt-project/documentation", + "permanent": true + }, + { + "source": "/docs/documentation-website", + "destination": "/docs/building-a-dbt-project/documentation", + "permanent": true + }, + { + "source": "/docs/dont-nest-your-curlies", + "destination": "/docs/building-a-dbt-project/dont-nest-your-curlies", + "permanent": true + }, + { + "source": "/docs/enable-and-disable-models", + "destination": "/reference/resource-configs/enabled", + "permanent": true + }, + { + "source": "/docs/enterprise-permissions", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/env_var", + "destination": "/docs/writing-code-in-dbt/jinja-context/env_var", + "permanent": true + }, + { + "source": "/docs/exceptions", + "destination": "/docs/writing-code-in-dbt/jinja-context/exceptions", + "permanent": true + }, + { + "source": "/docs/execute", + "destination": "/docs/writing-code-in-dbt/jinja-context/execute", + "permanent": true + }, + { + "source": "/docs/exit-codes", + "destination": "/reference/exit-codes", + "permanent": true + }, + { + "source": "/docs/flags", + "destination": "/docs/writing-code-in-dbt/jinja-context/flags", + "permanent": true + }, + { + "source": "/docs/fromjson", + "destination": "/docs/writing-code-in-dbt/jinja-context/fromjson", + "permanent": true + }, + { + "source": "/docs/getting-started-with-jinja", + "destination": "/docs/building-a-dbt-project/jinja-macros", + "permanent": true + }, + { + "source": "/docs/global-cli-flags", + "destination": "/reference/global-cli-flags", + "permanent": true + }, + { + "source": "/docs/graph", + "destination": "/docs/writing-code-in-dbt/jinja-context/graph", + "permanent": true + }, + { + "source": "/docs/guides/building-packages", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/guides/creating-new-materializations", + "destination": "/guides/legacy/creating-new-materializations", + "permanent": true + }, + { + "source": "/docs/guides/debugging-errors", + "destination": 
"/guides/legacy/debugging-errors", + "permanent": true + }, + { + "source": "/docs/guides/debugging-schema-names", + "destination": "/guides/legacy/debugging-schema-names", + "permanent": true + }, + { + "source": "/docs/guides/getting-help", + "destination": "/guides/legacy/getting-help", + "permanent": true + }, + { + "source": "/docs/guides/managing-environments", + "destination": "/guides/legacy/managing-environments", + "permanent": true + }, + { + "source": "/docs/guides/navigating-the-docs", + "destination": "/guides/legacy/navigating-the-docs", + "permanent": true + }, + { + "source": "/docs/guides/understanding-state", + "destination": "/guides/legacy/understanding-state", + "permanent": true + }, + { + "source": "/docs/guides/videos", + "destination": "/guides/legacy/videos", + "permanent": true + }, + { + "source": "/docs/guides/writing-custom-generic-tests", + "destination": "/guides/legacy/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/docs/guides/writing-custom-schema-tests", + "destination": "/guides/legacy/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/docs/guides/best-practices#choose-your-materializations-wisely", + "destination": "/guides/legacy/best-practices#choose-your-materializations-wisely", + "permanent": true + }, + { + "source": "/docs/guides/best-practices#version-control-your-dbt-project", + "destination": "/guides/legacy/best-practices#version-control-your-dbt-project", + "permanent": true + }, + { + "source": "/docs/best-practices", + "destination": "/guides/legacy/best-practices", + "permanent": true + }, + { + "source": "/docs/guides/best-practices", + "destination": "/guides/best-practices", + "permanent": true + }, + { + "source": "/docs/hooks", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/init", + "destination": "/reference/commands/init", + "permanent": true + }, + { + "source": "/docs/install-from-source", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/installation", + "destination": "/docs/core/installation", + "permanent": true + }, + { + "source": "/docs/invocation_id", + "destination": "/docs/writing-code-in-dbt/jinja-context/invocation_id", + "permanent": true + }, + { + "source": "/docs/jinja-context", + "destination": "/docs/writing-code-in-dbt/jinja-context", + "permanent": true + }, + { + "source": "/docs/license", + "destination": "/docs/about/license", + "permanent": true + }, + { + "source": "/docs/list", + "destination": "/reference/commands/list", + "permanent": true + }, + { + "source": "/docs/log", + "destination": "/docs/writing-code-in-dbt/jinja-context/log", + "permanent": true + }, + { + "source": "/docs/macos", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/macros", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/maintaining-multiple-environments-with-dbt", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/managing-environments", + "destination": "/guides/legacy/managing-environments", + "permanent": true + }, + { + "source": "/docs/materializations", + "destination": "/docs/building-a-dbt-project/building-models/materializations", + "permanent": true + }, + { + "source": "/docs/model-selection-syntax", + "destination": "/reference/node-selection/syntax", + "permanent": true + }, + { + "source": "/docs/modules", + "destination": 
"/docs/writing-code-in-dbt/jinja-context/modules", + "permanent": true + }, + { + "source": "/docs/on-run-end-context", + "destination": "/docs/writing-code-in-dbt/jinja-context/on-run-end-context", + "permanent": true + }, + { + "source": "/docs/overview", + "destination": "/docs/introduction", + "permanent": true + }, + { + "source": "/docs/performance-optimization", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/package-management", + "destination": "/docs/building-a-dbt-project/package-management", + "permanent": true + }, + { + "source": "/docs/profile-bigquery", + "destination": "/reference/warehouse-profiles/bigquery-profile", + "permanent": true + }, + { + "source": "/docs/profile-mssql", + "destination": "/reference/warehouse-profiles/mssql-profile", + "permanent": true + }, + { + "source": "/docs/profile-postgres", + "destination": "/reference/warehouse-profiles/postgres-profile", + "permanent": true + }, + { + "source": "/docs/profile-presto", + "destination": "/reference/warehouse-profiles/presto-profile", + "permanent": true + }, + { + "source": "/docs/profile-redshift", + "destination": "/reference/warehouse-profiles/redshift-profile", + "permanent": true + }, + { + "source": "/docs/profile-snowflake", + "destination": "/reference/warehouse-profiles/snowflake-profile", + "permanent": true + }, + { + "source": "/docs/profile-spark", + "destination": "/reference/warehouse-profiles/spark-profile", + "permanent": true + }, + { + "source": "/docs/redshift-configs", + "destination": "/reference/resource-configs/redshift-configs", + "permanent": true + }, + { + "source": "/docs/spark-configs", + "destination": "/reference/resource-configs/spark-configs", + "permanent": true + }, + { + "source": "/docs/redshift-v2", + "destination": "/reference/warehouse-profiles/redshift-profile", + "permanent": true + }, + { + "source": "/docs/ref", + "destination": "/docs/writing-code-in-dbt/jinja-context/ref", + "permanent": true + }, + { + "source": "/docs/requiring-specific-dbt-versions", + "destination": "/docs/building-a-dbt-project/dbt-projects/requiring-specific-dbt-versions", + "permanent": true + }, + { + "source": "/docs/requiring-dbt-versions", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/return", + "destination": "/docs/writing-code-in-dbt/jinja-context/return", + "permanent": true + }, + { + "source": "/docs/rpc", + "destination": "/reference/commands/rpc", + "permanent": true + }, + { + "source": "/docs/run", + "destination": "/reference/commands/run", + "permanent": true + }, + { + "source": "/docs/run-operation", + "destination": "/reference/commands/run-operation", + "permanent": true + }, + { + "source": "/docs/run_query", + "destination": "/docs/writing-code-in-dbt/jinja-context/run_query", + "permanent": true + }, + { + "source": "/docs/run_started_at", + "destination": "/docs/writing-code-in-dbt/jinja-context/run_started_at", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface", + "destination": "/reference/dbt-commands", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/clean", + "destination": "/reference/commands/clean", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/cmd-docs", + "destination": "/reference/commands/cmd-docs", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/compile", + "destination": "/reference/commands/compile", + "permanent": true + }, 
+ { + "source": "/docs/running-a-dbt-project/command-line-interface/debug", + "destination": "/reference/commands/debug", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/deps", + "destination": "/reference/commands/deps", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/exit-codes", + "destination": "/reference/exit-codes", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/global-cli-flags", + "destination": "/reference/global-cli-flags", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/init", + "destination": "/reference/commands/init", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/list", + "destination": "/reference/commands/list", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/model-selection-syntax", + "destination": "/reference/model-selection-syntax", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/rpc", + "destination": "/reference/commands/rpc", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/run", + "destination": "/reference/commands/run", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/run-operation", + "destination": "/reference/commands/run-operation", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/seed", + "destination": "/reference/commands/seed", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/snapshot", + "destination": "/reference/commands/snapshot", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/source", + "destination": "/reference/commands/source", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/test", + "destination": "/reference/commands/test", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/version", + "destination": "/reference/global-cli-flags#version", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface", + "destination": "/docs/running-a-dbt-project/using-the-cli", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/centos", + "destination": "/dbt-cli/installation-guides/centos", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/configure-your-profile", + "destination": "/dbt-cli/configure-your-profile", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/install-from-source", + "destination": "/dbt-cli/installation-guides/install-from-source", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/installation", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/macos", + "destination": "/dbt-cli/installation-guides/macos", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/ubuntu-debian", + "destination": "/dbt-cli/installation-guides/ubuntu-debian", + "permanent": true + }, + { + "source": 
"/docs/running-a-dbt-project/using-the-command-line-interface/windows", + "destination": "/dbt-cli/installation-guides/windows", + "permanent": true + }, + { + "source": "/docs/running-dbt-in-production", + "destination": "/docs/running-a-dbt-project/running-dbt-in-production", + "permanent": true + }, + { + "source": "/docs/schema", + "destination": "/docs/writing-code-in-dbt/jinja-context/schema", + "permanent": true + }, + { + "source": "/docs/schemas", + "destination": "/docs/writing-code-in-dbt/jinja-context/schemas", + "permanent": true + }, + { + "source": "/docs/schemayml-files", + "destination": "/reference/declaring-properties", + "permanent": true + }, + { + "source": "/docs/seed", + "destination": "/reference/commands/seed", + "permanent": true + }, + { + "source": "/docs/seeds", + "destination": "/docs/building-a-dbt-project/seeds", + "permanent": true + }, + { + "source": "/docs/setting-up-enterprise-sso-with-azure-active-directory", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-sso-with-azure-active-directory", + "permanent": true + }, + { + "source": "/docs/setting-up-snowflake-sso", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-snowflake-oauth", + "permanent": true + }, + { + "source": "/docs/setting-up-sso-with-google-gsuite", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-google-gsuite", + "permanent": true + }, + { + "source": "/docs/setting-up-sso-with-okta", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-okta", + "permanent": true + }, + { + "source": "/docs/snapshot", + "destination": "/reference/commands/snapshot", + "permanent": true + }, + { + "source": "/docs/snapshots", + "destination": "/docs/building-a-dbt-project/snapshots", + "permanent": true + }, + { + "source": "/docs/snowflake-configs", + "destination": "/reference/resource-configs/snowflake-configs", + "permanent": true + }, + { + "source": "/docs/source", + "destination": "/reference/commands/source", + "permanent": true + }, + { + "source": "/docs/statement-blocks", + "destination": "/docs/writing-code-in-dbt/jinja-context/statement-blocks", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-bigquery", + "destination": "/reference/bigquery-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-mssql", + "destination": "/reference/mssql-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-postgres", + "destination": "/reference/postgres-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-presto", + "destination": "/reference/presto-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-redshift", + "destination": "/reference/redshift-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-snowflake", + "destination": "/reference/snowflake-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-spark", + "destination": "/reference/spark-profile", + "permanent": true + }, + { + "source": "/docs/tags", + "destination": "/reference/resource-configs/tags", + "permanent": true + }, + { + "source": "/docs/target", + "destination": "/docs/writing-code-in-dbt/jinja-context/target", + "permanent": true + }, + { + "source": "/docs/test", + "destination": "/reference/commands/test", + "permanent": true + }, + { + "source": "/docs/testing", + "destination": 
"/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/testing-and-documentation", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/the-dbt-ide", + "destination": "/docs/cloud/about-cloud/dbt-cloud-features", + "permanent": true + }, + { + "source": "/docs/this", + "destination": "/docs/writing-code-in-dbt/jinja-context/this", + "permanent": true + }, + { + "source": "/docs/tojson", + "destination": "/docs/writing-code-in-dbt/jinja-context/tojson", + "permanent": true + }, + { + "source": "/docs/ubuntu-debian", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/use-an-existing-project", + "destination": "/docs/building-a-dbt-project/dbt-projects/use-an-existing-project", + "permanent": true + }, + { + "source": "/docs/using-custom-aliases", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-aliases", + "permanent": true + }, + { + "source": "/docs/using-custom-database", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-databases", + "permanent": true + }, + { + "source": "/docs/using-custom-schemas", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-schemas", + "permanent": true + }, + { + "source": "/docs/using-dbt-cloud", + "destination": "/docs/dbt-cloud/using-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/using-jinja", + "destination": "/guides/getting-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/docs/using-operations", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/using-sources", + "destination": "/docs/building-a-dbt-project/using-sources", + "permanent": true + }, + { + "source": "/docs/using-sql-headers", + "destination": "/reference/resource-configs/sql_header", + "permanent": true + }, + { + "source": "/docs/using-the-command-line-interface", + "destination": "/docs/running-a-dbt-project/using-the-cli", + "permanent": true + }, + { + "source": "/docs/using-the-dbt-ide", + "destination": "/docs/running-a-dbt-project/using-the-dbt-ide", + "permanent": true + }, + { + "source": "/docs/using-variables", + "destination": "/docs/building-a-dbt-project/building-models/using-variables", + "permanent": true + }, + { + "source": "/docs/var", + "destination": "/docs/writing-code-in-dbt/jinja-context/var", + "permanent": true + }, + { + "source": "/docs/version", + "destination": "/reference/global-cli-flags#version", + "permanent": true + }, + { + "source": "/docs/videos", + "destination": "/guides/legacy/videos", + "permanent": true + }, + { + "source": "/docs/warehouse-specific-configurations", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/windows", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/api-variable", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/class-reference", + "destination": "/reference/dbt-classes", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/extending-dbts-programming-environment/creating-new-materializations", + "destination": "/guides/legacy/creating-new-materializations", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/extending-dbts-programming-environment/custom-schema-tests", + "destination": "/guides/legacy/writing-custom-schema-tests", + "permanent": true + }, + { + "source": 
"/docs/writing-code-in-dbt/getting-started-with-jinja", + "destination": "/docs/building-a-dbt-project/jinja-macros", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/adapter", + "destination": "/reference/dbt-jinja-functions/adapter", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/as_text", + "destination": "/reference/dbt-jinja-functions/as_text", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/builtins", + "destination": "/reference/dbt-jinja-functions/builtins", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/config", + "destination": "/reference/dbt-jinja-functions/config", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/dbt-project-yml-context", + "destination": "/reference/dbt-jinja-functions/dbt-project-yml-context", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/dbt_version", + "destination": "/reference/dbt-jinja-functions/dbt_version", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/debug-method", + "destination": "/reference/dbt-jinja-functions/debug-method", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/doc", + "destination": "/reference/dbt-jinja-functions/doc", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/env_var", + "destination": "/reference/dbt-jinja-functions/env_var", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/exceptions", + "destination": "/reference/dbt-jinja-functions/exceptions", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/execute", + "destination": "/reference/dbt-jinja-functions/execute", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/flags", + "destination": "/reference/dbt-jinja-functions/flags", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/fromjson", + "destination": "/reference/dbt-jinja-functions/fromjson", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/fromyaml", + "destination": "/reference/dbt-jinja-functions/fromyaml", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/graph", + "destination": "/reference/dbt-jinja-functions/graph", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/invocation_id", + "destination": "/reference/dbt-jinja-functions/invocation_id", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/log", + "destination": "/reference/dbt-jinja-functions/log", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/modules", + "destination": "/reference/dbt-jinja-functions/modules", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/on-run-end-context", + "destination": "/reference/dbt-jinja-functions/on-run-end-context", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/profiles-yml-context", + "destination": "/reference/dbt-jinja-functions/profiles-yml-context", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/project_name", + "destination": "/reference/dbt-jinja-functions/project_name", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/ref", + "destination": "/reference/dbt-jinja-functions/ref", + "permanent": 
true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/return", + "destination": "/reference/dbt-jinja-functions/return", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/run_query", + "destination": "/reference/dbt-jinja-functions/run_query", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/run_started_at", + "destination": "/reference/dbt-jinja-functions/run_started_at", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/schema", + "destination": "/reference/dbt-jinja-functions/schema", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/schemas", + "destination": "/reference/dbt-jinja-functions/schemas", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/source", + "destination": "/reference/dbt-jinja-functions/source", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/statement-blocks", + "destination": "/reference/dbt-jinja-functions/statement-blocks", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/target", + "destination": "/reference/dbt-jinja-functions/target", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/this", + "destination": "/reference/dbt-jinja-functions/this", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/tojson", + "destination": "/reference/dbt-jinja-functions/tojson", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/toyaml", + "destination": "/reference/dbt-jinja-functions/toyaml", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/var", + "destination": "/reference/dbt-jinja-functions/var", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/macros", + "destination": "/docs/building-a-dbt-project/jinja-macros", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/using-jinja", + "destination": "/guides/getting-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/faqs/getting-help", + "destination": "/guides/legacy/getting-help", + "permanent": true + }, + { + "source": "/migration-guide/upgrading-to-0-17-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/migration-guide/upgrading-to-0-18-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/reference/accounts", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/api", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/bigquery-profile", + "destination": "/reference/warehouse-profile/bigquery-profile", + "permanent": true + }, + { + "source": "/reference/connections", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/data-test-configs", + "destination": "/reference/test-configs", + "permanent": true + }, + { + "source": "/reference/declaring-properties", + "destination": "/reference/configs-and-properties", + "permanent": true + }, + { + "source": "/reference/dbt-artifacts", + "destination": "/reference/artifacts/dbt-artifacts", + "permanent": true + }, + { + "source": "/reference/environments", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/events", + "destination": 
"/reference/events-logging", + "permanent": true + }, + { + "source": "/reference/jobs", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/model-selection-syntax", + "destination": "/reference/node-selection/syntax", + "permanent": true + }, + { + "source": "/reference/project-configs/on-run-end", + "destination": "/reference/project-configs/on-run-start-on-run-end", + "permanent": true + }, + { + "source": "/reference/project-configs/on-run-start", + "destination": "/reference/project-configs/on-run-start-on-run-end", + "permanent": true + }, + { + "source": "/reference/repositories", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/resource-configs/post-hook", + "destination": "/reference/resource-configs/pre-hook-post-hook", + "permanent": true + }, + { + "source": "/reference/resource-configs/pre-hook", + "destination": "/reference/resource-configs/pre-hook-post-hook", + "permanent": true + }, + { + "source": "/reference/resource-properties/tags", + "destination": "/reference/resource-configs/tags", + "permanent": true + }, + { + "source": "/reference/resource-properties/meta", + "destination": "/reference/resource-configs/meta", + "permanent": true + }, + { + "source": "/reference/runs", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/using-the-dbt-cloud-api", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/model-selection-syntax/#test-selection-examples", + "destination": "/reference/node-selection/test-selection-examples", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-database", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-databases", + "permanent": true + }, + { + "source": "/dbt-cloud/api", + "destination": "/dbt-cloud/api-v2", + "permanent": true + }, + { + "source": "/dbt-cloud/api-v2-old", + "destination": "/dbt-cloud/api-v2-legacy", + "permanent": true + }, + { + "source": "/dbt-cloud/api-v4", + "destination": "/docs/dbt-cloud-apis/admin-cloud-api", + "permanent": true + }, + { + "source": "/reference/project-configs/source-paths", + "destination": "/reference/project-configs/model-paths", + "permanent": true + }, + { + "source": "/reference/project-configs/data-paths", + "destination": "/reference/project-configs/seed-paths", + "permanent": true + }, + { + "source": "/reference/project-configs/modules-paths", + "destination": "/reference/project-configs/packages-install-path", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-notifications", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/presto-profile", + "destination": "/reference/profiles.yml", + "permanent": true + }, + { + "source": "/setting-up", + "destination": "/guides/getting-started/getting-set-up/setting-up-bigquery", + "permanent": true + }, + { + "source": "/tutorial/setting-up", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/tutorial/test-and-document-your-project", + "destination": "/guides/getting-started/building-your-first-project/test-and-document-your-project", + "permanent": true + }, + { + "source": "/tutorial/build-your-first-models", + "destination": "/guides/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/tutorial/deploy-your-project", + 
"destination": "/guides/getting-started/building-your-first-project/schedule-a-job", + "permanent": true + }, + { + "source": "/tutorial/using-jinja", + "destination": "/guides/getting-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/tutorial/2b-create-a-project-dbt-cli", + "destination": "/guides/getting-started/learning-more/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/tutorial/create-a-project-dbt-cli", + "destination": "/guides/getting-started/learning-more/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/tutorial/2a-create-a-project-dbt-cloud", + "destination": "/guides/getting-started", + "permanent": true + }, + { + "source": "/tutorial/create-a-project-dbt-cloud", + "destination": "/guides/getting-started", + "permanent": true + }, + { + "source": "/tutorial/getting-started", + "destination": "/guides/getting-started", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-changelog", + "destination": "/docs/dbt-cloud/release-notes", + "permanent": true + }, + { + "source": "/faqs/all", + "destination": "/docs/faqs", + "permanent": true + }, + { + "source": "/faqs/:slug", + "destination": "/docs/faqs/:slug*", + "permanent": true + }, + { + "source": "/faqs/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/tutorial/learning-more/:slug", + "destination": "/guides/getting-started/learning-more/:slug*", + "permanent": true + }, + { + "source": "/tutorial/getting-set-up/:slug", + "destination": "/guides/getting-started/getting-set-up/:slug*", + "permanent": true + }, + { + "source": "/tutorial/building-your-first-project/:slug", + "destination": "/guides/getting-started/building-your-first-project/:slug*", + "permanent": true + }, + { + "source": "/tutorial/refactoring-legacy-sql", + "destination": "/guides/migration/tools/refactoring-legacy-sql", + "permanent": true + }, + { + "source": "/blog/change-data-capture-metrics", + "destination": "/blog/change-data-capture", + "permanent": true + }, + { + "source": "/blog/intelligent-slim-ci", + "destination": "/docs/deploy/continuous-integration", + "permanent": true + }, + { + "source": "/blog/model-timing-tab", + "destination": "/blog/how-we-shaved-90-minutes-off-model", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/resource-configs/materialize-configs/indexes", + "destination": "/reference/resource-configs/materialize-configs#indexes", + "permanent": true + }, + { + "source": "/docs/build/building-models", + "destination": "/docs/build/models", + "permanent": true + }, + { + "source": "/docs/build/bigquery-profile", + "destination": "/reference/resource-configs/bigquery-configs", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/bigquery-setup", + "destination": "/reference/warehouse-setups/bigquery-setup", + "permanent": true + }, + { + "source": "/date-trunc-sql", + "destination": "/blog/date-trunc-sql", + "permanent": true + }, + { + "source": "/docs/using-hooks", + "destination": "/", + "permanent": true + }, + { + "source": "/blog/how-we-structure-our-dbt-projects", + "destination": "/guides/best-practices/how-we-structure/1-guide-overview", + "permanent": true + }, + { + "source": "/data-testing-why-you-need-it-and-how-to-get-started", + "destination": "https://www.getdbt.com/blog/data-quality-testing/", + "permanent": true + }, + { + "source": "/docs/profile", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { 
+ "source": "/docs/available-adapters", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-databases", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-14-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-15-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-16-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-17-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-18-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-19-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-from-0-10-to-0-11", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-014", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/upgrading-to-014", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/upgrading-to-0-14-1", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/upgrading-to-0-16-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-20-0", + "destination": "/guides/migration/versions/upgrading-to-v0.20", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-21-0", + "destination": "/guides/migration/versions/upgrading-to-v0.21", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-1-0-0", + "destination": "/guides/migration/versions/upgrading-to-v1.0", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-v1.0", + "destination": "/guides/migration/versions/upgrading-to-v1.0", + "permanent": true + }, + { + "source": "/docs/guides/getting-help", + "destination": "/guides/legacy/getting-help", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/:slug", + "destination": "/guides/migration/versions/:slug*", + "permanent": true + }, + { + "source": "/docs/guides/:slug", + "destination": "/guides/legacy/:slug*", + "permanent": true + }, + { + "source": "/guides/best-practices/environment-setup/1-env-guide-overview", + "destination": "/guides/orchestration/set-up-ci/overview", + "permanent": true + }, + { + "source": "/guides/best-practices/environment-setup/2-one-deployment-environment", + "destination": "/guides/orchestration/set-up-ci/in-15-minutes", + "permanent": true + }, + { + "source": "/guides/best-practices/environment-setup/3-many-deployment-environments", + "destination": "/guides/orchestration/set-up-ci/multiple-environments", + "permanent": true + }, + { + "source": "/docs/contributing/what-are-adapters", + "destination": "/guides/advanced/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/1-what-are-adapters", + "destination": "/guides/advanced/adapter-development/1-what-are-adapters", + 
"permanent": true + }, + { + "source": "/docs/contributing/prerequisites-for-a-new-adapter", + "destination": "/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/2-prerequisites-for-a-new-adapter", + "destination": "/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/building-a-new-adapter", + "destination": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/3-building-a-new-adapter", + "destination": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/v0.13/docs/building-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/building-a-new-adapter", + "destination": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/testing-a-new-adapter", + "destination": "/guides/advanced/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/4-testing-a-new-adapter", + "destination": "/guides/advanced/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/documenting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/5-documenting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/promoting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/6-promoting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/verifying-a-new-adapter", + "destination": "/guides/advanced/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/7-verifying-a-new-adapter", + "destination": "/guides/advanced/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-metrics-layer", + "destination": "/docs/use-dbt-semantic-layer/dbt-semantic-layer", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/impala-profile", + "destination": "/reference/warehouse-setups/impala-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/exasol-profile", + "destination": "/reference/warehouse-setups/exasol-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/layer-profile", + "destination": "/reference/warehouse-setups/layer-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/postgres-profile", + "destination": "/reference/warehouse-setups/postgres-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/greenplum-profile", + "destination": "/reference/warehouse-setups/greenplum-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/alloydb-profile", + "destination": 
"/reference/warehouse-setups/alloydb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/azuresynapse-profile", + "destination": "/reference/warehouse-setups/azuresynapse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/snowflake-profile", + "destination": "/reference/warehouse-setups/snowflake-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/rockset-profile", + "destination": "/reference/warehouse-setups/rockset-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/trino-profile", + "destination": "/reference/warehouse-setups/trino-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/glue-profile", + "destination": "/reference/warehouse-setups/glue-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/duckdb-profile", + "destination": "/reference/warehouse-setups/duckdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/vertica-profile", + "destination": "/reference/warehouse-setups/vertica-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/clickhouse-profile", + "destination": "/reference/warehouse-setups/clickhouse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/athena-profile", + "destination": "/reference/warehouse-setups/athena-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/iomete-profile", + "destination": "/reference/warehouse-setups/iomete-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/mssql-profile", + "destination": "/reference/warehouse-setups/mssql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/tidb-profile", + "destination": "/reference/warehouse-setups/tidb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/materialize-profile", + "destination": "/reference/warehouse-setups/materialize-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/redshift-profile", + "destination": "/reference/warehouse-setups/redshift-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/databricks-profile", + "destination": "/reference/warehouse-setups/databricks-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/bigquery-profile", + "destination": "/reference/warehouse-setups/bigquery-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/dremio-profile", + "destination": "/reference/warehouse-setups/dremio-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/oracle-profile", + "destination": "/reference/warehouse-setups/oracle-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/teradata-profile", + "destination": "/reference/warehouse-setups/teradata-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/singlestore-profile", + "destination": "/reference/warehouse-setups/singlestore-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/sqlite-profile", + "destination": "/reference/warehouse-setups/sqlite-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/spark-profile", + "destination": "/reference/warehouse-setups/spark-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/mindsdb-profile", + "destination": "/reference/warehouse-setups/mindsdb-setup", + 
"permanent": true + }, + { + "source": "/reference/warehouse-profiles/ibmdb2-profile", + "destination": "/reference/warehouse-setups/ibmdb2-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/firebolt-profile", + "destination": "/reference/warehouse-setups/firebolt-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/mysql-profile", + "destination": "/reference/warehouse-setups/mysql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/hive-profile", + "destination": "/reference/warehouse-setups/hive-setup", + "permanent": true + }, + { + "source": "/reference/using-sources", + "destination": "/docs/build/sources", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/the-dbt-ide", + "destination": "/docs/getting-started/dbt-cloud-features", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/handling-merge-conflicts", + "destination": "/docs/collaborate/git/resolve-merge-conflicts", + "permanent": true + }, + { + "source": "/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", + "destination": "/docs/getting-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/ide-beta", + "destination": "/docs/getting-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-dbt-ide", + "destination": "/docs/getting-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/dbt-cloud/cloud-ide/the-ide-git-button", + "destination": "/docs/collaborate/git/version-control-basics", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/setting-up", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/docs/contributing/long-lived-discussions-guidelines", + "destination": "/community/resources/forum-guidelines", + "permanent": true + }, + { + "source": "/docs/guides/legacy/navigating-the-docs.md", + "destination": "/community/contribute", + "permanent": true + }, + { + "source": "/community/writing-on-discourse", + "destination": "/community/contributing/contributing-online-community", + "permanent": true + }, + { + "source": "/community/contributing", + "destination": "/community/contribute", + "permanent": true + }, + { + "source": "/docs/contributing/contributor-license-agreements", + "destination": "/community/resources/contributor-license-agreements", + "permanent": true + }, + { + "source": "/community/maintaining-a-channel", + "destination": "/community/resources/maintaining-a-channel", + "permanent": true + }, + { + "source": "/docs/contributing/oss-expectations", + "destination": "/community/resources/oss-expectations", + "permanent": true + }, + { + "source": "/docs/slack-rules-of-the-road", + "destination": "/community/resources/community-rules-of-the-road", + "permanent": true + }, + { + "source": "/docs/contributing/slack-rules-of-the-road", + "destination": "/community/resources/community-rules-of-the-road", + "permanent": true + }, + { + "source": "/community/resources/slack-rules-of-the-road", + "destination": "/community/resources/community-rules-of-the-road", + "permanent": true + }, + { + "source": "/blog/getting-started-with-the-dbt-semantic-layer", + "destination": "/blog/understanding-the-components-of-the-dbt-semantic-layer", + "permanent": true + }, + { + "source": 
"/docs/getting-started/develop-in-the-cloud#creating-a-development-environment", + "destination": "/docs/get-started/develop-in-the-cloud#set-up-and-access-the-cloud-ide", + "permanent": true + }, + { + "source": "/docs/cloud-developer-ide", + "destination": "/docs/build/custom-target-names#dbt-cloud-ide", + "permanent": true + }, + { + "source": "/website/docs/docs/contributing/building-a-new-adapter.md", + "destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/legacy/getting-help", + "destination": "/community/resources/getting-help", + "permanent": true + }, + { + "source": "/blog/tags/release-notes", + "destination": "/docs/dbt-versions/dbt-cloud-release-notes", + "permanent": true + }, + { + "source": "/faqs/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/website/docs/docs/contributing/documenting-a-new-adapter.md", + "destination": "/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/docs/contributing/documenting-a-new-adapter", + "destination": "/docs/contributing/documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/v0.8/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.10/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.12/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.13/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.13/docs/requiring-dbt-versions", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.14/docs/cloud-developer-ide", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.15/docs/cloud-import-a-project-by-git-url", + "destination": "/docs/cloud/git/import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/v0.15/docs/configure-your-profile", + "destination": "/docs/core/connection-profiles", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/dependencies", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/faqs", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/index", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/installation", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/prerequisites", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/setup", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/system-requirements", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/upgrading-kots", + "destination": "/docs/deploy/single-tenant", + "permanent": true + } + ] +}