diff --git a/.github/workflows/preview-link.yml b/.github/workflows/preview-link.yml new file mode 100644 index 00000000000..f128f44b8cd --- /dev/null +++ b/.github/workflows/preview-link.yml @@ -0,0 +1,173 @@ +name: Vercel deployment preview link generator + +on: + pull_request: + types: [opened, synchronize] + paths: + - 'website/docs/docs/**' + - 'website/docs/best-practices/**' + - 'website/docs/guides/**' + - 'website/docs/faqs/**' + - 'website/docs/reference/**' + +permissions: + contents: read # checkout only needs read access + pull-requests: write # required to edit the PR description + +jobs: + update-pr-description: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install necessary tools + run: | + sudo apt-get update + sudo apt-get install -y jq curl + + - name: Generate Vercel deployment URL + id: vercel_url + env: # the branch name is attacker-controlled, so hand it to the shell as data instead of expanding it into the script + HEAD_REF: ${{ github.head_ref }} + run: | + # Get the branch name + BRANCH_NAME="$HEAD_REF" + + # Convert to lowercase + BRANCH_NAME_LOWER=$(echo "$BRANCH_NAME" | tr '[:upper:]' '[:lower:]') + + # Replace non-alphanumeric characters with hyphens + BRANCH_NAME_SANITIZED=$(echo "$BRANCH_NAME_LOWER" | sed 's/[^a-z0-9]/-/g') + + # Construct the deployment URL + DEPLOYMENT_URL="https://docs-getdbt-com-git-${BRANCH_NAME_SANITIZED}-dbt-labs.vercel.app" + + echo "deployment_url=$DEPLOYMENT_URL" >> $GITHUB_OUTPUT + + - name: Wait for deployment to be accessible + id: wait_for_deployment + run: | + DEPLOYMENT_URL="${{ steps.vercel_url.outputs.deployment_url }}" + echo "Waiting for deployment at $DEPLOYMENT_URL to become accessible..." + + MAX_ATTEMPTS=60 # Wait up to 10 minutes + SLEEP_TIME=10 # Check every 10 seconds + ATTEMPTS=0 + + while [ $ATTEMPTS -lt $MAX_ATTEMPTS ]; do + STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$DEPLOYMENT_URL") + if [ "$STATUS_CODE" -eq 200 ]; then + echo "Deployment is accessible." + break + else + echo "Deployment not yet accessible (status code: $STATUS_CODE). Waiting..."
+ sleep $SLEEP_TIME + ATTEMPTS=$((ATTEMPTS + 1)) + fi + done + + if [ $ATTEMPTS -eq $MAX_ATTEMPTS ]; then + echo "Deployment did not become accessible within the expected time." + exit 1 + fi + + - name: Get changed files + id: files + run: | + CHANGED_FILES=$(git diff --name-only ${{ github.event.pull_request.base.sha }} ${{ github.sha }} | grep -E '^website/docs/(docs|best-practices|guides|faqs|reference)/.*\.md$' || true) + if [ -z "$CHANGED_FILES" ]; then + echo "No documentation files were changed." + echo "changed_files=" >> $GITHUB_OUTPUT + else + CHANGED_FILES=$(echo "$CHANGED_FILES" | tr '\n' ' ') + echo "changed_files=$CHANGED_FILES" >> $GITHUB_OUTPUT + fi + + - name: Generate file preview links + id: links + env: # changed file paths come from the PR and are untrusted; pass them to the shell as data + CHANGED_FILES_RAW: ${{ steps.files.outputs.changed_files }} + run: | + DEPLOYMENT_URL="${{ steps.vercel_url.outputs.deployment_url }}" + CHANGED_FILES="$CHANGED_FILES_RAW" + + if [ -z "$CHANGED_FILES" ]; then + echo "No changed files found in the specified directories." + LINKS="No documentation files were changed."
+ else + LINKS="" + # Convert CHANGED_FILES back to newline-separated for processing + CHANGED_FILES=$(echo "$CHANGED_FILES" | tr ' ' '\n') + for FILE in $CHANGED_FILES; do + # Remove 'website/docs/' prefix + FILE_PATH="${FILE#website/docs/}" + # Remove the .md extension + FILE_PATH="${FILE_PATH%.md}" + + # Construct the full URL + FULL_URL="$DEPLOYMENT_URL/$FILE_PATH" + LINKS="$LINKS\n- $FULL_URL" + done + fi + + # Properly set the multi-line output (name<<DELIMITER ... DELIMITER syntax) + echo "links<<EOF" >> $GITHUB_OUTPUT + echo -e "$LINKS" >> $GITHUB_OUTPUT + echo "EOF" >> $GITHUB_OUTPUT + + - name: Update PR description with deployment links + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const prNumber = context.issue.number; + + // Fetch the current PR description + const { data: pullRequest } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber, + }); + + let body = pullRequest.body || ''; + + // Define the markers (HTML comments: invisible when rendered, and non-empty so the old block can be found and replaced) + const startMarker = '<!-- vercel-deployment-preview -->'; + const endMarker = '<!-- end-vercel-deployment-preview -->'; + + // Get the deployment URL and links from environment variables + const deploymentUrl = process.env.DEPLOYMENT_URL; + const links = process.env.LINKS; + + // Build the deployment content without leading whitespace + const deploymentContent = [ + `${startMarker}`, + '---', + '🚀 Deployment available!
Here are the direct links to the updated files:', + '', + `${links}`, + '', + `${endMarker}` + ].join('\n'); + + // Remove existing deployment content between markers + const regex = new RegExp(`${startMarker}[\\s\\S]*?${endMarker}`, 'g'); + body = body.replace(regex, '').trim(); + + // Append the new deployment content + body = `${body}\n\n${deploymentContent}`; + + // Update the PR description + await github.rest.pulls.update({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: prNumber, + body: body, + }); + env: + DEPLOYMENT_URL: ${{ steps.vercel_url.outputs.deployment_url }} + LINKS: ${{ steps.links.outputs.links }} diff --git a/.github/workflows/vale.yml b/.github/workflows/vale.yml new file mode 100644 index 00000000000..5feaaa12a20 --- /dev/null +++ b/.github/workflows/vale.yml @@ -0,0 +1,84 @@ +name: Vale linting + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - 'website/docs/**/*' + - 'website/blog/**/*' + - 'website/**/*' + +jobs: + vale: + name: Vale linting + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + with: + fetch-depth: 1 + + - name: List repository contents + run: | + pwd + ls -R + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + - name: Install Vale + run: pip install vale==2.27.0 # Install a stable version of Vale + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v34 + with: + files: | + website/**/*.md + separator: ' ' + + - name: Debugging - Print changed files + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + env: # file names are untrusted PR data; pass them through the environment rather than expanding them into the script + CHANGED_MD_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + echo "Changed files:" + echo "$CHANGED_MD_FILES" + + - name: Confirm files exist + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + env: # all_changed_files leaves out deleted files, which would otherwise fail the -f check below + CHANGED_MD_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + echo "Checking if files exist..."
+ for file in $CHANGED_MD_FILES; do + if [ -f "$file" ]; then + echo "Found: $file" + else + echo "File not found: $file" + exit 1 + fi + done + + - name: Run vale + if: ${{ steps.changed-files.outputs.any_changed == 'true' }} + uses: errata-ai/vale-action@reviewdog + with: + token: ${{ secrets.GITHUB_TOKEN }} + reporter: github-check + files: ${{ steps.changed-files.outputs.all_changed_files }} + separator: ' ' + version: '2.27.0' + +# - name: Post summary comment +# if: ${{ steps.changed-files.outputs.any_changed == 'true' }} +# run: | +# COMMENT="❗️Oh no, some Vale linting found issues! Please check the **Files change** tab for detailed results and make the necessary updates." +# COMMENT+=$'\n' +# COMMENT+=$'\n\n' +# COMMENT+="➡️ Link to detailed report: [Files changed](${{ github.event.pull_request.html_url }}/files)" +# gh pr comment ${{ github.event.pull_request.number }} --body "$COMMENT" +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.hyperlint/config.yaml b/.hyperlint/config.yaml new file mode 100644 index 00000000000..03082114ae1 --- /dev/null +++ b/.hyperlint/config.yaml @@ -0,0 +1,10 @@ +content_dir: /docs +authorized_users: + - mirnawong1 + - matthewshaver + - nghi-ly + - runleonarun + - nataliefiann + +vale: + enabled: true diff --git a/.vale.ini b/.vale.ini new file mode 100644 index 00000000000..58aff923afe --- /dev/null +++ b/.vale.ini @@ -0,0 +1,7 @@ +StylesPath = styles +MinAlertLevel = warning + +Vocab = EN + +[*.md] +BasedOnStyles = custom diff --git a/styles/Vocab/EN/accept.txt b/styles/Vocab/EN/accept.txt new file mode 100644 index 00000000000..e673e2ef83d --- /dev/null +++ b/styles/Vocab/EN/accept.txt @@ -0,0 +1,67 @@ +dbt Cloud +dbt Core +dbt Semantic Layer +dbt Explorer +dbt +dbt-tonic +dbtonic +IDE +CLI +Config +info +docs +yaml +YAML +SQL +bash +shell +MetricFlow +jinja +jinja2 +sqlmesh +Snowflake +Databricks +Fabric +Redshift +Azure +DevOps +Athena +Amazon
+UI +CSV +S3 +SCD +repo +dbt_project.yml +boolean +defaultValue= +DWH +DWUs +shoutout +ADF +BQ +gcloud +MSFT +DDL +APIs +API +SSIS +PBI +PowerBI +datetime +PySpark +:::caution +:::note +:::info +:::tip +:::warning +\<[^>]+\> +\b[A-Z]{2,}(?:/[A-Z]{2,})?\b +\w+-\w+ +\w+/\w+ +n/a +N/A +\ diff --git a/styles/custom/LatinAbbreviations.yml b/styles/custom/LatinAbbreviations.yml new file mode 100644 index 00000000000..44a3c9d6e8c --- /dev/null +++ b/styles/custom/LatinAbbreviations.yml @@ -0,0 +1,15 @@ +# LatinAbbreviations.yml +extends: substitution +message: "Consider using '%s' instead of the Latin abbreviation '%s'." # substitution rules fill in the replacement first, then the matched text +level: warning + +swap: + 'e.g.': 'for example' + 'e.g': 'for example' + 'eg': 'for example' + 'i.e.': 'that is' + 'i.e': 'that is' + 'etc.': 'and so on' + 'etc': 'and so on' + 'N.B.': 'Note' + 'NB': 'Note' diff --git a/styles/custom/Repitition.yml b/styles/custom/Repitition.yml new file mode 100644 index 00000000000..4cd620146cf --- /dev/null +++ b/styles/custom/Repitition.yml @@ -0,0 +1,6 @@ +extends: repetition +message: "'%s' is repeated!" +level: warning +alpha: true +tokens: + - '[^\s]+' diff --git a/styles/custom/SentenceCaseHeaders.yml b/styles/custom/SentenceCaseHeaders.yml new file mode 100644 index 00000000000..d1d6cd97c67 --- /dev/null +++ b/styles/custom/SentenceCaseHeaders.yml @@ -0,0 +1,34 @@ +extends: capitalization +message: "'%s' should use sentence-style capitalization. Try '%s' instead."
+level: warning +scope: heading +match: $sentence # Enforces sentence-style capitalization +indicators: + - ":" +exceptions: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - Snowflake + - Databricks + - Azure + - GCP + - AWS + - SQL + - CLI + - API + - YAML + - JSON + - HTML + - Redshift + - Google + - BigQuery + - SnowSQL + - Snowsight + - Snowpark + - Fabric + - Microsoft + - Postgres + - Explorer + - IDE diff --git a/styles/custom/Typos.yml b/styles/custom/Typos.yml new file mode 100644 index 00000000000..456517950a9 --- /dev/null +++ b/styles/custom/Typos.yml @@ -0,0 +1,39 @@ +extends: spelling + +message: "Oops there's a typo -- did you really mean '%s'? " +level: warning + +action: + name: suggest + params: + - spellings + +custom: true +filters: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - '\bdbt\s+.*?\b' + - '<[^>]+>' # Ignore all HTML-like components starting with < and ending with > + - '<[^>]+>.*<\/[^>]+>' + +--- # NOTE(review): a second YAML document inside one rule file; presumably Vale only reads the first document, so the rule below may never load - confirm + +extends: existence + +message: "Ignore specific patterns" +level: skip +tokens: + - '\bdbt\b' + - '\bdbt\s+Cloud\b' + - '\bdbt\s+Core\b' + - '\bdbt\s+Cloud\s+CLI\b' + - '\bdbt\s+.*?\b' + - '<[^>]+>' # Ignore all HTML-like components starting with < and ending with > + - '<[^>]+>.*<\/[^>]+>' + - '\w+-\w+' + - '\w+/\w+' + - '\w+/\w+|\w+-\w+|n/a' + - 'n/a' + - 'N/A' diff --git a/styles/custom/UIElements.yml b/styles/custom/UIElements.yml new file mode 100644 index 00000000000..f78a15af4b4 --- /dev/null +++ b/styles/custom/UIElements.yml @@ -0,0 +1,17 @@ +# styles/custom/UIElements.yml +extends: existence +message: "UI elements like '%s' should be bold." +level: warning +tokens: + # Match UI elements that are not bolded (i.e., not within **), but exclude those starting a sentence or following a list number + - '(? + +## What is Iceberg? + +To have this conversation, we need to start with the same foundational understanding of Iceberg.
Apache Iceberg is a high-performance open table format developed for modern data lakes. It was designed for large-scale datasets, and within the project, there are many ways to interact with it. When people talk about Iceberg, they often mean multiple components including but not limited to: + +1. Iceberg Table Format - an open-source table format for large-scale data. Tables materialized in the Iceberg table format are stored on a user’s infrastructure, such as an S3 bucket. +2. Iceberg Data Catalog - an open-source metadata management system that tracks the schema, partitions, and versions of Iceberg tables. +3. Iceberg REST Protocol (also called Iceberg REST API) - how engines can support and speak to other Iceberg-compatible catalogs. + +If you have been in the industry, you also know that everything I just wrote above about Iceberg could easily be replaced by `Hive`, `Hudi`, or `Delta`. This is because they were all designed to solve essentially the same problem. Ryan Blue (creator of Iceberg) and Michael Armbrust (creator of Delta Lake) recently sat down for this [fantastic chat](https://vimeo.com/1012543474) and made two points that resonated with me: + +- “We never intended for people to pay attention to this area. It’s something we wanted to fix, but people should be able to not pay attention and just work with their data. Storage systems should just work.” +- “We solve the same challenges with different approaches.” + +At the same time, the industry is converging on Apache Iceberg. [Iceberg has the highest availability of read and write support](https://medium.com/sundeck/2024-lakehouse-format-rundown-7edd75015428). + + + + +Snowflake launched Iceberg support in 2022. Databricks launched Iceberg support via Uniform last year. Microsoft announced Fabric support for Iceberg in September 2024 at Fabric Con. **Customers are demanding interoperability, and vendors are listening**. + +Why does this matter? Standardization of the industry benefits customers.
When the industry standardizes, customers have the gift of flexibility. Everyone has a preferred way of working, and with standardization, they can always bring their preferred tools to their organization’s data. + +## Just another implementation detail + +I’m not saying open table formats aren't important. The metadata management and performance make them very meaningful, and they deserve attention. Our users are already excited to use them to create data lakes to save on storage costs, create more abstraction from their computing, and so on. + +But when building data models or focusing on delivering business value through analytics, my primary concern is not *how* the data is stored—it's *how* I can leverage it to generate insights and drive decisions. The analytics development lifecycle is hard enough without having to take every detail into account. dbt abstracts the underlying platform and lets me focus on writing SQL and orchestrating my transformations. It’s a feature that I don’t need to think about how tables are stored or optimized—I just need to know that when I reference dim_customers or fct_sales, the correct data is there and ready to use. **It should just work.** + +## Sometimes the details do matter + +While table formats are an implementation detail for data transformation, Iceberg can impact dbt developers when the implementation details aren’t seamless. Currently, using Iceberg requires a significant amount of upfront configuration and integration work beyond just creating tables to get started. + +One of the biggest hurdles is managing Iceberg’s metadata layer. This metadata often needs to be synced with external catalogs, which requires careful setup and ongoing maintenance to prevent inconsistencies. Permissions and access controls add another layer of complexity—because multiple engines can access Iceberg tables, you have to ensure that all systems have the correct access to both the data files and the metadata catalog.
Currently, setting up integrations between these engines is also far from seamless; while some engines natively support Iceberg, others require brittle workarounds to ensure the metadata is synced correctly. This fragmented landscape means you could land with a web of interconnected components. + +## Fixing it + +**Today, we announced official support for the Iceberg table format in dbt.** By supporting the Iceberg table format, it’s one less thing you have to worry about on your journey to adopting Iceberg. + +With support for Iceberg Table Format, it is now easier to convert your dbt models using proprietary table formats to Iceberg by updating your configuration. After you have set up your external storage for Iceberg and connected it to your platforms, you will be able to jump into your dbt model and update the configuration to look something like this: + + + +It is available on these adapters: + +- Athena +- Databricks +- Snowflake +- Spark +- Starburst/Trino +- Dremio + +As with the beauty of any open-source project, Iceberg support grew organically, so the implementations vary. However, this will change in the coming months as we converge onto one dbt standard. This way, no matter which adapter you jump into, the configuration will always be the same. + +## dbt the Abstraction Layer + +dbt is more than about abstracting away the DDL to create and manage objects. It’s also about ensuring an opinionated approach to managing and optimizing your data. That remains true for our strategy around Iceberg Support. + +In our dbt-snowflake implementation, we have already started to [enforce best practices centered around how to manage the base location](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#base-location) to ensure you don’t create technical debt accidentally, ensuring your Iceberg implementation scales over time. And we aren’t done yet. + +That said, while we can create the models, there is a *lot* of initial work to get to that stage. 
dbt developers must still consider the implementation, like how their external volume has been set up or where dbt can access the metadata. We have to make this better. + +Given the friction of getting launched on Iceberg, over the coming months, we will enable more capabilities to empower users to adopt Iceberg. It should be easier to read from foreign Iceberg catalogs. It should be easier to mount your volume. It should be easier to manage refreshes. And you should also trust that permissions and governance are consistently enforced. + +And this work doesn’t stop at Iceberg. The framework we are building is also compatible with other table formats, ensuring that whatever table format works for you is supported on dbt. This way, dbt users can also stop caring about table formats. **It’s just another implementation detail.** diff --git a/website/blog/2024-10-05-snowflake-feature-store.md b/website/blog/2024-10-05-snowflake-feature-store.md new file mode 100644 index 00000000000..cf5c55be1b5 --- /dev/null +++ b/website/blog/2024-10-05-snowflake-feature-store.md @@ -0,0 +1,273 @@ +--- +title: "Snowflake feature store and dbt: A bridge between data pipelines and ML" +description: A deep-dive into the workflow steps you can take to build and deploy ML models within a single platform. +slug: snowflake-feature-store +authors: [randy_pettus, luis_leon] +tags: [snowflake ML] +hide_table_of_contents: false +date: 2024-10-08 +is_featured: true +--- + +Flying home into Detroit this past week, working on this blog post on the plane, I saw for the first time the newly connected deck of the Gordie Howe International [bridge](https://www.freep.com/story/news/local/michigan/detroit/2024/07/24/gordie-howe-bridge-deck-complete-work-moves-to-next-phase/74528258007/) spanning the Detroit River and connecting the U.S. and Canada.
The image stuck out because, in one sense, a feature store is a bridge between the clean, consistent datasets and the machine learning models that rely upon this data. But more interesting than the bridge itself is the massive process of coordination needed to build it. This construction effort — I think — can teach us more about processes and the need for feature stores in machine learning (ML). + +Think of the manufacturing materials needed as our data and the building of the bridge as the building of our ML models. There are thousands of engineers and construction workers taking materials from all over the world, pulling only the specific pieces needed for each part of the project. However, to make this project truly work at this scale, we need the warehousing and logistics to ensure that each load of concrete, rebar, and steel meets the standards for quality and safety needed and is available to the right people at the right time — as even a single fault can have catastrophic consequences or cause serious delays in project success. This warehouse and the associated logistics play the role of the feature store, ensuring that data is delivered consistently where and when it is needed to train and run ML models. + + + +## What is a feature? + +A feature is transformed or enriched data that serves as an input into a machine learning model to make predictions. In machine learning, a data scientist derives features from various data sources to build a model that makes predictions based on historical data. To capture the value from this model, the enterprise must operationalize the data pipeline, ensuring that the features being used in production at inference time match those being used in training and development. + +## What role does dbt play in getting data ready for ML models? + +dbt is the standard for data transformation in the enterprise. Organizations leverage dbt at scale to deliver clean and well-governed datasets wherever and whenever they are needed.
Using dbt to manage the data transformation processes to cleanse and prepare datasets used in feature development will ensure consistent datasets of guaranteed data quality — meaning that feature development will be consistent and reliable. + + +## Who is going to use this and what benefits will they see? + +Snowflake and dbt are already a well-established and trusted combination for delivering data excellence across the enterprise. The ability to register dbt pipelines in the Snowflake Feature Store further extends this combination for ML and AI workloads, while fitting naturally into the data engineering and feature pipelines already present in dbt. + + +Some of the key benefits are: + +- **Feature collaboration** — Data scientists, data analysts, data engineers, and machine learning engineers collaborate on features used in machine learning models in both Python and SQL, enabling teams to share and reuse features. As a result, teams can improve the time to value of models while improving the understanding of their components. This is all backed by Snowflake’s role-based access control (RBAC) and governance. +- **Feature consistency** — Teams are assured that features generated for training sets and those served for model inference are consistent. This can especially be a concern for large organizations where multiple versions of the truth might persist. Much like how dbt and Snowflake help enterprises have a single source of data truth, now they can have a single source of truth for features. +- **Feature visibility and use** — The Snowflake Feature Store provides an intuitive SDK to work with ML features and their associated metadata. 
In addition, users can browse and search for features in the Snowflake UI, providing an easy way to identify features. +- **Point-in-time correctness** — Snowflake retrieves point-in-time correct features using ASOF Joins, removing the significant complexity in generating the right feature value for a given time period, whether for training or batch prediction retrieval. +- **Integration with data pipelines** — Teams that have already built data pipelines in dbt can continue to use these with the Snowflake Feature Store. No additional migration or feature re-creation is necessary as teams plug into the same pipelines. + +## Why did we integrate/build this with Snowflake? + +How does dbt help with ML workloads today? dbt plays a pivotal role in preparing data for ML models by transforming raw data into a format suitable for feature engineering. It helps orchestrate and automate these transformations, ensuring that data is clean, consistent, and ready for ML applications. The combination of Snowflake’s powerful AI Data Cloud and dbt’s transformation prowess makes them an unbeatable pair for organizations aiming to scale their ML operations efficiently. + +## Making it easier for ML/Data Engineers to both build & deploy ML data & models + +dbt is a perfect tool to promote collaboration between data engineers, ML engineers, and data scientists. dbt is designed to support collaboration and quality of data pipelines through features including version control, environments and development life cycles, as well as built-in data and pipeline testing. Leveraging dbt means that data engineers and data scientists can collaborate and develop new models and features while maintaining the rigorous governance and high quality that's needed.
+ +Additionally, dbt Mesh makes maintaining domain ownership extremely easy by breaking up portions of our data projects and pipelines into connected projects where critical models can be published for consumption by others with strict data contracts enforcing quality and governance. This paradigm supports rapid development as each project can be kept to a maintainable size for its contributors and developers. Contracting on published models used between these projects ensures the consistency of the integration points between them. + +Finally, dbt Cloud also provides [dbt Explorer](/docs/collaborate/explore-projects) — a perfect tool to catalog and share knowledge about organizational data across disparate teams. dbt Explorer provides a central place for information on data pipelines, including lineage information, data freshness, and quality. Best of all, dbt Explorer updates every time dbt jobs run, ensuring this information is always up-to-date and relevant. + +## What tech is at play? + +Here’s what you need from dbt. dbt should be used to manage data transformation pipelines and generate the datasets needed by ML engineers and data scientists maintaining the Snowflake Feature Store. dbt Cloud Enterprise users should leverage dbt Mesh to create different projects with clear owners for these different domains of data pipelines. This Mesh design will promote easier collaboration by keeping each dbt project smaller and more manageable for the people building and maintaining it. dbt also supports both SQL and Python-based transformations making it an ideal fit for AI/ML workflows, which commonly leverage both languages. + +Using dbt for the data transformation pipelines will also ensure the quality and consistency of data products, which is critical for ensuring successful AI/ML efforts. 
+ +## Snowflake ML overview + +The Feature Store is one component of [Snowflake ML’s](https://www.snowflake.com/en/data-cloud/snowflake-ml/) integrated suite of machine learning features that powers end-to-end machine learning within a single platform. Data scientists and ML engineers leverage ready-to-use ML functions or build custom ML workflows all without any data movement or without sacrificing governance. Snowflake ML includes scalable feature engineering and model training capabilities. Meanwhile, the Feature Store and Model Registry allow teams to store and use features and models in production, providing an end-to-end suite for operating ML workloads at scale. + + +## What do you need to do to make it all work? + +dbt Cloud offers the fastest and easiest way to run dbt. It offers a Cloud-based IDE, Cloud-attached CLI, and even a low-code visual editor option (currently in beta), meaning it’s perfect for connecting users across different teams with different workflows and tooling preferences, which is very common in AI/ML workflows. This is the tool you will use to prepare and manage data for AI/ML, promote collaboration across the different teams needed for a successful AI/ML workflow, and ensure the quality and consistency of the underlying data that will be used to create features and train models. + +Organizations interested in AI/ML workflows through Snowflake should also look at the new dbt Snowflake Native App — a Snowflake Native Application that extends the functionality of dbt Cloud into Snowflake. Of particular interest is Ask dbt — a chatbot that integrates directly with Snowflake Cortex and the dbt Semantic Layer to allow natural language questions of Snowflake data. + + +## How to power ML pipelines with dbt and Snowflake’s Feature Store + +Let’s provide a brief example of what this workflow looks like in dbt and Snowflake to build and use the powerful capabilities of a Feature Store. 
For this example, consider that we have a data pipeline in dbt to process customer transaction data. Various data science teams in the organization need to derive features from these transactions to use in various models, including fraud prediction, customer segmentation, and personalization. These different use cases all benefit from having related features, such as the count of transactions or purchased amounts over different periods of time (for example, the last day, 7 days, or 30 days) for a given customer. + +Instead of the data scientists building out their own workflows to derive these features, let’s look at the flow of using dbt to manage the feature pipeline and Snowflake’s Feature Store to solve this problem. The following subsections describe the workflow step by step. + +### Create feature tables as dbt models + +The first step consists of building out a feature table as a dbt model. Data scientists and data engineers plug into existing dbt pipelines and derive a table that includes the underlying entity (for example, customer ID), a timestamp, and feature values. The feature table aggregates the needed features at the appropriate timestamp for a given entity. Note that Snowflake provides various common feature and query patterns [here](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/examples). So, in our example, we would see a given customer, timestamp, and features representing transaction counts and sums over various periods. Data scientists can use SQL or Python directly in dbt to build this table, which will push down the logic into Snowflake, allowing data scientists to use their existing skill set. + +Window aggregations play an important role in the creation of features. Because the logic for these aggregations is often complex, let’s see how Snowflake and dbt make this process easier by leveraging Don’t Repeat Yourself (DRY) principles.
We’ll create a macro that will allow us to use Snowflake’s `range between` syntax in a repeatable way: + +```sql +{% macro rolling_agg(column, partition_by, order_by, interval='30 days', agg_function='sum') %} {# Emits agg_function(column) as a window expression over the trailing interval, per partition_by, ordered by order_by. #} + {{ agg_function }}({{ column }}) over ( + partition by {{ partition_by }} + order by {{ order_by }} + range between interval '{{ interval }}' preceding and current row + ) +{% endmacro %} + +``` + +Now, we use this macro in our feature table to build out various aggregations of customer transactions over the last day, 7 days, and 30 days. Snowflake has just taken significant complexity away in generating appropriate feature values and dbt has just made the code even more readable and repeatable. While the following example is built in SQL, teams can also build these pipelines using Python directly. + +```sql + +select -- each transaction row plus rolling sum, avg and count features over 1, 7 and 30 days + tx_datetime, + customer_id, + tx_amount, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "1 days", "sum") }} + as tx_amount_1d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "7 days", "sum") }} + as tx_amount_7d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "30 days", "sum") }} + as tx_amount_30d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "1 days", "avg") }} + as tx_amount_avg_1d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "7 days", "avg") }} + as tx_amount_avg_7d, + {{ rolling_agg("TX_AMOUNT", "CUSTOMER_ID", "TX_DATETIME", "30 days", "avg") }} + as tx_amount_avg_30d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "1 days", "count") }} + as tx_cnt_1d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "7 days", "count") }} + as tx_cnt_7d, + {{ rolling_agg("*", "CUSTOMER_ID", "TX_DATETIME", "30 days", "count") }} + as tx_cnt_30d +from {{ ref("stg_transactions") }} + +``` + +### Create or connect to a Snowflake Feature Store + +Once a feature table is built in dbt, data scientists use Snowflake’s
[snowflake-ml-python](https://docs.snowflake.com/en/developer-guide/snowflake-ml/snowpark-ml) package to create or connect to an existing Feature Store in Snowflake. Data scientists can do this all in Python, including in Jupyter Notebooks or directly in Snowflake using [Snowflake Notebooks](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks). + +Let’s go ahead and create the Feature Store in Snowflake: + + +```python +from snowflake.ml.feature_store import ( + FeatureStore, + FeatureView, + Entity, + CreationMode +) + +fs = FeatureStore( + session=session, + database=fs_db, + name=fs_schema, + default_warehouse='WH_DBT', + creation_mode=CreationMode.CREATE_IF_NOT_EXIST, +) + +``` + +### Create and register feature entities + +The next step consists of creating and registering [entities](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/entities). These represent the underlying objects that features are associated with, forming the join keys used for feature lookups. In our example, the data scientist can register various entities, including for the customer, a transaction id, or other necessary attributes. + +Let’s create some example entities. + +```python +customer = Entity(name="CUSTOMER", join_keys=["CUSTOMER_ID"]) +transaction = Entity(name="TRANSACTION", join_keys=["TRANSACTION_ID"]) +fs.register_entity(customer) +fs.register_entity(transaction) + +``` + +### Register feature tables as feature views + +After registering entities, the next step is to register a [feature view](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/feature-views). This represents a group of related features that stem from the feature tables created in the dbt model. In this case, note that the feature logic, refresh, and consistency are managed by the dbt pipeline. The feature view in Snowflake enables versioning of the features while providing discoverability among teams. 
+ +```python +# Create a dataframe from our feature table produced in dbt +customers_transactions_df = session.sql(f""" + SELECT + CUSTOMER_ID, + TX_DATETIME, + TX_AMOUNT_1D, + TX_AMOUNT_7D, + TX_AMOUNT_30D, + TX_AMOUNT_AVG_1D, + TX_AMOUNT_AVG_7D, + TX_AMOUNT_AVG_30D, + TX_CNT_1D, + TX_CNT_7D, + TX_CNT_30D + FROM {fs_db}.{fs_data_schema}.ft_customer_transactions + """) + +# Create a feature view on top of these features +customer_transactions_fv = FeatureView( + name="customer_transactions_fv", + entities=[customer], + feature_df=customers_transactions_df, + timestamp_col="TX_DATETIME", + refresh_freq=None, + desc="Customer transaction features with window aggregates") + +# Register the feature view for use beyond the session +customer_transactions_fv = fs.register_feature_view( + feature_view=customer_transactions_fv, + version="1", + #overwrite=True, + block=True) + +``` + +### Search and discover features in the Snowflake UI + +Now, with features created, teams can view their features directly in the Snowflake UI, as shown below. This enables teams to easily search and browse features, all governed through Snowflake’s role-based access control (RBAC). + + + +### Generate training dataset + +Now that the feature view is created, data scientists produce a [training dataset](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/modeling#generating-tables-for-training) that uses the feature view. In our example, whether the data scientist is building a fraud or segmentation model, they will retrieve point-in-time correct features for a customer at a specific point in time using the Feature Store’s `generate_training_set` method. + +To generate the training set, we need to supply a spine dataframe, representing the entities and timestamp values that we will need to retrieve features for. The following example shows this using a few records, although teams can leverage other tables to produce this spine. 
+ +```python +spine_df = session.create_dataframe( + [ + ('1', '3937', "2019-05-01 00:00"), + ('2', '2', "2019-05-01 00:00"), + ('3', '927', "2019-05-01 00:00"), + ], + schema=["INSTANCE_ID", "CUSTOMER_ID", "EVENT_TIMESTAMP"]) + +train_dataset = fs.generate_dataset( + name= "customers_fv", + version= "1_0", + spine_df=spine_df, + features=[customer_transactions_fv], + spine_timestamp_col= "EVENT_TIMESTAMP", + spine_label_cols = [] +) + +``` + +Now that we have produced the training dataset, let’s see what it looks like. + + + +### Train and deploy a model + +Now with this training set, data scientists can use [Snowflake Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/index) and [Snowpark ML Modeling](https://docs.snowflake.com/en/developer-guide/snowflake-ml/modeling) to use familiar Python frameworks for additional preprocessing, feature engineering, and model training all within Snowflake. The model can be registered in the Snowflake [Model Registry](https://docs.snowflake.com/en/developer-guide/snowflake-ml/model-registry/overview) for secure model management. Note that we will leave the model training for you as part of this exercise. + +### Retrieve features for predictions + +For inference, data pipelines retrieve feature values using the [retrieve_feature_values](https://docs.snowflake.com/en/developer-guide/snowflake-ml/feature-store/modeling#retrieving-features-and-making-predictions) method. These retrieved values can be fed directly to a model’s predict capability in your Python session using a developed model or by invoking a model’s predict method from Snowflake’s Model Registry. For batch scoring purposes, teams can build this entire pipeline using [Snowflake ML](https://docs.snowflake.com/en/developer-guide/snowflake-ml/overview). The following code demonstrates how the features are retrieved using this method. 
+ +```python +inference_spine = session.create_dataframe( + [ + ('1', '3937', "2019-07-01 00:00"), + ('2', '2', "2019-07-01 00:00"), + ('3', '927', "2019-07-01 00:00"), + ], + schema=["INSTANCE_ID", "CUSTOMER_ID", "EVENT_TIMESTAMP"]) + +inference_dataset = fs.retrieve_feature_values( + spine_df=inference_spine, + features=[customer_transactions_fv], + spine_timestamp_col="EVENT_TIMESTAMP", +) + +inference_dataset.to_pandas() + +``` + +Here’s an example view of our features produced for model inferencing. + + + +## Conclusion + +We’ve just seen how quickly and easily you can begin to develop features through dbt and leverage the Snowflake Feature Store to deliver predictive modeling as part of your data pipelines. The ability to build and deploy ML models, including integrating feature storage, data transformation, and ML logic within a single platform, simplifies the entire ML life cycle. Combining this new power with the well-established partnership of dbt and Snowflake unlocks even more potential for organizations to safely build and explore new AI/ML use cases and drive further collaboration in the organization. + +The code used in the examples above is publicly available on [GitHub](https://github.com/sfc-gh-rpettus/dbt-feature-store). Also, you can run a full example yourself in this [quickstart guide](https://quickstarts.snowflake.com/guide/getting-started-with-feature-store-and-dbt/index.html?index=..%2F..index#0) from the Snowflake docs. 
diff --git a/website/blog/authors.yml b/website/blog/authors.yml index 85f05a545f9..271130a477d 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,7 +1,7 @@ --- amy_chen: image_url: /img/blog/authors/achen.png - job_title: Product Ecosystem Manager + job_title: Product Manager links: - icon: fa-linkedin url: https://www.linkedin.com/in/yuanamychen/ @@ -386,6 +386,14 @@ lucas_bergodias: job_title: Analytics Engineer name: Lucas Bergo Dias organization: Indicium Tech +luis_leon: + image_url: /img/blog/authors/luis-leon.png + job_title: Partner Solutions Architect + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/luis-leon-03965463/ + name: Luis Leon + organization: dbt Labs matt_winkler: description: Matt is an ex-data scientist who chose to embrace the simplicity of using SQL to manage and testing data pipelines with dbt. He previously worked as a hands-on ML practitioner, and consulted with Fortune 500 clients to build and maintain ML Ops pipelines using (mostly) AWS Sagemaker. He lives in the Denver area, and you can say hello on dbt Slack or on LinkedIn. image_url: /img/blog/authors/matt-winkler.jpeg @@ -449,6 +457,14 @@ pedro_brito_de_sa: url: https://www.linkedin.com/in/pbritosa/ name: Pedro Brito de Sa organization: Sage +randy_pettus: + image_url: /img/blog/authors/randy-pettus.png + job_title: Senior Partner Sales Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/randypettus/ + name: Randy Pettus + organization: Snowflake rastislav_zdechovan: image_url: /img/blog/authors/rastislav-zdechovan.png job_title: Analytics Engineer diff --git a/website/blog/ctas.yml b/website/blog/ctas.yml index ac56d4cc749..1f9b13afa7b 100644 --- a/website/blog/ctas.yml +++ b/website/blog/ctas.yml @@ -25,3 +25,8 @@ subheader: Coalesce is the premiere analytics engineering conference! Sign up now for innovation, collaboration, and inspiration. Don't miss out! 
button_text: Register now url: https://coalesce.getdbt.com/register +- name: coalesce_2024_catchup + header: Missed Coalesce 2024? + subheader: Catch up on Coalesce 2024 and register to access a select number of on-demand sessions. + button_text: Register and watch + url: https://coalesce.getdbt.com/register/online diff --git a/website/blog/metadata.yml b/website/blog/metadata.yml index d0009fd62c4..8b53a7a2a04 100644 --- a/website/blog/metadata.yml +++ b/website/blog/metadata.yml @@ -2,7 +2,7 @@ featured_image: "" # This CTA lives in right sidebar on blog index -featured_cta: "coalesce_2024_signup" +featured_cta: "coalesce_2024_catchup" # Show or hide hero title, description, cta from blog index show_title: true diff --git a/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md b/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md index f1fb7422acf..a884de90c49 100644 --- a/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md +++ b/website/docs/best-practices/how-we-mesh/mesh-4-implementation.md @@ -80,7 +80,7 @@ models: ## Split your projects 1. **Move your grouped models into a subfolder**. This will include any model in the selected group, it's associated YAML entry, as well as its parent or child resources as appropriate depending on where this group sits in your DAG. - 1. Note that just like in your dbt project, circular refereneces are not allowed! Project B cannot have parents and children in Project A, for example. + 1. Note that just like in your dbt project, circular references are not allowed! Project B cannot have parents and children in Project A, for example. 2. **Create a new `dbt_project.yml` file** in the subdirectory. 3. **Copy any macros** used by the resources you moved. 4. **Create a new `packages.yml` file** in your subdirectory with the packages that are used by the resources you moved. 
diff --git a/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md b/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md index c7522bf12eb..9358b507acc 100644 --- a/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md +++ b/website/docs/best-practices/how-we-structure/5-the-rest-of-the-project.md @@ -102,12 +102,14 @@ We’ve focused heavily thus far on the primary area of action in our dbt projec ### Project splitting -One important, growing consideration in the analytics engineering ecosystem is how and when to split a codebase into multiple dbt projects. Our present stance on this for most projects, particularly for teams starting out, is straightforward: you should avoid it unless you have no other option or it saves you from an even more complex workaround. If you do have the need to split up your project, it’s completely possible through the use of private packages, but the added complexity and separation is, for most organizations, a hindrance, not a help, at present. That said, this is very likely subject to change! [We want to create a world where it’s easy to bring lots of dbt projects together into a cohesive lineage](https://github.com/dbt-labs/dbt-core/discussions/5244). In a world where it’s simple to break up monolithic dbt projects into multiple connected projects, perhaps inside of a modern mono repo, the calculus will be different, and the below situations we recommend against may become totally viable. So watch this space! +One important, growing consideration in the analytics engineering ecosystem is how and when to split a codebase into multiple dbt projects. Currently, our advice for most teams, especially those just starting, is fairly simple: in most cases, we recommend doing so with [dbt Mesh](/best-practices/how-we-mesh/mesh-1-intro)! dbt Mesh allows organizations to handle complexity by connecting several dbt projects rather than relying on one big, monolithic project. 
This approach is designed to speed up development while maintaining governance. -- ❌ **Business groups or departments.** Conceptual separations within the project are not a good reason to split up your project. Splitting up, for instance, marketing and finance modeling into separate projects will not only add unnecessary complexity but destroy the unifying effect of collaborating across your organization on cohesive definitions and business logic. -- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. +As breaking up monolithic dbt projects into smaller, connected projects, potentially within a modern mono repo becomes easier, the scenarios we currently advise against may soon become feasible. So watch this space! + +- ✅ **Business groups or departments.** Conceptual separations within the project are the primary reason to split up your project. This allows your business domains to own their own data products and still collaborate using dbt Mesh. For more information about dbt Mesh, please refer to our [dbt Mesh FAQs](/best-practices/how-we-mesh/mesh-5-faqs). - ✅ **Data governance.** Structural, organizational needs — such as data governance and security — are one of the few worthwhile reasons to split up a project. If, for instance, you work at a healthcare company with only a small team cleared to access raw data with PII in it, you may need to split out your staging models into their own projects to preserve those policies. 
In that case, you would import your staging project into the project that builds on those staging models as a [private package](https://docs.getdbt.com/docs/build/packages/#private-packages). - ✅ **Project size.** At a certain point, your project may grow to have simply too many models to present a viable development experience. If you have 1000s of models, it absolutely makes sense to find a way to split up your project. +- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. ## Final considerations diff --git a/website/docs/docs/build/data-tests.md b/website/docs/docs/build/data-tests.md index ae3ac9225db..b4f25a3d111 100644 --- a/website/docs/docs/build/data-tests.md +++ b/website/docs/docs/build/data-tests.md @@ -66,9 +66,25 @@ having total_amount < 0 -The name of this test is the name of the file: `assert_total_payment_amount_is_positive`. Simple enough. +The name of this test is the name of the file: `assert_total_payment_amount_is_positive`. -Singular data tests are easy to write—so easy that you may find yourself writing the same basic structure over and over, only changing the name of a column or model. By that point, the test isn't so singular! In that case, we recommend... +To add a data test to your project, add a `.yml` file to your `tests` directory, for example, `tests/schema.yml` with the following content: + + + +```yaml +version: 2 +data_tests: + - name: assert_total_payment_amount_is_positive + description: > + Refunds have a negative amount, so the total amount should always be >= 0. 
+ Therefore return records where total amount < 0 to make the test fail. + +``` + + + +Singular data tests are so easy that you may find yourself writing the same basic structure repeatedly, only changing the name of a column or model. By that point, the test isn't so singular! In that case, we recommend generic data tests. ## Generic data tests Certain data tests are generic: they can be reused over and over again. A generic data test is defined in a `test` block, which contains a parametrized query and accepts arguments. It might look like: diff --git a/website/docs/docs/build/dimensions.md b/website/docs/docs/build/dimensions.md index 7ad52704c4f..170626ee7cc 100644 --- a/website/docs/docs/build/dimensions.md +++ b/website/docs/docs/build/dimensions.md @@ -41,7 +41,7 @@ Refer to the following example to see how dimensions are used in a semantic mode semantic_models: - name: transactions description: A record for every transaction that takes place. Carts are considered multiple transactions for each SKU. - model: {{ ref("fact_transactions") }} + model: {{ ref('fact_transactions') }} defaults: agg_time_dimension: order_date # --- entities --- @@ -122,7 +122,7 @@ dbt sl query --metrics users_created,users_deleted --group-by metric_time__year mf query --metrics users_created,users_deleted --group-by metric_time__year --order-by metric_time__year ``` -You can set `is_partition` for time to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation details (hourly, daily, weekly, and so on). +You can set `is_partition` for time to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation details (daily, weekly, and so on). @@ -161,6 +161,8 @@ measures: + + `time_granularity` specifies the grain of a time dimension. MetricFlow will transform the underlying column to the specified granularity. 
For example, if you add hourly granularity to a time dimension column, MetricFlow will run a `date_trunc` function to convert the timestamp to hourly. You can easily change the time grain at query time and aggregate it to a coarser grain, for example, from hourly to monthly. However, you can't go from a coarser grain to a finer grain (monthly to hourly). Our supported granularities are: @@ -172,6 +174,7 @@ Our supported granularities are: * hour * day * week +* month * quarter * year @@ -204,6 +207,50 @@ measures: agg: sum ``` + + + + +`time_granularity` specifies the grain of a time dimension. MetricFlow will transform the underlying column to the specified granularity. For example, if you add daily granularity to a time dimension column, MetricFlow will run a `date_trunc` function to convert the timestamp to daily. You can easily change the time grain at query time and aggregate it to a coarser grain, for example, from daily to monthly. However, you can't go from a coarser grain to a finer grain (monthly to daily). + +Our supported granularities are: +* day +* week +* month +* quarter +* year + +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the coarsest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. 
+ +```yaml +dimensions: + - name: created_at + type: time + label: "Date of creation" + expr: ts_created # ts_created is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + - name: deleted_at + type: time + label: "Date of deletion" + expr: ts_deleted # ts_deleted is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + +measures: + - name: users_deleted + expr: 1 + agg: sum + agg_time_dimension: deleted_at + - name: users_created + expr: 1 + agg: sum +``` + + + @@ -313,7 +360,7 @@ Additionally, the entity is tagged as `natural` to differentiate it from a `prim semantic_models: - name: sales_person_tiers description: SCD Type II table of tiers for salespeople - model: {{ref(sales_person_tiers)}} + model: {{ ref('sales_person_tiers') }} defaults: agg_time_dimension: tier_start @@ -355,7 +402,7 @@ semantic_models: There is a transaction, product, sales_person, and customer id for every transaction. There is only one transaction id per transaction. The `metric_time` or date is reflected in UTC. - model: {{ ref(fact_transactions) }} + model: {{ ref('fact_transactions') }} defaults: agg_time_dimension: metric_time diff --git a/website/docs/docs/build/environment-variables.md b/website/docs/docs/build/environment-variables.md index 955bb79ed22..c26425401a7 100644 --- a/website/docs/docs/build/environment-variables.md +++ b/website/docs/docs/build/environment-variables.md @@ -101,7 +101,7 @@ dbt Cloud has a number of pre-defined variables built in. Variables are set auto The following environment variable is set automatically for the dbt Cloud IDE: -- `DBT_CLOUD_GIT_BRANCH`: Provides the development Git branch name in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). +- `DBT_CLOUD_GIT_BRANCH` — Provides the development Git branch name in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). - Available in dbt v 1.6 and later. 
- The variable changes when the branch is changed. - Doesn't require restarting the IDE after a branch change. @@ -113,26 +113,26 @@ Use case — This is useful in cases where you want to dynamically use the G The following environment variables are set automatically: -- `DBT_ENV`: This key is reserved for the dbt Cloud application and will always resolve to 'prod'. For deployment runs only. -- `DBT_CLOUD_ENVIRONMENT_NAME`: The name of the dbt Cloud environment in which `dbt` is running. -- `DBT_CLOUD_ENVIRONMENT_TYPE`: The type of dbt Cloud environment in which `dbt` is running. The valid values are `development` or `deployment`. +- `DBT_ENV` — This key is reserved for the dbt Cloud application and will always resolve to 'prod'. For deployment runs only. +- `DBT_CLOUD_ENVIRONMENT_NAME` — The name of the dbt Cloud environment in which `dbt` is running. +- `DBT_CLOUD_ENVIRONMENT_TYPE` — The type of dbt Cloud environment in which `dbt` is running. The valid values are `development` or `deployment`. #### Run details -- `DBT_CLOUD_PROJECT_ID`: The ID of the dbt Cloud Project for this run -- `DBT_CLOUD_JOB_ID`: The ID of the dbt Cloud Job for this run -- `DBT_CLOUD_RUN_ID`: The ID of this particular run -- `DBT_CLOUD_RUN_REASON_CATEGORY`: The "category" of the trigger for this run (one of: `scheduled`, `github_pull_request`, `gitlab_merge_request`, `azure_pull_request`, `other`) -- `DBT_CLOUD_RUN_REASON`: The specific trigger for this run (eg. 
`Scheduled`, `Kicked off by `, or custom via `API`) -- `DBT_CLOUD_ENVIRONMENT_ID`: The ID of the environment for this run -- `DBT_CLOUD_ACCOUNT_ID`: The ID of the dbt Cloud account for this run +- `DBT_CLOUD_PROJECT_ID` — The ID of the dbt Cloud Project for this run +- `DBT_CLOUD_JOB_ID` — The ID of the dbt Cloud Job for this run +- `DBT_CLOUD_RUN_ID` — The ID of this particular run +- `DBT_CLOUD_RUN_REASON_CATEGORY` — The "category" of the trigger for this run (one of: `scheduled`, `github_pull_request`, `gitlab_merge_request`, `azure_pull_request`, `other`) +- `DBT_CLOUD_RUN_REASON` — The specific trigger for this run (eg. `Scheduled`, `Kicked off by `, or custom via `API`) +- `DBT_CLOUD_ENVIRONMENT_ID` — The ID of the environment for this run +- `DBT_CLOUD_ACCOUNT_ID` — The ID of the dbt Cloud account for this run #### Git details _The following variables are currently only available for GitHub, GitLab, and Azure DevOps PR builds triggered via a webhook_ -- `DBT_CLOUD_PR_ID`: The Pull Request ID in the connected version control system -- `DBT_CLOUD_GIT_SHA`: The git commit SHA which is being run for this Pull Request build +- `DBT_CLOUD_PR_ID` — The Pull Request ID in the connected version control system +- `DBT_CLOUD_GIT_SHA` — The git commit SHA which is being run for this Pull Request build ### Example usage diff --git a/website/docs/docs/build/incremental-microbatch.md b/website/docs/docs/build/incremental-microbatch.md index ae598bbe05f..18122af4b7b 100644 --- a/website/docs/docs/build/incremental-microbatch.md +++ b/website/docs/docs/build/incremental-microbatch.md @@ -24,11 +24,21 @@ Each "batch" corresponds to a single bounded time period (by default, a single d ### Example -A `sessions` model is aggregating and enriching data that comes from two other models: +A `sessions` model aggregates and enriches data that comes from two other models. - `page_views` is a large, time-series table. 
It contains many rows, new records almost always arrive after existing ones, and existing records rarely update. - `customers` is a relatively small dimensional table. Customer attributes update often, and not in a time-based manner — that is, older customers are just as likely to change column values as newer customers. -The `page_view_start` column in `page_views` is configured as that model's `event_time`. The `customers` model does not configure an `event_time`. Therefore, each batch of `sessions` will filter `page_views` to the equivalent time-bounded batch, and it will not filter `sessions` (a full scan for every batch). +The `page_view_start` column in `page_views` is configured as that model's `event_time`. The `customers` model does not configure an `event_time`. Therefore, each batch of `sessions` will filter `page_views` to the equivalent time-bounded batch, and it will not filter `customers` (a full scan for every batch). + + + +```yaml +models: + - name: page_views + config: + event_time: page_view_start +``` + We run the `sessions` model on October 1, 2024, and then again on October 2. It produces the following queries: @@ -36,6 +46,8 @@ We run the `sessions` model on October 1, 2024, and then again on October 2. It +The `event_time` for the `sessions` model is set to `session_start`, which marks the beginning of a user’s session on the website. This setting allows dbt to combine multiple page views (each tracked by their own `page_view_start` timestamps) into a single session. This way, `session_start` differentiates the timing of individual page views from the broader timeframe of the entire user session. + ```sql @@ -43,7 +55,8 @@ We run the `sessions` model on October 1, 2024, and then again on October 2. It materialized='incremental', incremental_strategy='microbatch', event_time='session_start', - begin='2020-01-01' + begin='2020-01-01', + batch_size='day' ) }} with page_views as ( @@ -60,7 +73,13 @@ customers as ( ), -... 
+select + page_views.id as session_id, + page_views.page_view_start as session_start, + customers.* + from page_views + left join customers + on page_views.customer_id = customers.id ``` @@ -131,7 +150,7 @@ customers as ( dbt will instruct the data platform to take the result of each batch query and insert, update, or replace the contents of the `analytics.sessions` table for the same day of data. To perform this operation, dbt will use the most efficient atomic mechanism for "full batch" replacement that is available on each data platform. -It does not matter whether the table already contains data for that day, or not. Given the same input data, no matter how many times a batch is reprocessed, the resulting table is the same. +It does not matter whether the table already contains data for that day. Given the same input data, the resulting table is the same no matter how many times a batch is reprocessed. @@ -143,7 +162,7 @@ Several configurations are relevant to microbatch models, and some are required: |----------|------|---------------|---------| | `event_time` | Column (required) | The column indicating "at what time did the row occur." Required for your microbatch model and any direct parents that should be filtered. | N/A | | `begin` | Date (required) | The "beginning of time" for the microbatch model. This is the starting point for any initial or full-refresh builds. For example, a daily-grain microbatch model run on `2024-10-01` with `begin = '2023-10-01` will process 366 batches (it's a leap year!) plus the batch for "today." | N/A | -| `batch_size` | String (optional) | The granularity of your batches. The default is `day` (and currently this is the only granularity supported). | `day` | +| `batch_size` | String (required) | The granularity of your batches. Supported values are `hour`, `day`, `month`, and `year` | N/A | | `lookback` | Integer (optional) | Process X batches prior to the latest bookmark to capture late-arriving records. 
| `0` | @@ -165,11 +184,11 @@ During standard incremental runs, dbt will process batches according to the curr -**Note:** If there’s an upstream model that configures `event_time`, but you *don’t* want the reference to it to be filtered, you can specify `ref('upstream_model').render()` to opt-out of auto-filtering. This isn't generally recommended — most models which configure `event_time` are fairly large, and if the reference is not filtered, each batch will perform a full scan of this input table. +**Note:** If there’s an upstream model that configures `event_time`, but you *don’t* want the reference to it to be filtered, you can specify `ref('upstream_model').render()` to opt-out of auto-filtering. This isn't generally recommended — most models that configure `event_time` are fairly large, and if the reference is not filtered, each batch will perform a full scan of this input table. ### Backfills -Whether to fix erroneous source data, or retroactively apply a change in business logic, you may need to reprocess a large amount of historical data. +Whether to fix erroneous source data or retroactively apply a change in business logic, you may need to reprocess a large amount of historical data. Backfilling a microbatch model is as simple as selecting it to run or build, and specifying a "start" and "end" for `event_time`. As always, dbt will process the batches between the start and end as independent queries. @@ -194,7 +213,7 @@ For now, dbt assumes that all values supplied are in UTC: - `--event-time-start` - `--event-time-end` -While we may consider adding support for custom timezones in the future, we also believe that defining these values in UTC makes everyone's lives easier. +While we may consider adding support for custom time zones in the future, we also believe that defining these values in UTC makes everyone's lives easier. ## How `microbatch` compares to other incremental strategies? 
@@ -251,15 +270,15 @@ select * from {{ ref('stg_events') }} -- this ref will be auto-filtered -Where you’ve also set an `event_time` for the model’s direct parents - in this case `stg_events`: +Where you’ve also set an `event_time` for the model’s direct parents - in this case, `stg_events`: ```yaml models: - name: stg_events - config: - event_time: my_time_field + config: + event_time: my_time_field ``` diff --git a/website/docs/docs/build/metricflow-commands.md b/website/docs/docs/build/metricflow-commands.md index d9e01bede71..29a9e70acd4 100644 --- a/website/docs/docs/build/metricflow-commands.md +++ b/website/docs/docs/build/metricflow-commands.md @@ -27,6 +27,9 @@ dbt Cloud jobs support the `dbt sl validate` command to [automatically test your In dbt Cloud, run MetricFlow commands directly in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). For dbt Cloud CLI users, MetricFlow commands are embedded in the dbt Cloud CLI, which means you can immediately run them once you install the dbt Cloud CLI and don't need to install MetricFlow separately. You don't need to manage versioning because your dbt Cloud account will automatically manage the versioning for you. + + +Note: The **Defer to staging/production** [toggle](/docs/cloud/about-cloud-develop-defer#defer-in-the-dbt-cloud-ide) button doesn't apply when running Semantic Layer commands in the dbt Cloud IDE. To use defer for Semantic Layer commands in the IDE, toggle the button on and manually add the `--defer` flag to the command. This is a temporary workaround; native support will be available soon. 
diff --git a/website/docs/docs/build/metricflow-time-spine.md b/website/docs/docs/build/metricflow-time-spine.md index 2965b623f13..e932fb36f53 100644 --- a/website/docs/docs/build/metricflow-time-spine.md +++ b/website/docs/docs/build/metricflow-time-spine.md @@ -1,13 +1,15 @@ --- title: MetricFlow time spine id: metricflow-time-spine -description: "MetricFlow expects a default timespine table called metricflow_time_spine" +description: "MetricFlow expects a default time spine table called metricflow_time_spine" sidebar_label: "MetricFlow time spine" tags: [Metrics, Semantic Layer] --- + -It's common in analytics engineering to have a date dimension or "time spine" table as a base table for different types of time-based joins and aggregations. The structure of this table is typically a base column of daily or hourly dates, with additional columns for other time grains, like fiscal quarters, defined based on the base column. You can join other tables to the time spine on the base column to calculate metrics like revenue at a point in time, or to aggregate to a specific time grain. + +It's common in analytics engineering to have a date dimension or "time spine" table as a base table for different types of time-based joins and aggregations. The structure of this table is typically a base column of daily or hourly dates, with additional columns for other time grains, like fiscal quarters, defined based on the base column. You can join other tables to the time spine on the base column to calculate metrics like revenue at a point in time, or to aggregate to a specific time grain. MetricFlow requires you to define at least one dbt model which provides a time-spine, and then specify (in YAML) the columns to be used for time-based joins. 
MetricFlow will join against the time-spine model for the following types of metrics and dimensions: @@ -21,41 +23,47 @@ To see the generated SQL for the metric and dimension types that use time spine ## Configuring time spine in YAML -- The [`models` key](/reference/model-properties) for the time spine must be in your `models/` directory. -- Each time spine is a normal dbt model with extra configurations that tell dbt and MetricFlow how to use specific columns by defining their properties. -- You likely already have a calendar table in your project which you can use. If you don't, review the [example time-spine tables](#example-time-spine-tables) for sample code. -- You add the configurations under the `time_spine` key for that [model's properties](/reference/model-properties), just as you would add a description or tests. + Time spine models are normal dbt models with extra configurations that tell dbt and MetricFlow how to use specific columns by defining their properties. Add the [`models` key](/reference/model-properties) for the time spine in your `models/` directory. If your project already includes a calendar table or date dimension, you can configure that table as a time spine. Otherwise, review the [example time-spine tables](#example-time-spine-tables) to create one. + + Some things to note when configuring time spine models: + +- Add the configurations under the `time_spine` key for that [model's properties](/reference/model-properties), just as you would add a description or tests. - You only need to configure time-spine models that the Semantic Layer should recognize. - At a minimum, define a time-spine table for a daily grain. - You can optionally define additional time-spine tables for different granularities, like hourly. Review the [granularity considerations](#granularity-considerations) when deciding which tables to create. 
- If you're looking to specify the grain of a time dimension so that MetricFlow can transform the underlying column to the required granularity, refer to the [Time granularity documentation](/docs/build/dimensions?dimension=time_gran) -For example, given the following directory structure, you can create two time spine configurations, `time_spine_hourly` and `time_spine_daily`. MetricFlow supports granularities ranging from milliseconds to years. Refer to the [Dimensions page](/docs/build/dimensions?dimension=time_gran#time) (time_granularity tab) to find the full list of supported granularities. - :::tip -Previously, you had to create a model called `metricflow_time_spine` in your dbt project. Now, if your project already includes a date dimension or time spine table, you can simply configure MetricFlow to use that table by updating the `model` setting in the Semantic Layer. - -If you don’t have a date dimension table, you can still create one by using the following code snippet to build your time spine model. +If you previously used a model called `metricflow_time_spine`, you no longer need to create this specific model. You can now configure MetricFlow to use any date dimension or time spine table already in your project by updating the `model` setting in the Semantic Layer. +If you don’t have a date dimension table, you can still create one by using the code snippet in the [next section](#creating-a-time-spine-table) to build your time spine model. ::: - +### Creating a time spine table + +MetricFlow supports granularities ranging from milliseconds to years. Refer to the [Dimensions page](/docs/build/dimensions?dimension=time_gran#time) (time_granularity tab) to find the full list of supported granularities. + +To create a time spine table from scratch, you can do so by adding the following code to your dbt project. +This example creates a time spine at an hourly grain and a daily grain: `time_spine_hourly` and `time_spine_daily`. 
```yaml -[models:](/reference/model-properties) - - name: time_spine_hourly - description: "my favorite time spine" +[models:](/reference/model-properties) +# Hourly time spine + - name: time_spine_hourly + description: my favorite time spine time_spine: - standard_granularity_column: date_hour # column for the standard grain of your table, must be date time type." + standard_granularity_column: date_hour # column for the standard grain of your table, must be date time type. custom_granularities: - name: fiscal_year column_name: fiscal_year_column columns: - name: date_hour granularity: hour # set granularity at column-level for standard_granularity_column + +# Daily time spine - name: time_spine_daily time_spine: standard_granularity_column: date_day # column for the standard grain of your table @@ -66,6 +74,9 @@ If you don’t have a date dimension table, you can still create one by using th + + + -For an example project, refer to our [Jaffle shop](https://github.com/dbt-labs/jaffle-sl-template/blob/main/models/marts/_models.yml) example. - - - -- The previous configuration demonstrates a time spine model called `time_spine_daily`. It sets the time spine configurations under the `time_spine` key. -- The `standard_granularity_column` is the column that maps to one of our [standard granularities](/docs/build/dimensions?dimension=time_gran). The grain of this column must be finer or equal in size to the granularity of all custom granularity columns in the same model. In this case, it's hourly. -- It needs to reference a column defined under the `columns` key, in this case, `date_hour`. -- MetricFlow will use the `standard_granularity_column` as the join key when joining the time spine table to other source table. -- Here, the granularity of the `standard_granularity_column` is set at the column level, in this case, `hour`. 
- -Additionally, [the `custom_granularities` field](#custom-calendar), (available in dbt v1.9 and higher) lets you specify non-standard time periods like `fiscal_year` or `retail_month` that your organization may use. - - +- This example configuration shows a time spine model called `time_spine_hourly` and `time_spine_daily`. It sets the time spine configurations under the `time_spine` key. +- The `standard_granularity_column` is the column that maps to one of our [standard granularities](/docs/build/dimensions?dimension=time_gran). This column must be set under the `columns` key and should have a grain that is finer or equal to any custom granularity columns defined in the same model. + - It needs to reference a column defined under the `columns` key, in this case, `date_hour` and `date_day`, respectively. + - It sets the granularity at the column-level using the `granularity` key, in this case, `hour` and `day`, respectively. +- MetricFlow will use the `standard_granularity_column` as the join key when joining the time spine table to another source table. +- [The `custom_granularities` field](#custom-calendar), (available in Versionless and dbt v1.9 and higher) lets you specify non-standard time periods like `fiscal_year` or `retail_month` that your organization may use. - - -If you need to create a time spine table from scratch, you can do so by adding the following code to your dbt project. -The example creates a time spine at a daily grain and an hourly grain. A few things to note when creating time spine models: -* MetricFlow will use the time spine with the largest compatible granularity for a given query to ensure the most efficient query possible. For example, if you have a time spine at a monthly grain, and query a dimension at a monthly grain, MetricFlow will use the monthly time spine. If you only have a daily time spine, MetricFlow will use the daily time spine and date_trunc to month. 
-* You can add a time spine for each granularity you intend to use if query efficiency is more important to you than configuration time, or storage constraints. For most engines, the query performance difference should be minimal and transforming your time spine to a coarser grain at query time shouldn't add significant overhead to your queries. -* We recommend having a time spine at the finest grain used in any of your dimensions to avoid unexpected errors. i.e., if you have dimensions at an hourly grain, you should have a time spine at an hourly grain. - - -Now, break down the configuration above. It's pointing to a model called `time_spine_daily`, and all the configuration is colocated with the rest of the [model's properties](/reference/model-properties). It sets the time spine configurations under the `time_spine` key. The `standard_granularity_column` is the lowest grain of the table, in this case, it's hourly. It needs to reference a column defined under the columns key, in this case, `date_hour`. Use the `standard_granularity_column` as the join key for the time spine table when joining tables in MetricFlow. Here, the granularity of the `standard_granularity_column` is set at the column level, in this case, `hour`. +For an example project, refer to our [Jaffle shop](https://github.com/dbt-labs/jaffle-sl-template/blob/main/models/marts/_models.yml) example. ### Considerations when choosing which granularities to create{#granularity-considerations} @@ -298,13 +294,165 @@ and date_hour < dateadd(day, 30, current_timestamp()) + + + + + + +MetricFlow uses a time spine table to construct cumulative metrics. By default, MetricFlow expects the time spine table to be named `metricflow_time_spine` and doesn't support using a different name. For supported granularities, refer to the [dimensions](/docs/build/dimensions?dimension=time_gran#time) page. 
+ +To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: + +### Daily + + + + +```sql +{{ + config( + materialized = 'table', + ) +}} + +with days as ( + + {{ + dbt_utils.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2025','mm/dd/yyyy')" + ) + }} + +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * from final +-- filter the time spine to a specific range +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) +``` + + + + + + + +```sql +{{ + config( + materialized = 'table', + ) +}} + +with days as ( + + {{ + dbt.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2025','mm/dd/yyyy')" + ) + }} + +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * from final +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) +``` + + + + +### Daily (BigQuery) + +Use this model if you're using BigQuery. 
BigQuery supports `DATE()` instead of `TO_DATE()`: + + + + + +```sql +{{config(materialized='table')}} +with days as ( + {{dbt_utils.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2025,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final +-- filter the time spine to a specific range +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) +``` + + + + + + + +```sql +{{config(materialized='table')}} +with days as ( + {{dbt.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2025,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final +-- filter the time spine to a specific range +where date_day > dateadd(year, -4, current_timestamp()) +and date_day < dateadd(day, 30, current_timestamp()) +``` + + + + +You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but finer grains are only supported in versions 1.9 and higher. + + + + ## Custom calendar -Being able to configure custom calendars, such as a fiscal calendar, is available in [dbt Cloud Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) or dbt Core [v1.9 and above](/docs/dbt-versions/core). +The ability to configure custom calendars, such as a fiscal calendar, is available in [dbt Cloud Versionless](/docs/dbt-versions/versionless-cloud) or dbt Core [v1.9 and higher](/docs/dbt-versions/core). + +To access this feature, [upgrade to Versionless](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless) or upgrade your dbt Core version to v1.9 or higher. -To access this feature, [upgrade to Versionless](/docs/dbt-versions/versionless-cloud) or dbt Core v1.9 and above. @@ -337,6 +485,6 @@ models: #### Coming soon -Note that features like calculating offsets and period-over-period will be supported soon. 
+Note that features like calculating offsets and period-over-period will be supported soon! diff --git a/website/docs/docs/build/metrics-overview.md b/website/docs/docs/build/metrics-overview.md index ea06ea85526..7021a6d7330 100644 --- a/website/docs/docs/build/metrics-overview.md +++ b/website/docs/docs/build/metrics-overview.md @@ -92,7 +92,18 @@ import SLCourses from '/snippets/_sl-course.md'; ## Default granularity for metrics -It's possible to define a default time granularity for metrics if it's different from the granularity of the default aggregation time dimensions (`metric_time`). This is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. The granularity can be set using the `time_granularity` parameter on the metric, and defaults to `day`. If day is not available because the dimension is defined at a coarser granularity, it will default to the defined granularity for the dimension. + +Default time granularity for metrics is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. + +To set the default time granularity for metrics, you need to be on dbt Cloud Versionless or dbt v1.9 and higher. + + + + + +It's possible to define a default time granularity for metrics if it's different from the granularity of the default aggregation time dimensions (`metric_time`). This is useful if your time dimension has a very fine grain, like second or hour, but you typically query metrics rolled up at a coarser grain. + +The granularity can be set using the `time_granularity` parameter on the metric, and defaults to `day`. If day is not available because the dimension is defined at a coarser granularity, it will default to the defined granularity for the dimension. ### Example You have a semantic model called `orders` with a time dimension called `order_time`. 
You want the `orders` metric to roll up to `monthly` by default; however, you want the option to look at these metrics hourly. You can set the `time_granularity` parameter on the `order_time` dimension to `hour`, and then set the `time_granularity` parameter in the metric to `month`. @@ -117,6 +128,7 @@ semantic_models: name: orders time_granularity: month -- Optional, defaults to day ``` + ## Conversion metrics @@ -270,6 +282,8 @@ A filter is configured using Jinja templating. Use the following syntax to refer Refer to [Metrics as dimensions](/docs/build/ref-metrics-in-filters) for details on how to use metrics as dimensions with metric filters: + + ```yaml @@ -283,10 +297,30 @@ filter: | {{ TimeDimension('time_dimension', 'granularity') }} filter: | - {{ Metric('metric_name', group_by=['entity_name']) }} # Available in v1.8 or with versionless dbt Cloud. + {{ Metric('metric_name', group_by=['entity_name']) }} + ``` + + + + + + + +```yaml +filter: | + {{ Entity('entity_name') }} + +filter: | + {{ Dimension('primary_entity__dimension_name') }} + +filter: | + {{ TimeDimension('time_dimension', 'granularity') }} + +``` + For example, if you want to filter for the order date dimension grouped by month, use the following syntax: diff --git a/website/docs/docs/build/ratio-metrics.md b/website/docs/docs/build/ratio-metrics.md index cc1d13b7835..fdaeb878450 100644 --- a/website/docs/docs/build/ratio-metrics.md +++ b/website/docs/docs/build/ratio-metrics.md @@ -24,6 +24,8 @@ Ratio allows you to create a ratio between two metrics. You simply specify a num The following displays the complete specification for ratio metrics, along with an example. 
+ + ```yaml metrics: - name: The metric name # Required @@ -40,11 +42,19 @@ metrics: filter: Filter for the denominator # Optional alias: Alias for the denominator # Optional ``` + For advanced data modeling, you can use `fill_nulls_with` and `join_to_timespine` to [set null metric values to zero](/docs/build/fill-nulls-advanced), ensuring numeric values for every data row. ## Ratio metrics example +These examples demonstrate how to create ratio metrics in your model. They cover basic and advanced use cases, including applying filters to the numerator and denominator metrics. + +#### Example 1 +This example is a basic ratio metric that calculates the ratio of food orders to total orders: + + + ```yaml metrics: - name: food_order_pct @@ -55,6 +65,30 @@ metrics: numerator: food_orders denominator: orders ``` + + +#### Example 2 +This example is a ratio metric that calculates the ratio of food orders to total orders, with a filter and alias applied to the numerator. Note that in order to add these attributes, you'll need to use an explicit key for the name attribute too. + + + +```yaml +metrics: + - name: food_order_pct + description: "The food order count as a ratio of the total order count, filtered by location" + label: Food order ratio by location + type: ratio + type_params: + numerator: + name: food_orders + filter: location = 'New York' + alias: ny_food_orders + denominator: + name: orders + filter: location = 'New York' + alias: ny_orders +``` + ## Ratio metrics using different semantic models @@ -109,6 +143,8 @@ on Users can define constraints on input metrics for a ratio metric by applying a filter directly to the input metric, like so: + + ```yaml metrics: - name: frequent_purchaser_ratio @@ -123,6 +159,7 @@ metrics: denominator: name: distinct_purchasers ``` + Note the `filter` and `alias` parameters for the metric referenced in the numerator. - Use the `filter` parameter to apply a filter to the metric it's attached to. 
diff --git a/website/docs/docs/build/snapshots.md b/website/docs/docs/build/snapshots.md index c17350ab368..f5321aa626a 100644 --- a/website/docs/docs/build/snapshots.md +++ b/website/docs/docs/build/snapshots.md @@ -52,20 +52,25 @@ It is not possible to "preview data" or "compile sql" for snapshots in dbt Cloud -In dbt Cloud Versionless and dbt Core v1.9 and later, snapshots are configurations defined in YAML files (typically in your snapshots directory). You'll configure your snapshot to tell dbt how to detect record changes. +Configure your snapshots in YAML files to tell dbt how to detect record changes. Define snapshots configurations in YAML files, alongside your models, for a cleaner, faster, and more consistent set up. - + ```yaml snapshots: - - name: orders_snapshot - relation: source('jaffle_shop', 'orders') + - name: string + relation: relation # source('my_source', 'my_table') or ref('my_model') config: - schema: snapshots - database: analytics - unique_key: id - strategy: timestamp - updated_at: updated_at + [database](/reference/resource-configs/database): string + [schema](/reference/resource-configs/schema): string + [alias](/reference/resource-configs/alias): string + [strategy](/reference/resource-configs/strategy): timestamp | check + [unique_key](/reference/resource-configs/unique_key): column_name_or_expression + [check_cols](/reference/resource-configs/check_cols): [column_name] | all + [updated_at](/reference/resource-configs/updated_at): column_name + [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes): true | false + [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names): dictionary + ``` @@ -82,6 +87,7 @@ The following table outlines the configurations available for snapshots: | [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | | 
[updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | | [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source and set `dbt_valid_to` to current time if the record no longer exists | No | True | +| [snapshot_meta_column_names](/reference/resource-configs/snapshot_meta_column_names) | Customize the names of the snapshot meta fields | No | dictionary | - In versions prior to v1.9, the `target_schema` (required) and `target_database` (optional) configurations defined a single schema or database to build a snapshot across users and environment. This created problems when testing or developing a snapshot, as there was no clear separation between development and production environments. In v1.9, `target_schema` became optional, allowing snapshots to be environment-aware. By default, without `target_schema` or `target_database` defined, snapshots now use the `generate_schema_name` or `generate_database_name` macros to determine where to build. Developers can still set a custom location with [`schema`](/reference/resource-configs/schema) and [`database`](/reference/resource-configs/database) configs, consistent with other resource types. - A number of other configurations are also supported (for example, `tags` and `post-hook`). For the complete list, refer to [Snapshot configurations](/reference/snapshot-configs). @@ -160,7 +166,7 @@ To add a snapshot to your project follow these steps. For users on versions 1.8 ### Configuration best practices - + This strategy handles column additions and deletions better than the `check` strategy. @@ -188,9 +194,9 @@ Snapshots can't be rebuilt. 
Because of this, it's a good idea to put snapshots i - + - If you need to clean or transform your data before snapshotting, create an ephemeral model (or a staging model) that applies the necessary transformations. Then, reference this model in your snapshot configuration. This approach keeps your snapshot definitions clean and allows you to test and run transformations separately. + If you need to clean or transform your data before snapshotting, create an ephemeral model or a staging model that applies the necessary transformations. Then, reference this model in your snapshot configuration. This approach keeps your snapshot definitions clean and allows you to test and run transformations separately. @@ -203,6 +209,8 @@ When you run the [`dbt snapshot` command](/reference/commands/snapshot): - The `dbt_valid_to` column will be updated for any existing records that have changed - The updated record and any new records will be inserted into the snapshot table. These records will now have `dbt_valid_to = null` +Note, these column names can be customized to your team or organizational conventions using the [snapshot_meta_column_names](#snapshot-meta-fields) config. + Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](/reference/dbt-jinja-functions/ref) function. ## Detecting row changes diff --git a/website/docs/docs/cloud-integrations/avail-sl-integrations.md b/website/docs/docs/cloud-integrations/avail-sl-integrations.md index 04d9d55acb4..acc36623ab5 100644 --- a/website/docs/docs/cloud-integrations/avail-sl-integrations.md +++ b/website/docs/docs/cloud-integrations/avail-sl-integrations.md @@ -29,7 +29,7 @@ import AvailIntegrations from '/snippets/_sl-partner-links.md'; - {frontMatter.meta.api_name} to learn how to integrate and query your metrics in downstream tools. 
- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) -- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. +- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/explore-data/cells/data-cells/dbt-metrics-cells) to set up SQL cells in Hex. - [Resolve 'Failed APN'](/faqs/Troubleshooting/sl-alpn-error) error when connecting to the dbt Semantic Layer. - [dbt Semantic Layer on-demand course](https://learn.getdbt.com/courses/semantic-layer) - [dbt Semantic Layer FAQs](/docs/use-dbt-semantic-layer/sl-faqs) diff --git a/website/docs/docs/cloud-integrations/configure-auto-exposures.md b/website/docs/docs/cloud-integrations/configure-auto-exposures.md index 24364077614..4574d69c164 100644 --- a/website/docs/docs/cloud-integrations/configure-auto-exposures.md +++ b/website/docs/docs/cloud-integrations/configure-auto-exposures.md @@ -6,7 +6,7 @@ description: "Import and auto-generate exposures from dashboards and understand image: /img/docs/cloud-integrations/auto-exposures/explorer-lineage2.jpg --- -# Configure auto-exposures +# Configure auto-exposures As a data team, it’s critical that you have context into the downstream use cases and users of your data products. [Auto-exposures](/docs/collaborate/auto-exposures) integrates natively with Tableau and [auto-generates downstream lineage](/docs/collaborate/auto-exposures#view-auto-exposures-in-dbt-explorer) in dbt Explorer for a richer experience. 
diff --git a/website/docs/docs/cloud-integrations/semantic-layer/excel.md b/website/docs/docs/cloud-integrations/semantic-layer/excel.md index 31a028f3d81..c80040dce01 100644 --- a/website/docs/docs/cloud-integrations/semantic-layer/excel.md +++ b/website/docs/docs/cloud-integrations/semantic-layer/excel.md @@ -16,10 +16,11 @@ The dbt Semantic Layer offers a seamless integration with Excel Online and Deskt - You must have a dbt Cloud Team or Enterprise [account](https://www.getdbt.com/pricing). Suitable for both Multi-tenant and Single-tenant deployment. - Single-tenant accounts should contact their account representative for necessary setup and enablement. -import SLCourses from '/snippets/_sl-course.md'; +:::tip - +📹 For on-demand video learning, explore the [Querying the Semantic Layer with Excel](https://learn.getdbt.com/courses/querying-the-semantic-layer-with-excel) course to learn how to query metrics with Excel. +::: ## Installing the add-on diff --git a/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md b/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md index cffd034ac33..49e6f90e41f 100644 --- a/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md +++ b/website/docs/docs/cloud-integrations/set-up-snowflake-native-app.md @@ -144,7 +144,7 @@ Check that the SL user has been granted access to the `dbt_sl_llm` schema and ma -If there's been an update to the dbt Cloud account ID, access URL, or API service token, you need to update the configuration for the dbt Snowflake Native App. In Snowflake, navigate to the app's configuration page and delete the existing configurations. Add the new configuration and then run `CALL app_public.restart_ap ();` in the application database in Snowsight. +If there's been an update to the dbt Cloud account ID, access URL, or API service token, you need to update the configuration for the dbt Snowflake Native App. 
In Snowflake, navigate to the app's configuration page and delete the existing configurations. Add the new configuration and then run `CALL app_public.restart_app();` in the application database in Snowsight. diff --git a/website/docs/docs/cloud/about-cloud-develop-defer.md b/website/docs/docs/cloud/about-cloud-develop-defer.md index fc55edf8a38..3ee5ac71666 100644 --- a/website/docs/docs/cloud/about-cloud-develop-defer.md +++ b/website/docs/docs/cloud/about-cloud-develop-defer.md @@ -40,6 +40,9 @@ To enable defer in the dbt Cloud IDE, toggle the **Defer to production** button For example, if you were to start developing on a new branch with [nothing in your development schema](/reference/node-selection/defer#usage), edit a single model, and run `dbt build -s state:modified` — only the edited model would run. Any `{{ ref() }}` functions will point to the production location of the referenced models. + +Note: The **Defer to staging/production** toggle button doesn't apply when running [dbt Semantic Layer commands](/docs/build/metricflow-commands) in the dbt Cloud IDE. To use defer for Semantic Layer commands in the IDE, toggle the button on and manually add the `--defer` flag to the command. This is a temporary workaround; native support will be available soon. 
+ ### Defer in dbt Cloud CLI diff --git a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md index 02f950111ea..d7afd424fc4 100644 --- a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md +++ b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md @@ -24,7 +24,7 @@ dbt Cloud's [flexible plans](https://www.getdbt.com/pricing/) and features make diff --git a/website/docs/docs/cloud/connect-data-platform/about-connections.md b/website/docs/docs/cloud/connect-data-platform/about-connections.md index 8bec408af2e..6f2f140b724 100644 --- a/website/docs/docs/cloud/connect-data-platform/about-connections.md +++ b/website/docs/docs/cloud/connect-data-platform/about-connections.md @@ -18,6 +18,7 @@ dbt Cloud can connect with a variety of data platform providers including: - [PostgreSQL](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) - [Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) - [Starburst or Trino](/docs/cloud/connect-data-platform/connect-starburst-trino) +- [Teradata](/docs/cloud/connect-data-platform/connect-teradata) You can connect to your database in dbt Cloud by clicking the gear in the top right and selecting **Account Settings**. From the Account Settings page, click **+ New Project**. diff --git a/website/docs/docs/cloud/connect-data-platform/connect-teradata.md b/website/docs/docs/cloud/connect-data-platform/connect-teradata.md new file mode 100644 index 00000000000..cf41814078b --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-teradata.md @@ -0,0 +1,29 @@ +--- +title: "Connect Teradata" +id: connect-teradata +description: "Configure the Teradata platform connection in dbt Cloud." +sidebar_label: "Connect Teradata" +--- + +# Connect Teradata + +Your environment(s) must be on ["Versionless"](/docs/dbt-versions/versionless-cloud) to use the Teradata connection. + +| Field | Description | Type | Required? 
| Example | +| ----------------------------- | --------------------------------------------------------------------------------------------- | -------------- | --------- | ------- | +| Host | Host name of your Teradata environment. | String | Required | host-name.env.clearscape.teradata.com | +| Port | The database port number. Equivalent to the Teradata JDBC Driver DBS_PORT connection parameter. | Quoted integer | Optional | 1025 | +| Retries | Number of times to retry connecting to the database upon error. | Integer | Optional | 10 | +| Request timeout | The waiting period between connection attempts in seconds. Default is "1" second. | Quoted integer | Optional | 3 | + + + +### Development and deployment credentials + +| Field | Description | Type | Required? | Example | +| ------------------------------|-----------------------------------------------------------------------------------------------|----------------|-----------|--------------------| +| Username | The database username. Equivalent to the Teradata JDBC Driver USER connection parameter. | String | Required | database_username | +| Password | The database password. Equivalent to the Teradata JDBC Driver PASSWORD connection parameter. | String | Required | DatabasePassword123 | +| Schema | Specifies the initial database to use after login, rather than the user's default database. | String | Required | dbtlabsdocstest | + + diff --git a/website/docs/docs/cloud/dbt-assist-data.md b/website/docs/docs/cloud/dbt-assist-data.md deleted file mode 100644 index ad32c304ca8..00000000000 --- a/website/docs/docs/cloud/dbt-assist-data.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "dbt Assist privacy and data" -sidebar_label: "dbt Assist privacy" -description: "dbt Assist’s powerful AI feature helps you deliver data that works." ---- - -# dbt Assist privacy and data - -dbt Labs is committed to protecting your privacy and data. This page provides information about how dbt Labs handles your data when you use dbt Assist. 
- -#### Is my data used by dbt Labs to train AI models? - -No, dbt Assist does not use client warehouse data to train any AI models. It uses API calls to an AI provider. - -#### Does dbt Labs share my personal data with third parties - -dbt Labs only shares client personal information as needed to perform the services, under client instructions, or for legal, tax, or compliance reasons. - -#### Does dbt Assist store or use personal data? - -The user clicks the AI assist button, and the user does not otherwise enter data. - -#### Does dbt Assist access my warehouse data? - -dbt Assist utilizes metadata, including column names, model SQL, the model's name, and model documentation. The row-level data from the warehouse is never used or sent to a third-party provider. Such output must be double-checked by the user for completeness and accuracy. - -#### Can dbt Assist data be deleted upon client written request? - -dbt Assist data, aside from usage data, does not persist on dbt Labs systems. Usage data is retained by dbt Labs. dbt Labs does not have possession of any personal or sensitive data. To the extent client identifies personal or sensitive information uploaded by or on behalf of client to dbt Labs systems, such data can be deleted within 30 days of written request. diff --git a/website/docs/docs/cloud/dbt-assist.md b/website/docs/docs/cloud/dbt-assist.md deleted file mode 100644 index bb8cabaff2b..00000000000 --- a/website/docs/docs/cloud/dbt-assist.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: "About dbt Assist" -sidebar_label: "About dbt Assist" -description: "dbt Assist’s powerful AI co-pilot feature helps you deliver data that works." -pagination_next: "docs/cloud/enable-dbt-assist" -pagination_prev: null ---- - -# About dbt Assist - -dbt Assist is a powerful artificial intelligence (AI) co-pilot feature that helps automate development in dbt Cloud, allowing you to focus on delivering data that works. 
dbt Assist’s AI co-pilot generates [documentation](/docs/build/documentation), [semantic models](/docs/build/semantic-models), and [tests](/docs/build/data-tests) for your SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. - -:::tip Beta feature -dbt Assist is an AI tool meant to _help_ developers generate documentation, semantic models, and tests in dbt Cloud. It's available in beta, in the dbt Cloud IDE only. - -To use dbt Assist, you must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing) and agree to use dbt Labs' OpenAI key. [Register your interest](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) to join the private beta or reach out to your account team to begin this process. -::: - - - -## Feedback - -Please note: Always review AI-generated code and content as it may produce incorrect results. dbt Assist features and/or functionality may be added or eliminated as part of the beta trial. - -To give feedback, please reach out to your dbt Labs account team. We appreciate your feedback and suggestions as we improve dbt Assist. diff --git a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md index 37f39f6dff8..398b0cff2a1 100644 --- a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md +++ b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md @@ -13,7 +13,7 @@ The dbt Cloud integrated development environment (IDE) is a single web-based int The dbt Cloud IDE offers several [keyboard shortcuts](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) and [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) for faster and efficient development and governance: - Syntax highlighting for SQL — Makes it easy to distinguish different parts of your code, reducing syntax errors and enhancing readability. 
-- AI co-pilot — Use [dbt Assist](/docs/cloud/dbt-assist), a powerful AI co-pilot feature, to generate documentation, semantic models, and tests for your dbt SQL models. +- AI copilot — Use [dbt Copilot](/docs/cloud/dbt-copilot), a powerful AI engine that can generate documentation, tests, and semantic models for your dbt SQL models. - Auto-completion — Suggests table names, arguments, and column names as you type, saving time and reducing typos. - Code [formatting and linting](/docs/cloud/dbt-cloud-ide/lint-format) — Helps standardize and fix your SQL code effortlessly. - Navigation tools — Easily move around your code, jump to specific lines, find and replace text, and navigate between project files. @@ -55,7 +55,7 @@ To understand how to navigate the IDE and its user interface elements, refer to | [**Keyboard shortcuts**](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) | You can access a variety of [commands and actions](/docs/cloud/dbt-cloud-ide/keyboard-shortcuts) in the IDE by choosing the appropriate keyboard shortcut. Use the shortcuts for common tasks like building modified models or resuming builds from the last failure. | | **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly into the IDE.

- Create or change branches, execute git commands using the git button.
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Link to the repo directly by clicking the branch name
- Edit, format, or lint files and execute dbt commands in your primary protected branch, and commit to a new branch.
- Use Git diff view to view what has been changed in a file before you make a pull request.
- From dbt version 1.6 and higher, use the **Prune branches** [button](/docs/cloud/dbt-cloud-ide/ide-user-interface#prune-branches-modal) to delete local branches that have been deleted from the remote repository, keeping your branch management tidy. | | **Preview and Compile button** | You can [compile or preview](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section) code, a snippet of dbt code, or one of your dbt models after editing and saving. | -| [**dbt Assist**](/docs/cloud/dbt-assist) | A powerful AI co-pilot feature that generates documentation, semantic models, and tests for your dbt SQL models. Available for dbt Cloud Enterprise plans. | +| [**dbt Copilot**](/docs/cloud/dbt-copilot) | A powerful AI engine that can generate documentation, tests, and semantic models for your dbt SQL models. Available for dbt Cloud Enterprise plans. | | **Build, test, and run button** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. | **Command bar** | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking History on the left of the bar. | **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. @@ -130,7 +130,7 @@ Nice job, you're ready to start developing and building models 🎉! - Starting from dbt v1.6, leverage [environments variables](/docs/build/environment-variables#special-environment-variables) to dynamically use the Git branch name. For example, using the branch name as a prefix for a development schema. 
- Run [MetricFlow commands](/docs/build/metricflow-commands) to create and manage metrics in your project with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). -- **Generate your YAML configurations with dbt Assist** — [dbt Assist](/docs/cloud/dbt-assist) is a powerful artificial intelligence (AI) co-pilot feature that helps automate development in dbt Cloud. It generates documentation, semantic models, and tests for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. Available for dbt Cloud Enterprise plans. +- **Generate your YAML configurations with dbt Copilot** — [dbt Copilot](/docs/cloud/dbt-copilot) is a powerful artificial intelligence (AI) feature that helps automate development in dbt Cloud. It can generate documentation, tests, and semantic models for your dbt SQL models directly in the dbt Cloud IDE, with a click of a button, and helps you accomplish more in less time. Available for dbt Cloud Enterprise plans. - **Build and view your project's docs** — The dbt Cloud IDE makes it possible to [build and view](/docs/collaborate/build-and-view-your-docs) documentation for your dbt project while your code is still in development. With this workflow, you can inspect and verify what your project's generated documentation will look like before your changes are released to production. diff --git a/website/docs/docs/cloud/dbt-copilot-data.md b/website/docs/docs/cloud/dbt-copilot-data.md new file mode 100644 index 00000000000..b55681542e3 --- /dev/null +++ b/website/docs/docs/cloud/dbt-copilot-data.md @@ -0,0 +1,29 @@ +--- +title: "dbt Copilot privacy and data" +sidebar_label: "dbt Copilot privacy" +description: "dbt Copilot is a powerful AI engine to help you deliver data that works." +--- + +# dbt Copilot privacy and data + +dbt Labs is committed to protecting your privacy and data. This page provides information about how the dbt Copilot AI engine handles your data. 
+ +#### Is my data used by dbt Labs to train AI models? + +No, dbt Copilot does not use client warehouse data to train any AI models. It uses API calls to an AI provider. + +#### Does dbt Labs share my personal data with third parties? + +dbt Labs only shares client personal information as needed to perform the services, under client instructions, or for legal, tax, or compliance reasons. + +#### Does dbt Copilot store or use personal data? + +The user clicks the dbt Copilot button, and the user does not otherwise enter data. + +#### Does dbt Copilot access my warehouse data? + +dbt Copilot utilizes metadata, including column names, model SQL, the model's name, and model documentation. The row-level data from the warehouse is never used or sent to a third-party provider. Such output must be double-checked by the user for completeness and accuracy. + +#### Can dbt Copilot data be deleted upon client written request? + +The data from using dbt Copilot, aside from usage data, _doesn't_ persist on dbt Labs systems. Usage data is retained by dbt Labs. dbt Labs doesn't have possession of any personal or sensitive data. To the extent client identifies personal or sensitive information uploaded by or on behalf of client to dbt Labs systems, such data can be deleted within 30 days of written request. diff --git a/website/docs/docs/cloud/dbt-copilot.md b/website/docs/docs/cloud/dbt-copilot.md new file mode 100644 index 00000000000..403df86a089 --- /dev/null +++ b/website/docs/docs/cloud/dbt-copilot.md @@ -0,0 +1,25 @@ +--- +title: "About dbt Copilot" +sidebar_label: "About dbt Copilot" +description: "dbt Copilot is a powerful AI engine designed to accelerate your analytics workflows throughout your entire ADLC." 
+pagination_next: "docs/cloud/enable-dbt-copilot" +pagination_prev: null +--- + +# About dbt Copilot + +dbt Copilot is a powerful artificial intelligence (AI) engine that's fully integrated into your dbt Cloud experience and designed to accelerate your analytics workflows. dbt Copilot embeds AI-driven assistance across every stage of the analytics development life cycle (ADLC), empowering data practitioners to deliver data products faster, improve data quality, and enhance data accessibility. With automatic code generation, you can let the AI engine generate the [documentation](/docs/build/documentation), [tests](/docs/build/data-tests), and [semantic models](/docs/build/semantic-models) for you. + +:::tip Beta feature +dbt Copilot is designed to _help_ developers generate documentation, tests, and semantic models in dbt Cloud. It's available in beta, in the dbt Cloud IDE only. + +To use dbt Copilot, you must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing) and either agree to use dbt Labs' OpenAI key or provide your own OpenAI API key. [Register here](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) or reach out to the Account Team if you're interested in joining the private beta. +::: + + + +## Feedback + +Please note: Always review AI-generated code and content as it may produce incorrect results. The features and/or functionality of dbt Copilot may be added or eliminated as part of the beta trial. + +To give feedback, please contact your dbt Labs account team. We appreciate your feedback and suggestions as we improve dbt Copilot. 
diff --git a/website/docs/docs/cloud/enable-dbt-assist.md b/website/docs/docs/cloud/enable-dbt-assist.md deleted file mode 100644 index 9432f858001..00000000000 --- a/website/docs/docs/cloud/enable-dbt-assist.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: "Enable dbt Assist" -sidebar_label: "Enable dbt Assist" -description: "Enable dbt Assist in dbt Cloud and leverage AI to speed up your development." ---- - -# Enable dbt Assist - -This page explains how to enable dbt Assist in dbt Cloud to leverage AI to speed up your development and allow you to focus on delivering quality data. - -## Prerequisites - -- Available in the dbt Cloud IDE only. -- Must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing). -- Development environment be ["Versionless"](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless). -- Current dbt Assist deployments use a central OpenAI API key managed by dbt Labs. In the future, you may provide your own key for Azure OpenAI or OpenAI. -- Accept and sign legal agreements. Reach out to your account team to begin this process. - -## Enable dbt Assist - -dbt Assist will only be available at an account level after your organization has signed the legal requirements. It will be disabled by default. Your dbt Cloud Admin(s) will enable it by following these steps: - -1. Navigate to **Account Settings** in the navigation menu. - -2. Under **Settings**, confirm the account you're enabling. - -3. Click **Edit** in the top right corner. - -4. To turn on dbt Assist, toggle the **Enable account access to AI-powered features** switch to the right. The toggle will slide to the right side, activating dbt Assist. - -5. Click **Save** and you should now have dbt Assist AI enabled to use. - -Note: To disable (only after enabled), repeat steps 1 to 3, toggle off in step 4, and repeat step 5. 
- - diff --git a/website/docs/docs/cloud/enable-dbt-copilot.md b/website/docs/docs/cloud/enable-dbt-copilot.md new file mode 100644 index 00000000000..07a9f6294da --- /dev/null +++ b/website/docs/docs/cloud/enable-dbt-copilot.md @@ -0,0 +1,51 @@ +--- +title: "Enable dbt Copilot" +sidebar_label: "Enable dbt Copilot" +description: "Enable the dbt Copilot AI engine in dbt Cloud to speed up your development." +--- + +# Enable dbt Copilot + +This page explains how to enable the dbt Copilot engine in dbt Cloud, leveraging AI to speed up your development and allowing you to focus on delivering quality data. + +## Prerequisites + +- Available in the dbt Cloud IDE only. +- Must have an active [dbt Cloud Enterprise account](https://www.getdbt.com/pricing). +- Development environment has been upgraded to ["Versionless"](/docs/dbt-versions/upgrade-dbt-version-in-cloud#versionless). +- By default, dbt Copilot deployments use a central OpenAI API key managed by dbt Labs. Alternatively, you can [provide your own OpenAI API key](#bringing-your-own-openai-api-key-byok). +- Accept and sign legal agreements. Reach out to your Account team to begin this process. + +## Enable dbt Copilot + +dbt Copilot is only available to your account after your organization has signed the required legal documents. It's disabled by default. A dbt Cloud admin can enable it by following these steps: + +1. Navigate to **Account settings** in the navigation menu. + +2. Under **Settings**, confirm the account you're enabling. + +3. Click **Edit** in the top right corner. + +4. Enable the **Enable account access to AI-powered features** option. + +5. Click **Save**. You should now have the dbt Copilot AI engine enabled for use. + +Note: To disable (only after enabled), repeat steps 1 to 3, toggle off in step 4, and repeat step 5. + + + +### Bringing your own OpenAI API key (BYOK) + +Once AI features have been enabled, you can provide your organization's OpenAI API key. 
dbt Cloud will then leverage your OpenAI account and terms to power dbt Copilot. This will incur billing charges to your organization from OpenAI for requests made by dbt Copilot. + +Note that Azure OpenAI is not currently supported, but will be in the future. + +A dbt Cloud admin can provide their API key by following these steps: + +1. Navigate to **Account settings** in the side menu. + +2. Find the **Settings** section and click on **Integrations**. + +3. Scroll to **AI** and select the toggle for **OpenAI**. + +4. Enter your API key and click **Save**. \ No newline at end of file diff --git a/website/docs/docs/cloud/use-dbt-assist.md b/website/docs/docs/cloud/use-dbt-assist.md deleted file mode 100644 index 888d5107999..00000000000 --- a/website/docs/docs/cloud/use-dbt-assist.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: "Use dbt Assist" -sidebar_label: "Use dbt Assist" -description: "Use dbt Assist to generate documentation, semantic models, and tests from scratch, giving you the flexibility to modify or fix generated code." ---- - -# Use dbt Assist - -Use dbt Assist to generate documentation, semantic models, and tests from scratch, giving you the flexibility to modify or fix generated code. - -To access and use dbt Assist: - -1. Navigate to the dbt Cloud IDE and select a SQL model file under the **File Explorer**. -2. In the **Console** section (under the **File Editor**), select the **dbt Assist** to view the available AI options. -3. Select the available options to generate the YAML config: **Generate Documentation**, **Generate Tests**, or **Generate Semantic Model**. - - To generate multiple YAML configs for the same model, click each option separately. dbt Assist intelligently saves the YAML config in the same file. -4. Verify the AI-generated code. Update or fix the code if needed. -5. Click **Save** to save the code. You should see the file changes under the **Version control** section. 
- - diff --git a/website/docs/docs/cloud/use-dbt-copilot.md b/website/docs/docs/cloud/use-dbt-copilot.md new file mode 100644 index 00000000000..30def967f96 --- /dev/null +++ b/website/docs/docs/cloud/use-dbt-copilot.md @@ -0,0 +1,22 @@ +--- +title: "Use dbt Copilot" +sidebar_label: "Use dbt Copilot" +description: "Use the dbt Copilot AI engine to generate documentation, tests, and semantic models from scratch, giving you the flexibility to modify or fix generated code." +--- + +# Use dbt Copilot + +Use dbt Copilot to generate documentation, tests, and semantic models from scratch, giving you the flexibility to modify or fix generated code. To access and use this AI engine: + +1. Navigate to the dbt Cloud IDE and select a SQL model file under the **File Explorer**. + +2. In the **Console** section (under the **File Editor**), click **dbt Copilot** to view the available AI options. + +3. Select the available options to generate the YAML config: **Generate Documentation**, **Generate Tests**, or **Generate Semantic Model**. + - To generate multiple YAML configs for the same model, click each option separately. dbt Copilot intelligently saves the YAML config in the same file. + +4. Verify the AI-generated code. You can update or fix the code as needed. + +5. Click **Save As**. You should see the file changes under the **Version control** section. + + diff --git a/website/docs/docs/collaborate/auto-exposures.md b/website/docs/docs/collaborate/auto-exposures.md index 2b1d649abd1..9b25a2fb305 100644 --- a/website/docs/docs/collaborate/auto-exposures.md +++ b/website/docs/docs/collaborate/auto-exposures.md @@ -7,7 +7,7 @@ pagination_next: "docs/collaborate/data-tile" image: /img/docs/cloud-integrations/auto-exposures/explorer-lineage.jpg --- -# Auto-exposures +# Auto-exposures As a data team, it’s critical that you have context into the downstream use cases and users of your data products. 
Auto-exposures integrates natively with Tableau (Power BI coming soon) and auto-generates downstream lineage in dbt Explorer for a richer experience. diff --git a/website/docs/docs/collaborate/explore-projects.md b/website/docs/docs/collaborate/explore-projects.md index 9e27c2afa47..a4388a8696e 100644 --- a/website/docs/docs/collaborate/explore-projects.md +++ b/website/docs/docs/collaborate/explore-projects.md @@ -20,7 +20,7 @@ import ExplorerCourse from '/snippets/_explorer-course-link.md'; - You have at least one successful job run in the deployment environment. Note that [CI jobs](/docs/deploy/ci-jobs) do not update dbt Explorer. - You are on the dbt Explorer page. To do this, select **Explore** from the navigation in dbt Cloud. -## Overview page +## Overview page Navigate the dbt Explorer overview page to access your project's resources and metadata. The page includes the following sections: diff --git a/website/docs/docs/collaborate/govern/model-contracts.md b/website/docs/docs/collaborate/govern/model-contracts.md index b07ce909480..d30024157c8 100644 --- a/website/docs/docs/collaborate/govern/model-contracts.md +++ b/website/docs/docs/collaborate/govern/model-contracts.md @@ -178,14 +178,14 @@ Currently, `not_null` and `check` constraints are enforced only after a model is ### Which models should have contracts? Any model meeting the criteria described above _can_ define a contract. We recommend defining contracts for ["public" models](model-access) that are being relied on downstream. -- Inside of dbt: Shared with other groups, other teams, and (in the future) other dbt projects. +- Inside of dbt: Shared with other groups, other teams, and [other dbt projects](/best-practices/how-we-mesh/mesh-1-intro). - Outside of dbt: Reports, dashboards, or other systems & processes that expect this model to have a predictable structure. You might reflect these downstream uses with [exposures](/docs/build/exposures). ### How are contracts different from tests? 
A model's contract defines the **shape** of the returned dataset. If the model's logic or input data doesn't conform to that shape, the model does not build. -[Data Tests](/docs/build/data-tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the data test. Data tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). +[Data Tests](/docs/build/data-tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the data test. Data tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). In some cases, you can replace a data test with its equivalent constraint. This has the advantage of guaranteeing the validation at build time, and it probably requires less compute (cost) in your data platform. The prerequisites for replacing a data test with a constraint are: - Making sure that your data platform can support and enforce the constraint that you need. Most platforms only enforce `not_null`. 
diff --git a/website/docs/docs/core/connect-data-platform/athena-setup.md b/website/docs/docs/core/connect-data-platform/athena-setup.md index 9780e86de88..825d3071ad2 100644 --- a/website/docs/docs/core/connect-data-platform/athena-setup.md +++ b/website/docs/docs/core/connect-data-platform/athena-setup.md @@ -7,7 +7,7 @@ meta: github_repo: 'dbt-labs/dbt-athena' pypi_package: 'dbt-athena-community' min_core_version: 'v1.3.0' - cloud_support: Not Supported + cloud_support: Supported min_supported_version: 'engine version 2 and 3' slack_channel_name: '#db-athena' slack_channel_link: 'https://getdbt.slack.com/archives/C013MLFR7BQ' diff --git a/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md b/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md index 8a4d6b61004..0a0347df9ea 100644 --- a/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md +++ b/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md @@ -7,7 +7,7 @@ meta: github_repo: 'Microsoft/dbt-synapse' pypi_package: 'dbt-synapse' min_core_version: 'v0.18.0' - cloud_support: Not Supported + cloud_support: Supported min_supported_version: 'Azure Synapse 10' slack_channel_name: '#db-synapse' slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' diff --git a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md index 1c4d5f387e9..e4e2a91791d 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md +++ b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md @@ -43,15 +43,9 @@ plan="dbt Cloud Team or Enterprise" icon="dbt-bit"/> - - diff --git a/website/docs/docs/dbt-cloud-apis/sl-python-sdk.md b/website/docs/docs/dbt-cloud-apis/sl-python-sdk.md index 901b6bf179a..e34a44a5a57 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-python-sdk.md +++ b/website/docs/docs/dbt-cloud-apis/sl-python-sdk.md @@ -7,7 +7,6 @@ keywords: [dbt Cloud, API, dbt Semantic Layer, python, sdk] sidebar_label: "Python 
SDK" --- -# Python SDK The [`dbt-sl-sdk` Python software development kit](https://github.com/dbt-labs/semantic-layer-sdk-python) (SDK) is a Python library that provides you with easy access to the dbt Semantic Layer with Python. It allows developers to interact with the dbt Semantic Layer APIs and query metrics and dimensions in downstream tools. ## Installation diff --git a/website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.9.md b/website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.9.md index cf9b9eaed4e..aaa85e4ecef 100644 --- a/website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.9.md +++ b/website/docs/docs/dbt-versions/core-upgrade/06-upgrading-to-v1.9.md @@ -42,7 +42,7 @@ Historically, managing incremental models involved several manual steps and resp While this works for many use-cases, there’s a clear limitation with this approach: *Some datasets are just too big to fit into one query.* -Starting in Core 1.9, you can use the new microbatch strategy to optimize your largest datasets -- **process your event data in discrete periods with their own SQL queries, rather than all at once.** The benefits include: +Starting in Core 1.9, you can use the new [microbatch strategy](/docs/build/incremental-microbatch#what-is-microbatch-in-dbt) to optimize your largest datasets -- **process your event data in discrete periods with their own SQL queries, rather than all at once.** The benefits include: - Simplified query design: Write your model query for a single batch of data. dbt will use your `event_time`, `lookback`, and `batch_size` configurations to automatically generate the necessary filters for you, making the process more streamlined and reducing the need for you to manage these details. - Independent batch processing: dbt automatically breaks down the data to load into smaller batches based on the specified `batch_size` and processes each batch independently, improving efficiency and reducing the risk of query timeouts. 
If some of your batches fail, you can use `dbt retry` to load only the failed batches. @@ -107,6 +107,6 @@ You can read more about each of these behavior changes in the following links: We also made some quality-of-life improvements in Core 1.9, enabling you to: - Maintain data quality now that dbt returns an error (versioned models) or warning (unversioned models) when someone [removes a contracted model by deleting, renaming, or disabling](/docs/collaborate/govern/model-contracts#how-are-breaking-changes-handled) it. -- Document [singular data tests](/docs/build/data-tests#document-singular-tests). +- Document [singular data tests](/docs/build/data-tests#singular-data-tests). - Use `ref` and `source` in [foreign key constraints](/reference/resource-properties/constraints). - Use `dbt test` with the `--resource-type` / `--exclude-resource-type` flag, making it possible to include or exclude data tests (`test`) or unit tests (`unit_test`). diff --git a/website/docs/docs/dbt-versions/release-notes.md b/website/docs/docs/dbt-versions/release-notes.md index 96e8a7de37a..662fd0f381a 100644 --- a/website/docs/docs/dbt-versions/release-notes.md +++ b/website/docs/docs/dbt-versions/release-notes.md @@ -20,13 +20,39 @@ Release notes are grouped by month for both multi-tenant and virtual private clo ## October 2024 + + + Documentation for new features and functionality announced at Coalesce 2024: + + - Iceberg table support for [Snowflake](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#iceberg-table-format) + - [Athena](https://docs.getdbt.com/reference/resource-configs/athena-configs) and [Teradata](https://docs.getdbt.com/reference/resource-configs/teradata-configs) adapter support in dbt Cloud + - dbt Cloud now hosted on [Azure](https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses) + - Get comfortable with [Versionless dbt Cloud](https://docs.getdbt.com/docs/dbt-versions/versionless-cloud) + - Scalable [microbatch 
incremental models](https://docs.getdbt.com/docs/build/incremental-microbatch) + - Advanced CI [features](https://docs.getdbt.com/docs/deploy/advanced-ci) + - [Linting with CI jobs](https://docs.getdbt.com/docs/deploy/continuous-integration#sql-linting) + - dbt Assist is now [dbt Copilot](https://docs.getdbt.com/docs/cloud/dbt-copilot) + - Developer blog on [Snowflake Feature Store and dbt: A bridge between data pipelines and ML](https://docs.getdbt.com/blog/snowflake-feature-store) + - New [Quickstart for dbt Cloud CLI](https://docs.getdbt.com/guides/dbt-cloud-cli?step=1) + - [Auto-exposures with Tableau](https://docs.getdbt.com/docs/collaborate/auto-exposures) + - Semantic Layer integration with [Excel desktop and M365](https://docs.getdbt.com/docs/cloud-integrations/semantic-layer/excel) + - [Data health tiles](https://docs.getdbt.com/docs/collaborate/data-tile) + - [Semantic Layer and Cloud IDE integration](https://docs.getdbt.com/docs/build/metricflow-commands#metricflow-commands) + - Query history in [Explorer](https://docs.getdbt.com/docs/collaborate/model-query-history#view-query-history-in-explorer) + - Semantic Layer Metricflow improvements, including [improved granularity and custom calendar](https://docs.getdbt.com/docs/build/metricflow-time-spine#custom-calendar) + - [Python SDK](https://docs.getdbt.com/docs/dbt-cloud-apis/sl-python) is now generally available + + + + +- **New**: The [dbt Semantic Layer Python software development kit](/docs/dbt-cloud-apis/sl-python) is now [generally available](/docs/dbt-versions/product-lifecycles). It provides users with easy access to the dbt Semantic Layer with Python and enables developers to interact with the dbt Semantic Layer APIs to query metrics/dimensions in downstream tools. - **Enhancement**: You can now add a description to a singular data test in dbt Cloud Versionless. 
Use the [`description` property](/reference/resource-properties/description) to document [singular data tests](/docs/build/data-tests#singular-data-tests). You can also use [docs block](/docs/build/documentation#using-docs-blocks) to capture your test description. The enhancement will be included in upcoming dbt Core 1.9 release. - **New**: Introducing the [microbatch incremental model strategy](/docs/build/incremental-microbatch) (beta), available in dbt Cloud Versionless and will soon be supported in dbt Core 1.9. The microbatch strategy allows for efficient, batch-based processing of large time-series datasets for improved performance and resiliency, especially when you're working with data that changes over time (like new records being added daily). To enable this feature in dbt Cloud, set the `DBT_EXPERIMENTAL_MICROBATCH` environment variable to `true` in your project. - **New**: The dbt Semantic Layer supports custom calendar configurations in MetricFlow, available in [Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud). Custom calendar configurations allow you to query data using non-standard time periods like `fiscal_year` or `retail_month`. Refer to [custom calendar](/docs/build/metricflow-time-spine#custom-calendar) to learn how to define these custom granularities in your MetricFlow timespine YAML configuration. - **New**: In dbt Cloud Versionless, [Snapshots](/docs/build/snapshots) have been updated to use YAML configuration files instead of SQL snapshot blocks. This new feature simplifies snapshot management and improves performance, and will soon be released in dbt Core 1.9. - Who does this affect? New user on Versionless can define snapshots using the new YAML specification. Users upgrading to Versionless who use snapshots can keep their existing configuration or can choose to migrate their snapshot definitions to YAML. - Users on dbt 1.8 and earlier: No action is needed; existing snapshots will continue to work as before. 
However, we recommend upgrading to Versionless to take advantage of the new snapshot features. -- **Behavior change:** Set [`state_modified_compare_more_unrendered`](/reference/global-configs/behavior-changes#source-definitions-for-state) to true to reduce false positives for `state:modified` when configs differ between `dev` and `prod` environments. +- **Behavior change:** Set [`state_modified_compare_more_unrendered_values`](/reference/global-configs/behavior-changes#source-definitions-for-state) to true to reduce false positives for `state:modified` when configs differ between `dev` and `prod` environments. - **Behavior change:** Set the [`skip_nodes_if_on_run_start_fails`](/reference/global-configs/behavior-changes#failures-in-on-run-start-hooks) flag to `True` to skip all selected resources from running if there is a failure on an `on-run-start` hook. - **Enhancement**: In dbt Cloud Versionless, snapshots defined in SQL files can now use `config` defined in `schema.yml` YAML files. This update resolves the previous limitation that required snapshot properties to be defined exclusively in `dbt_project.yml` and/or a `config()` block within the SQL file. This will also be released in dbt Core 1.9. - **New**: In dbt Cloud Versionless, the `snapshot_meta_column_names` config allows for customizing the snapshot metadata columns. This feature allows an organization to align these automatically-generated column names with their conventions, and will be included in the upcoming dbt Core 1.9 release. @@ -37,7 +63,7 @@ Release notes are grouped by month for both multi-tenant and virtual private clo ## September 2024 -- **New**: Use dbt Assist's co-pilot feature to generate semantic model for your models, now available in beta. dbt Assist automatically generates documentation, tests, and now semantic models based on the data in your model, . To learn more, refer to [dbt Assist](/docs/cloud/dbt-assist). 
+- **New**: Use the dbt Copilot AI engine to generate semantic models for your models, now available in beta. dbt Copilot automatically generates documentation, tests, and now semantic models based on the data in your model. To learn more, refer to [dbt Copilot](/docs/cloud/dbt-copilot). - **New**: Use the new recommended syntax for [defining `foreign_key` constraints](/reference/resource-properties/constraints) using `refs`, available in dbt Cloud Versionless. This will soon be released in dbt Core v1.9. This new syntax will capture dependencies and works across different environments. - **Enhancement**: You can now run [Semantic Layer commands](/docs/build/metricflow-commands) commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). The supported commands are `dbt sl list`, `dbt sl list metrics`, `dbt sl list dimension-values`, `dbt sl list saved-queries`, `dbt sl query`, `dbt sl list dimensions`, `dbt sl list entities`, and `dbt sl validate`. - **New**: Microsoft Excel, a dbt Semantic Layer integration, is now generally available. The integration allows you to connect to Microsoft Excel to query metrics and collaborate with your team. Available for [Excel Desktop](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationId=4132ecd1-425d-982d-efb4-de94ebc83f26) or [Excel Online](https://pages.store.office.com/addinsinstallpage.aspx?assetid=WA200007100&rs=en-US&correlationid=4132ecd1-425d-982d-efb4-de94ebc83f26&isWac=True). For more information, refer to [Microsoft Excel](/docs/cloud-integrations/semantic-layer/excel). 
@@ -108,7 +134,7 @@ Release notes are grouped by month for both multi-tenant and virtual private clo The following features are new or enhanced as part of our [dbt Cloud Launch Showcase](https://www.getdbt.com/resources/webinars/dbt-cloud-launch-showcase) event on May 14th, 2024: -- **New:** [dbt Assist](/docs/cloud/dbt-assist) is a powerful AI feature helping you generate documentation and tests, saving you time as you deliver high-quality data. Available in private beta for a subset of dbt Cloud Enterprise users and in the dbt Cloud IDE. [Register your interest](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) to join the private beta. +- **New:** [dbt Copilot](/docs/cloud/dbt-copilot) is a powerful AI engine helping you generate documentation, tests, and semantic models, saving you time as you deliver high-quality data. Available in private beta for a subset of dbt Cloud Enterprise users and in the dbt Cloud IDE. [Register your interest](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) to join the private beta. - **New:** The new low-code editor, now in private beta, enables less SQL-savvy analysts to create or edit dbt models through a visual, drag-and-drop experience inside of dbt Cloud. These models compile directly to SQL and are indistinguishable from other dbt models in your projects: they are version-controlled, can be accessed across projects in dbt Mesh, and integrate with dbt Explorer and the Cloud IDE. [Register your interest](https://docs.google.com/forms/d/e/1FAIpQLScPjRGyrtgfmdY919Pf3kgqI5E95xxPXz-8JoVruw-L9jVtxg/viewform) to join the private beta. 
diff --git a/website/docs/guides/qs-cloud-cli.md b/website/docs/guides/qs-cloud-cli.md new file mode 100644 index 00000000000..1e2a548114f --- /dev/null +++ b/website/docs/guides/qs-cloud-cli.md @@ -0,0 +1,313 @@ +--- +title: "Coalesce: Quickstart for dbt Cloud CLI" +id: "dbt-cloud-cli" +# time_to_complete: '30 minutes' commenting out until we test +level: 'Beginner' +icon: 'guides' +hide_table_of_contents: true +tags: ['Cloud CLI', 'dbt Cloud','Quickstart'] +recently_updated: true +--- + +
+ +## Introduction + +In this quickstart guide, you'll learn how to configure and use dbt Cloud CLI as part of the Coalesce 24 Workshop. + +It will show you how to: + +- Set up a dbt Cloud sandbox. +- Install the dbt Cloud CLI and connect to dbt Cloud. +- Run commands locally using the dbt Cloud CLI. +- Defer to different production environments. +- Leverage cross-project ref. +- Install dbt Power User. +- Use dbt Power User to supercharge development. + +### Prerequisites + +- Familiarity with dbt projects and common commands (for example, `dbt build`) +- Git is installed +- An editor, such as Visual Studio Code (preferred), is installed + +### Related content + +- Learn more with [dbt Learn courses](https://learn.getdbt.com) + +## Install Git and Visual Studio Code (Prerequisites) + +You will need to have Git installed locally and a code editor (preferably Visual Studio Code). + +### Check your installation status + +Run `git --version` in your terminal to check if it's installed. For example: + +
+ +
+ +Check your installed applications for Visual Studio Code (vscode) or another editor. For example: + +
+ +
+ +### Install Git and Visual Studio Code + +Navigate to the following Git installation page and install it for your operating system: + +https://git-scm.com/downloads + +Navigate to the following Visual Studio Code installation page and install it for your operating system. + +https://code.visualstudio.com/download + +## Set up dbt Cloud (Coalesce Workshop Only) + +Let's get set up with a dbt Cloud sandbox that's already connected to a Snowflake account for the workshop. + +1. Go to [bit.ly/coalesce-24-sandboxes](https://bit.ly/coalesce-24-sandboxes) to create an account. Make sure you log out of any other dbt Cloud accounts. + + a. Enter your **First Name** and **Last Name** + + b. For **Workshop**, choose **Test driving dbt Cloud CLI and dbt power user** from the dropdown + + c. The **Passcode** will be provided by your facilitators + + d. Accept the terms and click **Complete Registration** + +1. Navigate to the platform project by selecting **Project** from the left sidebar and choosing **Platform Analytics**. + +1. Select **Deploy >> Runs** to find the created jobs. For each job, click on the job and click **run**. + +1. Now repeat for the **Analytics project**. Toggle into the Analytics project. + +1. Select **Deploy >> Runs** to find the created jobs. For the one job, click on the job and click **run**. + +1. Select **Explore** from the navigation and choose XX. Now you can visualize your dbt Mesh. Click into each project to see project-level lineage. + +You've now successfully run your project in deployment environments so you can use cross-project ref and deferral later in the workshop. + +## Configure dbt Cloud CLI + +Now we'll clone the project repository and configure dbt Cloud CLI to connect to your sandbox. + +### Clone the repo + +1. Navigate to a folder on your computer to clone the repository. + +1. 
In your terminal, run the following command to clone the downstream (analytics) project: + + ```shell + git clone https://github.com/dbt-labs/c24-workshops-analytics.git + ``` + +### Install Cloud CLI + +1. In dbt Cloud, select Platform Analytics and choose **Develop >> Configure Cloud CLI**. + +1. Based on your current local setup, use the following guidance to determine your installation approach: + + a. Check if you have dbt in your PATH by running `dbt --version` + + b. If you don't have dbt in your PATH, we recommend the macOS or Windows installation method. + + c. If you do have dbt in your PATH (global environment), we recommend: + 1. Uninstalling dbt globally + 2. Installing dbt Cloud CLI with a Python virtual environment + + d. If you have dbt in a virtual environment, install dbt Cloud CLI with a separate Python virtual environment. Be sure to activate it with `source <path-to-env>/bin/activate`. + +1. Download the CLI configuration file from the dbt Cloud UI. Save it in your `.dbt` folder. + +1. Navigate to the dbt project folder that you cloned earlier and open the `dbt_project.yml` file with your `project_id`. + +### Confirm the installation + +Run `dbt compile` to verify your installation. + +There you go! You've installed the dbt Cloud CLI! Let's dive in! 
+ +### Additional resources +Consult the following docs if you run into problems when trying to install the dbt Cloud CLI: +- [Install dbt Cloud CLI](https://docs.getdbt.com/docs/cloud/cloud-cli-installation) +- [Configure and use dbt Cloud CLI](https://docs.getdbt.com/docs/cloud/configure-cloud-cli) + +## Leverage dbt Cloud CLI + +Let's run a few commands together to get comfortable with the dbt Cloud CLI: +* `dbt debug` — Displays your connection details and information +* `dbt compile --select stg_campaigns` — Compiles your dbt project +* `dbt run --select stg_campaigns` — Materializes your dbt models +* `dbt show --select stg_campaigns` — Previews the results of a model +* `dbt test --select stg_campaigns` — Executes tests against your materialized models + +Now let's dive into some more advanced components of dbt Cloud CLI. + +### Deferral + +Deferral is a powerful functionality, allowing you to leverage upstream assets that exist outside of your personal development environment. As a result, you can speed up your development workflows and save on warehouse compute costs. Let's run a few commands using deferral: + +1. Run `dbt compile -s stg_campaigns`. Notice how we're able to resolve dependencies in the compiled SQL without seeding `campaigns.csv`. +1. Now let's modify the `stg_campaigns` model by adding a timestamp: + ```sql + current_timestamp() as updated_at + ``` + + Let's build that model with the next command. +1. Run `dbt build --select stg_campaigns`. We're utilizing deferral and the concept of "statefulness" to check which objects have been modified and resolve dependencies of upstream assets if they exist. + + By default, the dbt Cloud CLI defers to a [Staging](https://docs.getdbt.com/docs/deploy/deploy-environments#staging-environment) environment if one exists. If not, dbt uses the assets from the Production environment. 
+ + To override which environment the dbt Cloud CLI defers to, you can set a `defer-env-id` key in either your `dbt_project.yml` or `dbt_cloud.yml` file. For example: + + ```yml + dbt-cloud: + defer-env-id: '123456' + ``` + +### dbt Mesh + +You have access to cross-project ref's that's powered by the metadata of dbt Cloud. + +1. Open the `agg_campaign_customer_contacts` model. +1. Find the reference called `{{ ref('platform', 'dim_customers', v=1) }}`. +1. Run the command: + + ```shell + dbt run --select agg_campaign_customer_contacts + ``` + +1. Navigate to dbt Cloud Explorer and find a public model. Let's use the `fct_order_items` model. +1. Create a new model called `agg_orders` in your project with the following code: + + ```sql + with orders as ( + + select * from {{ ref('platform', 'fct_order_items') }} + + ), + + final as ( + + select + customer_key as customer_id, + is_return as return_status, + count(*) as count_orders + + from + orders + group by + customer_key, + is_return + ) + + select * from final + ``` + +### Linting and fixing SQL files + +With SQLFluff built in, you can check your code against a style guide and automatically make fixes. + +1. Run the SQLFluff command `lint`: + + ```shell + dbt sqlfluff lint models/staging/campaigns/stg_campaigns.sql --dialect snowflake + ``` + + This identifies tweaks to make in the `stg_campaigns` model. +2. Run the SQLFluff command `fix`: + + ```shell + dbt sqlfluff fix models/staging/campaigns/stg_campaigns.sql --dialect snowflake + ``` + + This attempts to directly make fixes in the `stg_campaigns` model. + +### Change branches + +You can quickly change branches without fully pushing to your Git provider (such as GitHub): + +```shell +git checkout -b my-new-branch + +git checkout main +``` + +Now you've taken a tour of what you can do with dbt Cloud CLI. Let's dive into dbt Power User next. + +## Install dbt Power User + +Let's get dbt Power User installed to supercharge our workflow. + +1. 
From Visual Studio Code, click on extensions and search for "Power User for dbt". + +
+ +
+1. Click on install. +1. Click **Switch to dbt Cloud**. You might need to refresh. +
+ +
+1. Complete the setup steps. (Click on welcome in Visual Studio Code and choose dbt Power User.) +
+ +
+1. Create an account to get an API key: https://app.myaltimate.com/register + +1. Copy your API key and enter it into the dbt Power User extension settings. + +Now let's dive in! + +## Leverage dbt Power User + +There is a ton you can do to supercharge your workflow with dbt Cloud. Let's cover some highlights. + +### Preview your upstream/downstream changes + +Open the Power User extension on the left-hand side. You can see the upstream and downstream projects. + +
+ +
+ +### Preview results + +Press Command-Enter (or Control-Enter for Windows) and instantly see the results of your model below. + +
+ +
+ +### SQL visualization + +While looking at a model file, click the Altimate logo in the top right and click **Visualize SQL** to see a breakdown of your SQL model. + +
+ +
+ +### Generate test and documentation YML with user-friendly UX and AI + +At the top of your model file, click on generate documentation to open a UI where you can rapidly create documentation and tests with AI. + +
+ +
+ +There is a whole lot more too! Check out the dbt Power User docs here: https://docs.myaltimate.com/ + +## Conclusion + +You've successfully installed dbt Cloud CLI and dbt Power User! Now you can get the benefits of local development _and_ dbt Cloud working together. + +Be on the lookout for the following enhancements to dbt Cloud CLI: +- Deeper integration with dbt Explorer for visual interaction +- Support for invoking production jobs directly from the CLI +- Continued optimization for performance and scalability improvements + +
+ + diff --git a/website/docs/guides/teradata-qs.md b/website/docs/guides/teradata-qs.md new file mode 100644 index 00000000000..da951620515 --- /dev/null +++ b/website/docs/guides/teradata-qs.md @@ -0,0 +1,400 @@ +--- +title: "Quickstart for dbt Cloud and Teradata" +id: "teradata" +level: 'Beginner' +icon: 'teradata' +tags: ['dbt Cloud','Quickstart','Teradata'] +hide_table_of_contents: true +--- + +
+ +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with Teradata Vantage. It will show you how to: + +- Create a new Teradata Clearscape instance +- Load sample data into your Teradata Database +- Connect dbt Cloud to Teradata. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add tests to your models. +- Document your models. +- Schedule a job to run. + +:::tip Videos for you +You can check out [dbt Fundamentals](https://learn.getdbt.com/courses/dbt-fundamentals) for free if you're interested in course learning with videos. +::: + +### Prerequisites​ + +- You have a [dbt Cloud account](https://www.getdbt.com/signup/). +- You have access to a Teradata Vantage instance. You can provision one for free at https://clearscape.teradata.com. See [the ClearScape Analytics Experience guide](https://developers.teradata.com/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae/) for details. + +### Related content + +- Learn more with [dbt Learn courses](https://learn.getdbt.com) +- [How we provision Teradata Clearscape Vantage instance](https://developers.teradata.com/quickstarts/get-access-to-vantage/clearscape-analytics-experience/getting-started-with-csae/) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + +## Load data + +The following steps will guide you through how to get the data stored as CSV files in a public S3 bucket and insert it into the tables. + +:::tip SQL IDE + +If you created your Teradata Vantage database instance at https://clearscape.teradata.com and you don't have an SQL IDE handy, use the JupyterLab bundled with your database to execute SQL: + +1. 
Navigate to [ClearScape Analytics Experience dashboard](https://clearscape.teradata.com/dashboard) and click the **Run Demos** button. The demo will launch JupyterLab. + +2. In JupyterLab, go to **Launcher** by clicking the blue **+** icon in the top left corner. Find the **Notebooks** section and click **Teradata SQL**. + +3. In the notebook's first cell, connect to the database using `connect` magic. You will be prompted to enter your database password when you execute it: + ```ipynb + %connect local + ``` +4. Use additional cells to type and run SQL statements. + +::: + +1. Use your preferred SQL IDE to create two databases: `jaffle_shop` and `stripe`: + + ```sql + CREATE DATABASE jaffle_shop AS PERM = 1e9; + CREATE DATABASE stripe AS PERM = 1e9; + ``` + +2. In the databases `jaffle_shop` and `stripe`, create three foreign tables and reference the respective CSV files located in object storage: + + ```sql + CREATE FOREIGN TABLE jaffle_shop.customers ( + id integer, + first_name varchar (100), + last_name varchar (100) + ) + USING ( + LOCATION ('/s3/dbt-tutorial-public.s3.amazonaws.com/jaffle_shop_customers.csv') + ) + NO PRIMARY INDEX; + + CREATE FOREIGN TABLE jaffle_shop.orders ( + id integer, + user_id integer, + order_date date, + status varchar(100) + ) + USING ( + LOCATION ('/s3/dbt-tutorial-public.s3.amazonaws.com/jaffle_shop_orders.csv') + ) + NO PRIMARY INDEX; + + CREATE FOREIGN TABLE stripe.payment ( + id integer, + orderid integer, + paymentmethod varchar (100), + status varchar (100), + amount integer, + created date + ) + USING ( + LOCATION ('/s3/dbt-tutorial-public.s3.amazonaws.com/stripe_payments.csv') + ) + NO PRIMARY INDEX; + ``` + +## Connect dbt Cloud to Teradata + +1. Create a new project in dbt Cloud. From **Account settings** (using the gear menu in the top right corner), click **New Project**. +2. Enter a project name and click **Continue**. +3. In **Configure your development environment**, click **Add new connection**. +4. 
Select **Teradata**, fill in all the required details in the **Settings** section, and test the connection. + + + + + +5. Enter your **Development Credentials** for Teradata with: + * **Username** — The username of Teradata database. + * **Password** — The password of Teradata database. + * **Schema** — The default database to use + + + +6. Click **Test Connection** to verify that dbt Cloud can access your Teradata Vantage instance. +7. If the connection test succeeds, click **Next**. If it fails, check your Teradata settings and credentials. + +## Set up a dbt Cloud managed repository + + + +## Initialize your dbt project​ and start developing + +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize your project** to build out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` to create the first commit to your managed repo. Once you’ve created the commit, you can open a branch to add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. You can try this out now: + - Click **Create new file**, add this query to the new file, and click **Save as** to save the new file: + ```sql + select * from jaffle_shop.customers + ``` + - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model + +You have two options for working with files in the dbt Cloud IDE: + +- Create a new branch (recommended) — Create a new branch to edit and commit your changes. 
Navigate to **Version Control** on the left sidebar and click **Create branch**. +- Edit in the protected primary branch — If you prefer to edit, format, lint files, or execute dbt commands directly in your primary git branch. The dbt Cloud IDE prevents commits to the protected branch, so you will receive a prompt to commit your changes to a new branch. + +Name the new branch `add-customers-model`. + +1. Click the **...** next to the `models` directory, then select **Create file**. +2. Name the file `customers.sql`, then click **Create**. +3. Copy the following query into the file and click **Save**. + +```sql + +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from jaffle_shop.customers + +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop.orders + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final + +``` + +4. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +You can connect your business intelligence (BI) tools to these views and tables so they only read cleaned-up data rather than raw data in your BI tool. + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in your original query. +2. 
Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in your original query. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from jaffle_shop.customers + ``` + + + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop.orders + ``` + + + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, it created separate views/tables for `stg_customers`, `stg_orders`, and `customers`. dbt inferred the order in which these models should run. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You don’t need to define these dependencies explicitly. + +#### FAQs {#faq-2} + + + + + +## Build models on top of sources + +Sources make it possible to name and describe the data loaded into your warehouse by your extract and load tools. By declaring these tables as sources in dbt, you can: +- Select from source tables in your models using the `{{ source() }}` function, helping define the lineage of your data +- Test your assumptions about your source data +- Calculate the freshness of your source data + +1. 
Create a new YML file, `models/sources.yml`. +2. Declare the sources by copying the following into the file and clicking **Save**. + + + + ```yml + version: 2 + + sources: + - name: jaffle_shop + description: This is a replica of the Postgres database used by the app + database: raw + schema: jaffle_shop + tables: + - name: customers + description: One record per customer. + - name: orders + description: One record per order. Includes canceled and deleted orders. + ``` + + + +3. Edit the `models/stg_customers.sql` file to select from the `customers` table in the `jaffle_shop` source. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from {{ source('jaffle_shop', 'customers') }} + ``` + + + +4. Edit the `models/stg_orders.sql` file to select from the `orders` table in the `jaffle_shop` source. + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from {{ source('jaffle_shop', 'orders') }} + ``` + + + +5. Execute `dbt run`. + + Your `dbt run` results will be the same as those in the previous step. Your `stg_customers` and `stg_orders` + models will still query from the same raw data source in Teradata. By using `source`, you can + test and document your raw data and also understand the lineage of your sources. + + +
+ + + + diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md index c38cc2768e1..b8998dba261 100644 --- a/website/docs/reference/artifacts/dbt-artifacts.md +++ b/website/docs/reference/artifacts/dbt-artifacts.md @@ -22,7 +22,7 @@ dbt has produced artifacts since the release of dbt-docs in v0.11.0. Starting in ### When are artifacts produced? Most dbt commands (and corresponding RPC methods) produce artifacts: -- [semantic manifest](/docs/dbt-cloud-apis/sl-manifest): produced whenever your dbt project is parsed +- [semantic manifest](/reference/artifacts/sl-manifest): produced whenever your dbt project is parsed - [manifest](/reference/artifacts/manifest-json): produced by commands that read and understand your project - [run results](/reference/artifacts/run-results-json): produced by commands that run, compile, or catalog nodes in your DAG - [catalog](catalog-json): produced by `docs generate` diff --git a/website/docs/reference/artifacts/other-artifacts.md b/website/docs/reference/artifacts/other-artifacts.md index 0216acccff0..e37662ae28c 100644 --- a/website/docs/reference/artifacts/other-artifacts.md +++ b/website/docs/reference/artifacts/other-artifacts.md @@ -39,7 +39,7 @@ Each of those points in time contains the `name` and `type` of each node and `su ### semantic_manifest.json -The [`semantic_manifest.json`](/docs/dbt-cloud-apis/sl-manifest) file is useful as an internal interface between `dbt-core` and MetricFlow. As such, it functions as a behind-the-scenes bridge for interaction between the two systems. You can find all of the `semantic_manifest.json` information in the [`semantic_manifest.json`](/docs/dbt-cloud-apis/sl-manifest). +The [`semantic_manifest.json`](/reference/artifacts/sl-manifest) file is useful as an internal interface between `dbt-core` and MetricFlow. As such, it functions as a behind-the-scenes bridge for interaction between the two systems. 
You can find all of the `semantic_manifest.json` information in the [`semantic_manifest.json`](/reference/artifacts/sl-manifest). There are two reasons why `semantic_manifest.json` exists alongside `manifest.json`: diff --git a/website/docs/docs/dbt-cloud-apis/sl-manifest.md b/website/docs/reference/artifacts/sl-manifest.md similarity index 90% rename from website/docs/docs/dbt-cloud-apis/sl-manifest.md rename to website/docs/reference/artifacts/sl-manifest.md index d5bcf5a6774..03e661841c4 100644 --- a/website/docs/docs/dbt-cloud-apis/sl-manifest.md +++ b/website/docs/reference/artifacts/sl-manifest.md @@ -7,26 +7,24 @@ sidebar_label: "Semantic manifest" pagination_next: null --- +**Produced by:** Any command that parses your project. This includes all commands _except_ [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), and [`init`](/reference/commands/init). + dbt creates an [artifact](/reference/artifacts/dbt-artifacts) file called the _Semantic Manifest_ (`semantic_manifest.json`), which MetricFlow requires to build and run metric queries properly for the dbt Semantic Layer. This artifact contains comprehensive information about your dbt Semantic Layer. It is an internal file that acts as the integration point with MetricFlow. By using the semantic manifest produced by dbt Core, MetricFlow will instantiate a data flow plan and generate SQL from Semantic Layer query requests. It's a valuable reference that you can use to understand the structure and details of your data models. Similar to the [`manifest.json` file](/reference/artifacts/manifest-json), the `semantic_manifest.json` file also lives in the [target directory](/reference/global-configs/json-artifacts) of your dbt project where dbt stores various artifacts (such as compiled models and tests) generated during the execution of your project. 
-## How it's produced - -Just like `manifest.json`, the `semantic_manifest.json` is produced whenever your dbt project is parsed. All dbt commands will parse your project and create a `semantic_manifest.json` file, _except_ [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), and [`init`](/reference/commands/init). - - -## Top level keys +## Top-level keys Top-level keys for the semantic manifest are: - `semantic_models` — Starting points of data with entities, dimensions, and measures, and correspond to models in your dbt project. - `metrics` — Functions combining measures, constraints, and so on to define quantitative indicators. - `project_configuration` — Contains information around your project configurations -
-Example target/semantic_manifest.json file +### Example + + ```json { @@ -112,7 +110,7 @@ Top-level keys for the semantic manifest are: } ``` -
+ ## Related docs diff --git a/website/docs/reference/global-configs/behavior-changes.md b/website/docs/reference/global-configs/behavior-changes.md index f7c2344ae05..d35b83765e3 100644 --- a/website/docs/reference/global-configs/behavior-changes.md +++ b/website/docs/reference/global-configs/behavior-changes.md @@ -69,7 +69,7 @@ When we use dbt Cloud in the following table, we're referring to accounts that h | source_freshness_run_project_hooks | 2024.03 | TBD* | 1.8.0 | 1.9.0 | | [Redshift] [restrict_direct_pg_catalog_access](/reference/global-configs/redshift-changes#the-restrict_direct_pg_catalog_access-flag) | 2024.09 | TBD* | dbt-redshift v1.9.0 | 1.9.0 | | skip_nodes_if_on_run_start_fails | 2024.10 | TBD* | 1.9.0 | TBD* | -| state_modified_compare_more_unrendered | 2024.10 | TBD* | 1.9.0 | TBD* | +| state_modified_compare_more_unrendered_values | 2024.10 | TBD* | 1.9.0 | TBD* | When the dbt Cloud Maturity is "TBD," it means we have not yet determined the exact date when these flags' default values will change. Affected users will see deprecation warnings in the meantime, and they will receive emails providing advance warning ahead of the maturity date. In the meantime, if you are seeing a deprecation warning, you can either: - Migrate your project to support the new behavior, and then set the flag to `True` to stop seeing the warnings. @@ -85,7 +85,7 @@ Set the `skip_nodes_if_on_run_start_fails` flag to `True` to skip all selected r The flag is `False` by default. -Set `state_modified_compare_more_unrendered` to `True` to reduce false positives during `state:modified` checks (especially when configs differ by target environment like `prod` vs. `dev`). +Set `state_modified_compare_more_unrendered_values` to `True` to reduce false positives during `state:modified` checks (especially when configs differ by target environment like `prod` vs. `dev`). 
Setting the flag to `True` changes the `state:modified` comparison from using rendered values to unrendered values instead. It accomplishes this by persisting `unrendered_config` during model parsing and `unrendered_database` and `unrendered_schema` configs during source parsing. diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md index 99dbea401b3..863494de12e 100644 --- a/website/docs/reference/node-selection/defer.md +++ b/website/docs/reference/node-selection/defer.md @@ -31,7 +31,7 @@ dbt test --models [...] --defer --state path/to/artifacts When the `--defer` flag is provided, dbt will resolve `ref` calls differently depending on two criteria: 1. Is the referenced node included in the model selection criteria of the current run? -2. Does the reference node exist as a database object in the current environment? +2. Does the referenced node exist as a database object in the current environment? If the answer to both is **no**—a node is not included _and_ it does not exist as a database object in the current environment—references to it will use the other namespace instead, provided by the state manifest. @@ -71,8 +71,6 @@ group by 1 I want to test my changes. Nothing exists in my development schema, `dev_alice`. 
-### test - +### test + I also have a `relationships` test that establishes referential integrity between `model_a` and `model_b`: diff --git a/website/docs/reference/node-selection/state-comparison-caveats.md b/website/docs/reference/node-selection/state-comparison-caveats.md index 4d5593b7331..25301656539 100644 --- a/website/docs/reference/node-selection/state-comparison-caveats.md +++ b/website/docs/reference/node-selection/state-comparison-caveats.md @@ -46,7 +46,7 @@ dbt test -s "state:modified" --exclude "test_name:relationships" -To reduce false positives during `state:modified` selection due to env-aware logic, you can set the `state_modified_compare_more_unrendered` [behavior flag](/reference/global-configs/behavior-changes#behavior-change-flags) to `True`. +To reduce false positives during `state:modified` selection due to env-aware logic, you can set the `state_modified_compare_more_unrendered_values` [behavior flag](/reference/global-configs/behavior-changes#behavior-change-flags) to `True`. @@ -54,7 +54,7 @@ To reduce false positives during `state:modified` selection due to env-aware log State comparison works by identifying discrepancies between two manifests. Those discrepancies could be the result of: 1. Changes made to a project in development -2. Env-aware logic that causes different behavior based on the `target`, env vars, etc., which can be avoided if you upgrade to dbt Core 1.9 and set the `state_modified_compare_more_unrendered` [behavior flag](/reference/global-configs/behavior-changes#behavior-change-flags) to `True`. +2. Env-aware logic that causes different behavior based on the `target`, env vars, etc., which can be avoided if you upgrade to dbt Core 1.9 and set the `state_modified_compare_more_unrendered_values` [behavior flag](/reference/global-configs/behavior-changes#behavior-change-flags) to `True`. State comparison detects env-aware config in `dbt_project.yml`. 
This target-based config won't register as a modification: diff --git a/website/docs/reference/resource-configs/bigquery-configs.md b/website/docs/reference/resource-configs/bigquery-configs.md index a6f3036ede8..b943f114861 100644 --- a/website/docs/reference/resource-configs/bigquery-configs.md +++ b/website/docs/reference/resource-configs/bigquery-configs.md @@ -21,7 +21,7 @@ This will allow you to read and write from multiple BigQuery projects. Same for ### Partition clause -BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a table by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#pruning_limiting_partitions) when partitions are filtered using literal values (so selecting partitions using a subquery won't improve performance). +BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a table by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#use_a_constant_filter_expression) when partitions are filtered using literal values (so selecting partitions using a subquery won't improve performance). The `partition_by` config can be supplied as a dictionary with the following format: @@ -265,7 +265,7 @@ If your model has `partition_by` configured, you may optionally specify two addi -### Clustering Clause +### Clustering clause BigQuery tables can be [clustered](https://cloud.google.com/bigquery/docs/clustered-tables) to colocate related data. @@ -286,7 +286,7 @@ select * from ... 
-Clustering on a multiple columns: +Clustering on multiple columns: @@ -303,11 +303,11 @@ select * from ... -## Managing KMS Encryption +## Managing KMS encryption [Customer managed encryption keys](https://cloud.google.com/bigquery/docs/customer-managed-encryption) can be configured for BigQuery tables using the `kms_key_name` model configuration. -### Using KMS Encryption +### Using KMS encryption To specify the KMS key name for a model (or a group of models), use the `kms_key_name` model configuration. The following example sets the `kms_key_name` for all of the models in the `encrypted/` directory of your dbt project. @@ -328,7 +328,7 @@ models: -## Labels and Tags +## Labels and tags ### Specifying labels @@ -373,8 +373,6 @@ models: - - ### Specifying tags @@ -434,7 +432,7 @@ The `incremental_strategy` config can be set to one of two values: ### Performance and cost The operations performed by dbt while building a BigQuery incremental model can -be made cheaper and faster by using [clustering keys](#clustering-keys) in your +be made cheaper and faster by using a [clustering clause](#clustering-clause) in your model configuration. See [this guide](https://discourse.getdbt.com/t/benchmarking-incremental-strategies-on-bigquery/981) for more information on performance tuning for BigQuery incremental models. **Note:** These performance and cost benefits are applicable to incremental models @@ -673,7 +671,7 @@ select ... 
-## Authorized Views +## Authorized views If the `grant_access_to` config is specified for a model materialized as a view, dbt will grant the view model access to select from the list of datasets diff --git a/website/docs/reference/resource-configs/firebolt-configs.md b/website/docs/reference/resource-configs/firebolt-configs.md index 394823e33de..0ab14354003 100644 --- a/website/docs/reference/resource-configs/firebolt-configs.md +++ b/website/docs/reference/resource-configs/firebolt-configs.md @@ -38,8 +38,8 @@ models: +table_type: fact +primary_index: [ , ... ] +indexes: - - type: aggregating - key_column: [ , ... ] + - index_type: aggregating + key_columns: [ , ... ] aggregation: [ , ... ] ... ``` @@ -58,8 +58,8 @@ models: table_type: fact primary_index: [ , ... ] indexes: - - type: aggregating - key_column: [ , ... ] + - index_type: aggregating + key_columns: [ , ... ] aggregation: [ , ... ] ... ``` @@ -77,9 +77,9 @@ models: primary_index = [ "", ... ], indexes = [ { - type = "aggregating" - key_column = [ "", ... ], - aggregation = [ "", ... ], + "index_type": "aggregating", + "key_columns": [ "", ... ], + "aggregation": [ "", ... ], }, ... ] @@ -99,8 +99,8 @@ models: | `table_type` | Whether the materialized table will be a [fact or dimension](https://docs.firebolt.io/godocs/Overview/working-with-tables/working-with-tables.html#fact-and-dimension-tables) table. | | `primary_index` | Sets the primary index for the fact table using the inputted list of column names from the model. Required for fact tables. | | `indexes` | A list of aggregating indexes to create on the fact table. | -| `type` | Specifies that the index is an [aggregating index](https://docs.firebolt.io/godocs/Guides/working-with-indexes/using-aggregating-indexes.html). Should be set to `aggregating`. | -| `key_column` | Sets the grouping of the aggregating index using the inputted list of column names from the model. 
| +| `index_type` | Specifies that the index is an [aggregating index](https://docs.firebolt.io/godocs/Guides/working-with-indexes/using-aggregating-indexes.html). Should be set to `aggregating`. | +| `key_columns` | Sets the grouping of the aggregating index using the inputted list of column names from the model. | | `aggregation` | Sets the aggregations on the aggregating index using the inputted list of SQL agg expressions. | @@ -113,9 +113,9 @@ models: primary_index = "id", indexes = [ { - type: "aggregating", - key_column: "order_id", - aggregation: ["COUNT(DISTINCT status)", "AVG(customer_id)"] + "index_type": "aggregating", + "key_columns": "order_id", + "aggregation": ["COUNT(DISTINCT status)", "AVG(customer_id)"] } ] ) }} diff --git a/website/docs/reference/resource-properties/constraints.md b/website/docs/reference/resource-properties/constraints.md index 948fe223d68..63582974040 100644 --- a/website/docs/reference/resource-properties/constraints.md +++ b/website/docs/reference/resource-properties/constraints.md @@ -15,7 +15,7 @@ Constraints require the declaration and enforcement of a model [contract](/refer Constraints may be defined for a single column, or at the model level for one or more columns. As a general rule, we recommend defining single-column constraints directly on those columns. -If you are defining multiple `primary_key` constraints for a single model, those _must_ be defined at the model level. Defining multiple `primary_key` constraints at the column level is not supported. +If you define multiple `primary_key` constraints for a single model, those _must_ be defined at the model level. Defining multiple `primary_key` constraints at the column level is not supported. The structure of a constraint is: - `type` (required): one of `not_null`, `unique`, `primary_key`, `foreign_key`, `check`, `custom` @@ -47,7 +47,7 @@ models: columns: [first_column, second_column, ...] 
- type: foreign_key # multi_column columns: [first_column, second_column, ...] - to: "{{ ref('other_model_name') }}" + to: ref('other_model_name') to_columns: [other_model_first_column, other_model_second_columns, ...] - type: check columns: [first_column, second_column, ...] @@ -64,7 +64,7 @@ models: - type: not_null - type: unique - type: foreign_key - to: "{{ ref('other_model_name') }}" + to: ref('other_model_name') to_columns: other_model_column - type: ... ``` @@ -572,3 +572,73 @@ alter table schema_name.my_model add constraint 472394792387497234 check (id > 0 + +## Custom constraints + +In dbt Cloud and dbt Core, you can use custom constraints on models for the advanced configuration of tables. Different data warehouses support different syntax and capabilities. + +Custom constraints allow you to add configuration to specific columns. For example: + + - Set [masking policies](https://docs.snowflake.com/en/user-guide/security-column-intro#what-are-masking-policies) in Snowflake when using a Create Table As Select (CTAS). + + - Other data warehouses (such as [Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-create-table-using.html) and [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#column_name_and_column_schema)) have their own set of parameters that can be set for columns in their CTAS statements. + + +You can implement constraints in a couple of different ways: + + + +Here's an example of how to implement tag-based masking policies with contracts and constraints using the following syntax: + + + +```yaml + +models: + - name: my_model + config: + contract: + enforced: true + materialized: table + columns: + - name: id + data_type: int + constraints: + - type: custom + expression: "tag (my_tag = 'my_value')" # A custom SQL expression used to enforce a specific constraint on a column. 
+ +``` + + + +Using this syntax requires configuring all the columns and their types as it’s the only way to send a `create or replace <materialization> mytable as ...` statement. It’s not possible to do it with just a partial list of columns. This means making sure the columns and constraints fields are fully defined. + +To generate a YAML with all the columns, you can use `generate_model_yaml` from [dbt-codegen](https://github.com/dbt-labs/dbt-codegen/tree/0.12.1/?tab=readme-ov-file#generate_model_yaml-source). + + + + +Alternatively, you can add a masking policy without tags: + + + +```yaml + +models: + - name: my_model + config: + contract: + enforced: true + materialized: table + columns: + - name: id + data_type: int + constraints: + - type: custom + expression: "masking policy my_policy" + +``` + + + + diff --git a/website/docs/reference/resource-properties/deprecation_date.md b/website/docs/reference/resource-properties/deprecation_date.md index be76ccb07f6..70f150dc465 100644 --- a/website/docs/reference/resource-properties/deprecation_date.md +++ b/website/docs/reference/resource-properties/deprecation_date.md @@ -53,11 +53,11 @@ Additionally, [`WARN_ERROR_OPTIONS`](/reference/global-configs/warnings) gives a |--------------------------------|----------------------------------------------------|------------------------| | `DeprecatedModel` | Parsing a project that defines a deprecated model | Producer | | `DeprecatedReference` | Referencing a model with a past deprecation date | Producer and consumers | -| `UpcomingDeprecationReference` | Referencing a model with a future deprecation date | Producer and consumers | +| `UpcomingReferenceDeprecation` | Referencing a model with a future deprecation date | Producer and consumers | **Example** -Example output for an `UpcomingDeprecationReference` warning: +Example output for an `UpcomingReferenceDeprecation` warning: ``` $ dbt parse 15:48:14 Running with dbt=1.6.0 diff --git a/website/docusaurus.config.js 
b/website/docusaurus.config.js index 82eb6df54f4..f43420eb11f 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -72,19 +72,20 @@ var siteSettings = { }, announcementBar: { id: "biweekly-demos", - content: "Register now for Coalesce 2024 ✨ The Analytics Engineering Conference!", - backgroundColor: "#7444FD", + content: "Join our biweekly demos and see dbt Cloud in action!", + backgroundColor: "#047377", textColor: "#fff", isCloseable: true, }, announcementBarActive: true, - announcementBarLink: "https://coalesce.getdbt.com/register/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2025_coalesce-2024_aw&utm_content=coalesce____&utm_term=all_all__", + announcementBarLink: + "https://www.getdbt.com/resources/webinars/dbt-cloud-demos-with-experts/?utm_medium=i[…]ly-demos_aw&utm_content=biweekly-demos____&utm_term=all_all__", // Set community spotlight member on homepage // This is the ID for a specific file under docs/community/spotlight communitySpotlightMember: "meagan-palmer", prism: { theme: (() => { - var theme = themes.nightOwl; + var theme = themes.nightOwl; // Add additional rule to nightowl theme in order to change // the color of YAML keys (to be different than values). // There weren't many Prism themes that differentiated @@ -200,6 +201,12 @@ var siteSettings = { links: [ { html: ` + + diff --git a/website/snippets/_auto-exposures-view.md b/website/snippets/_auto-exposures-view.md index 95f81782cab..d30b47ae21d 100644 --- a/website/snippets/_auto-exposures-view.md +++ b/website/snippets/_auto-exposures-view.md @@ -1,4 +1,4 @@ -## View auto-exposures in dbt Explorer +## View auto-exposures in dbt Explorer After setting up auto-exposures in dbt Cloud, you can view them in dbt Explorer for a richer experience: 1. Navigate to dbt Explorer by clicking on the **Explore** link in the navigation. 
diff --git a/website/snippets/_sl-course.md b/website/snippets/_sl-course.md index 6be9ec7e959..1400be91f37 100644 --- a/website/snippets/_sl-course.md +++ b/website/snippets/_sl-course.md @@ -3,7 +3,7 @@ Explore our [dbt Semantic Layer on-demand course](https://learn.getdbt.com/courses/semantic-layer) to learn how to define and query metrics in your dbt project. -Additionally, dive into mini-courses for querying the dbt Semantic Layer in your favorite tools: [Tableau](https://courses.getdbt.com/courses/tableau-querying-the-semantic-layer), [Hex](https://courses.getdbt.com/courses/hex-querying-the-semantic-layer), and [Mode](https://courses.getdbt.com/courses/mode-querying-the-semantic-layer). +Additionally, dive into mini-courses for querying the dbt Semantic Layer in your favorite tools: [Tableau](https://courses.getdbt.com/courses/tableau-querying-the-semantic-layer), [Excel](https://learn.getdbt.com/courses/querying-the-semantic-layer-with-excel), [Hex](https://courses.getdbt.com/courses/hex-querying-the-semantic-layer), and [Mode](https://courses.getdbt.com/courses/mode-querying-the-semantic-layer). diff --git a/website/snippets/_sl-partner-links.md b/website/snippets/_sl-partner-links.md index 28e4dc24b39..aaefcc77747 100644 --- a/website/snippets/_sl-partner-links.md +++ b/website/snippets/_sl-partner-links.md @@ -54,9 +54,9 @@ The following tools integrate with the dbt Semantic Layer: - @@ -68,9 +68,9 @@ The following tools integrate with the dbt Semantic Layer: - @@ -82,9 +82,9 @@ The following tools integrate with the dbt Semantic Layer: - diff --git a/website/snippets/_sl-run-prod-job.md b/website/snippets/_sl-run-prod-job.md index f820b7f3f79..318b8d27cbf 100644 --- a/website/snippets/_sl-run-prod-job.md +++ b/website/snippets/_sl-run-prod-job.md @@ -6,7 +6,7 @@ This section explains how you can perform a job run in your deployment environme 3. 
To create a new environment, navigate to **Deploy** in the navigation menu, select **Environments**, and then select **Create new environment**. 4. Fill in your deployment credentials with your Snowflake username and password. You can name the schema anything you want. Click **Save** to create your new production environment. 5. [Create a new deploy job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) that runs in the environment you just created. Go back to the **Deploy** menu, select **Jobs**, select **Create job**, and click **Deploy job**. -6. Set the job to run a `dbt parse` job to parse your projects and generate a [`semantic_manifest.json` artifact](/docs/dbt-cloud-apis/sl-manifest) file. Although running `dbt build` isn't required, you can choose to do so if needed. +6. Set the job to run a `dbt parse` job to parse your projects and generate a [`semantic_manifest.json` artifact](/reference/artifacts/sl-manifest) file. Although running `dbt build` isn't required, you can choose to do so if needed. 7. Run the job by clicking the **Run now** button. Monitor the job's progress in real-time through the **Run summary** tab. Once the job completes successfully, your dbt project, including the generated documentation, will be fully deployed and available for use in your production environment. If any issues arise, review the logs to diagnose and address any errors. diff --git a/website/snippets/_snapshot-yaml-spec.md b/website/snippets/_snapshot-yaml-spec.md index 8bbdc6be72e..cb1675ce5bd 100644 --- a/website/snippets/_snapshot-yaml-spec.md +++ b/website/snippets/_snapshot-yaml-spec.md @@ -1,4 +1,6 @@ :::info Use the latest snapshot syntax -In Versionless and dbt v1.9 and later, snapshots are defined in an updated syntax using a YAML file within your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). 
For faster and more efficient management, consider the updated snapshot YAML syntax, [available in Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core). +In [dbt Cloud Versionless](/docs/dbt-versions/versionless-cloud) or [dbt Core v1.9 and later](/docs/dbt-versions/core), you can configure snapshots in YAML files using the updated syntax within your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)). + +This syntax allows for faster, more efficient snapshot management. To use it, upgrade to Versionless or dbt v1.9 or newer. ::: diff --git a/website/static/img/blog/2024-10-04-iceberg-blog/2024-10-03-iceberg-support.png b/website/static/img/blog/2024-10-04-iceberg-blog/2024-10-03-iceberg-support.png new file mode 100644 index 00000000000..2b99378fa84 Binary files /dev/null and b/website/static/img/blog/2024-10-04-iceberg-blog/2024-10-03-iceberg-support.png differ diff --git a/website/static/img/blog/2024-10-04-iceberg-blog/iceberg_materialization.png b/website/static/img/blog/2024-10-04-iceberg-blog/iceberg_materialization.png new file mode 100644 index 00000000000..c20e7855858 Binary files /dev/null and b/website/static/img/blog/2024-10-04-iceberg-blog/iceberg_materialization.png differ diff --git a/website/static/img/blog/authors/luis-leon.png b/website/static/img/blog/authors/luis-leon.png new file mode 100644 index 00000000000..ce3c09784ba Binary files /dev/null and b/website/static/img/blog/authors/luis-leon.png differ diff --git a/website/static/img/blog/authors/randy-pettus.png b/website/static/img/blog/authors/randy-pettus.png new file mode 100644 index 00000000000..e3468d9aca7 Binary files /dev/null and b/website/static/img/blog/authors/randy-pettus.png differ diff --git a/website/static/img/blog/example-features-produced.png b/website/static/img/blog/example-features-produced.png new file mode 100644 index 00000000000..4aaa34cf3e9 Binary files 
/dev/null and b/website/static/img/blog/example-features-produced.png differ diff --git a/website/static/img/blog/example-snowflake-ui.png b/website/static/img/blog/example-snowflake-ui.png new file mode 100644 index 00000000000..86c3394bcd0 Binary files /dev/null and b/website/static/img/blog/example-snowflake-ui.png differ diff --git a/website/static/img/blog/example-training-data-set.png b/website/static/img/blog/example-training-data-set.png new file mode 100644 index 00000000000..085b2785f06 Binary files /dev/null and b/website/static/img/blog/example-training-data-set.png differ diff --git a/website/static/img/cloud-cli-guide/finder-vscode-check.png b/website/static/img/cloud-cli-guide/finder-vscode-check.png new file mode 100644 index 00000000000..ab303c00c3a Binary files /dev/null and b/website/static/img/cloud-cli-guide/finder-vscode-check.png differ diff --git a/website/static/img/cloud-cli-guide/setup-poweruser-01.png b/website/static/img/cloud-cli-guide/setup-poweruser-01.png new file mode 100644 index 00000000000..e750bc34ed7 Binary files /dev/null and b/website/static/img/cloud-cli-guide/setup-poweruser-01.png differ diff --git a/website/static/img/cloud-cli-guide/setup-poweruser-02.png b/website/static/img/cloud-cli-guide/setup-poweruser-02.png new file mode 100644 index 00000000000..3ddb52c8407 Binary files /dev/null and b/website/static/img/cloud-cli-guide/setup-poweruser-02.png differ diff --git a/website/static/img/cloud-cli-guide/setup-poweruser-03.png b/website/static/img/cloud-cli-guide/setup-poweruser-03.png new file mode 100644 index 00000000000..c7baa1b9984 Binary files /dev/null and b/website/static/img/cloud-cli-guide/setup-poweruser-03.png differ diff --git a/website/static/img/cloud-cli-guide/terminal-git-check.png b/website/static/img/cloud-cli-guide/terminal-git-check.png new file mode 100644 index 00000000000..59ab886b47e Binary files /dev/null and b/website/static/img/cloud-cli-guide/terminal-git-check.png differ diff --git 
a/website/static/img/cloud-cli-guide/using-poweruser-01.png b/website/static/img/cloud-cli-guide/using-poweruser-01.png new file mode 100644 index 00000000000..f24a7ac89d2 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-01.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-02.png b/website/static/img/cloud-cli-guide/using-poweruser-02.png new file mode 100644 index 00000000000..4724540de13 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-02.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-03.png b/website/static/img/cloud-cli-guide/using-poweruser-03.png new file mode 100644 index 00000000000..ab28a8d72b0 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-03.png differ diff --git a/website/static/img/cloud-cli-guide/using-poweruser-04.png b/website/static/img/cloud-cli-guide/using-poweruser-04.png new file mode 100644 index 00000000000..7d72f4a97e7 Binary files /dev/null and b/website/static/img/cloud-cli-guide/using-poweruser-04.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/dbt-copilot-doc.gif b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-copilot-doc.gif new file mode 100644 index 00000000000..cca8db37a0a Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/dbt-copilot-doc.gif differ diff --git a/website/static/img/docs/dbt-cloud/defer-toggle.jpg b/website/static/img/docs/dbt-cloud/defer-toggle.jpg index fdeb27c4b71..3c3abca0fc2 100644 Binary files a/website/static/img/docs/dbt-cloud/defer-toggle.jpg and b/website/static/img/docs/dbt-cloud/defer-toggle.jpg differ diff --git a/website/static/img/docs/dbt-cloud/teradata-connection.png b/website/static/img/docs/dbt-cloud/teradata-connection.png new file mode 100644 index 00000000000..fd2837c16ec Binary files /dev/null and b/website/static/img/docs/dbt-cloud/teradata-connection.png differ diff --git 
a/website/static/img/docs/dbt-cloud/teradata-deployment.png b/website/static/img/docs/dbt-cloud/teradata-deployment.png new file mode 100644 index 00000000000..e5f2b6986e0 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/teradata-deployment.png differ diff --git a/website/static/img/teradata/dbt_cloud_teradata_account_settings.png b/website/static/img/teradata/dbt_cloud_teradata_account_settings.png new file mode 100644 index 00000000000..c7de2425023 Binary files /dev/null and b/website/static/img/teradata/dbt_cloud_teradata_account_settings.png differ diff --git a/website/static/img/teradata/dbt_cloud_teradata_development_credentials.png b/website/static/img/teradata/dbt_cloud_teradata_development_credentials.png new file mode 100644 index 00000000000..762fac961ac Binary files /dev/null and b/website/static/img/teradata/dbt_cloud_teradata_development_credentials.png differ diff --git a/website/static/img/teradata/dbt_cloud_teradata_setup_connection_start.png b/website/static/img/teradata/dbt_cloud_teradata_setup_connection_start.png new file mode 100644 index 00000000000..bbf4c6db380 Binary files /dev/null and b/website/static/img/teradata/dbt_cloud_teradata_setup_connection_start.png differ diff --git a/website/vercel.json b/website/vercel.json index e882b50d2fc..0674313f3f5 100644 --- a/website/vercel.json +++ b/website/vercel.json @@ -2,6 +2,31 @@ "cleanUrls": true, "trailingSlash": false, "redirects": [ + { + "source": "/docs/dbt-cloud-apis/sl-manifest", + "destination": "/reference/artifacts/sl-manifest", + "permanent": true + }, + { + "source": "/docs/cloud/dbt-assist-data", + "destination": "/docs/cloud/dbt-copilot-data", + "permanent": true + }, + { + "source": "/docs/cloud/use-dbt-assist", + "destination": "/docs/cloud/use-dbt-copilot", + "permanent": true + }, + { + "source": "/docs/cloud/enable-dbt-assist", + "destination": "/docs/cloud/enable-dbt-copilot", + "permanent": true + }, + { + "source": "/docs/cloud/dbt-assist", + "destination": 
"/docs/cloud/dbt-copilot", + "permanent": true + }, { "source": "/faqs/Troubleshooting/access_token_error", "destination": "/faqs/Troubleshooting/auth-expired-error",