dbt-labs · MichelleArk · May 23, 2024 · May 10, 2024 · May 22, 2024 · May 22, 2024
@@ -0,0 +1,7 @@
+kind: Features
+body: Maximally parallelize dbt clone
+  in clone command
+time: 2024-05-22T00:03:09.765977-04:00
+custom:
+  Author: michelleark
+  Issue: "7914"
@@ -25,8 +25,14 @@ class GraphQueue:
     the same time, as there is an unlocked race!
     """
 
-    def __init__(self, graph: nx.DiGraph, manifest: Manifest, selected: Set[UniqueId]) -> None:
-        self.graph = graph
+    def __init__(
+        self,
+        graph: nx.DiGraph,
+        manifest: Manifest,
+        selected: Set[UniqueId],
+        preserve_edges: bool = True,
+    ) -> None:
+        self.graph = graph if preserve_edges else nx.classes.function.create_empty_copy(graph)
         self.manifest = manifest
         self._selected = selected
         # store the queue as a priority queue.

@@ -319,7 +319,7 @@ def get_selected(self, spec: SelectionSpec) -> Set[UniqueId]:
 
         return filtered_nodes
 
-    def get_graph_queue(self, spec: SelectionSpec) -> GraphQueue:
+    def get_graph_queue(self, spec: SelectionSpec, preserve_edges: bool = True) -> GraphQueue:
         """Returns a queue over nodes in the graph that tracks progress of
         dependecies.
         """
@@ -330,7 +330,7 @@ def get_graph_queue(self, spec: SelectionSpec) -> GraphQueue:
         # Construct a new graph using the selected_nodes
         new_graph = self.full_graph.get_subset_graph(selected_nodes)
         # should we give a way here for consumers to mutate the graph?
-        return GraphQueue(new_graph.graph, self.manifest, selected_nodes)
+        return GraphQueue(new_graph.graph, self.manifest, selected_nodes, preserve_edges)
 
 
 class ResourceTypeSelector(NodeSelector):

@@ -10,7 +10,7 @@
 from dbt.node_types import REFABLE_NODE_TYPES
 from dbt.task.base import BaseRunner, resource_types_from_args
 from dbt.task.run import _validate_materialization_relations_dict
-from dbt.task.runnable import GraphRunnableTask
+from dbt.task.runnable import GraphRunnableMode, GraphRunnableTask
 from dbt_common.dataclass_schema import dbtClassMixin
 from dbt_common.exceptions import CompilationError, DbtInternalError
 
@@ -94,6 +94,9 @@ class CloneTask(GraphRunnableTask):
     def raise_on_first_error(self):
         return False
 
+    def get_run_mode(self) -> GraphRunnableMode:
+        return GraphRunnableMode.Independent
+
     def _get_deferred_manifest(self) -> Optional[Manifest]:
         # Unlike other commands, 'clone' always requires a state manifest
         # Load previous state, regardless of whether --defer flag has been set

@@ -57,6 +57,7 @@
 from dbt.parser.manifest import write_manifest
 from dbt.task.base import BaseRunner, ConfiguredTask
 from dbt_common.context import _INVOCATION_CONTEXT_VAR, get_invocation_context
+from dbt_common.dataclass_schema import StrEnum
 from dbt_common.events.contextvars import log_contextvars, task_contextvars
 from dbt_common.events.functions import fire_event, warn_or_error
 from dbt_common.events.types import Formatting
@@ -68,6 +69,11 @@
 RUNNING_STATE = DbtProcessState("running")
 
 
+class GraphRunnableMode(StrEnum):
+    Topological = "topological"
+    Independent = "independent"
+
+
 class GraphRunnableTask(ConfiguredTask):
     MARK_DEPENDENT_ERRORS_STATUSES = [NodeStatus.Error]
 
@@ -145,7 +151,15 @@
         selector = self.get_node_selector()
         # Following uses self.selection_arg and self.exclusion_arg
         spec = self.get_selection_spec()
-        return selector.get_graph_queue(spec)
+
+        preserve_edges = True
+        if self.get_run_mode() == GraphRunnableMode.Independent:
+            preserve_edges = False
+
+        return selector.get_graph_queue(spec, preserve_edges)
+
+    def get_run_mode(self) -> GraphRunnableMode:
+        return GraphRunnableMode.Topological
 
     def _runtime_initialize(self):
         self.compile_manifest()

@@ -50,6 +50,7 @@
     MockNode,
     MockSource,
     inject_plugin,
+    make_manifest,
 )
 
 REQUIRED_PARSED_NODE_KEYS = frozenset(
@@ -1090,20 +1091,6 @@ def setUp(self):
         )
 
 
-def make_manifest(nodes=[], sources=[], macros=[], docs=[]):
-    return Manifest(
-        nodes={n.unique_id: n for n in nodes},
-        macros={m.unique_id: m for m in macros},
-        sources={s.unique_id: s for s in sources},
-        docs={d.unique_id: d for d in docs},
-        disabled={},
-        files={},
-        exposures={},
-        metrics={},
-        selectors={},
-    )
-
-
 FindMacroSpec = namedtuple("FindMacroSpec", "macros,expected")
 
 macro_parameter_sets = [

@@ -0,0 +1,47 @@
+import networkx as nx
+import pytest
+
+from dbt.contracts.graph.manifest import Manifest
+from dbt.graph.queue import GraphQueue
+from tests.unit.utils import MockNode, make_manifest
+
+
+class TestGraphQueue:
+    @pytest.fixture(scope="class")
+    def manifest(self) -> Manifest:
+        return make_manifest(
+            nodes=[
+                MockNode(package="test_package", name="upstream_model"),
+                MockNode(package="test_package", name="downstream_model"),
+            ]
+        )
+
+    @pytest.fixture(scope="class")
+    def graph(self) -> nx.DiGraph:
+        graph = nx.DiGraph()
+        graph.add_edge("model.test_package.upstream_model", "model.test_package.downstream_model")
+        return graph
+
+    def test_init_graph_queue(self, manifest, graph):
+        graph_queue = GraphQueue(graph=graph, manifest=manifest, selected={})
+
+        assert graph_queue.manifest == manifest
+        assert graph_queue.graph == graph
+        assert graph_queue.inner.queue == [(0, "model.test_package.upstream_model")]
+        assert graph_queue.in_progress == set()
+        assert graph_queue.queued == {"model.test_package.upstream_model"}
+        assert graph_queue.lock
+
+    def test_init_graph_queue_preserve_edges_false(self, manifest, graph):
+        graph_queue = GraphQueue(graph=graph, manifest=manifest, selected={}, preserve_edges=False)
+
+        # when preserve_edges is set to false, dependencies between nodes are no longer tracked in the priority queue
+        assert list(graph_queue.graph.edges) == []
+        assert graph_queue.inner.queue == [
+            (0, "model.test_package.downstream_model"),
+            (0, "model.test_package.upstream_model"),
+        ]
+        assert graph_queue.queued == {
+            "model.test_package.upstream_model",
+            "model.test_package.downstream_model",
+        }
@@ -1,10 +1,14 @@
 from dataclasses import dataclass
-from typing import AbstractSet, Any, Dict, Optional
+from typing import AbstractSet, Any, Dict, List, Optional, Tuple
 
+import networkx as nx
 import pytest
 
-from dbt.task.runnable import GraphRunnableTask
+from dbt.artifacts.resources.types import NodeType
+from dbt.graph import Graph, ResourceTypeSelector
+from dbt.task.runnable import GraphRunnableMode, GraphRunnableTask
 from dbt.tests.util import safe_set_invocation_context
+from tests.unit.utils import MockNode, make_manifest
 
 
 @dataclass
@@ -14,6 +18,9 @@ class MockArgs:
     state: Optional[Dict[str, Any]] = None
     defer_state: Optional[Dict[str, Any]] = None
     write_json: bool = False
+    selector: Optional[str] = None
+    select: Tuple[str] = ()
+    exclude: Tuple[str] = ()
 
 
 @dataclass
@@ -23,12 +30,28 @@ class MockConfig:
     threads: int = 1
     target_name: str = "mock_config_target_name"
 
+    def get_default_selector_name(self):
+        return None
+
 
 class MockRunnableTask(GraphRunnableTask):
-    def __init__(self, exception_class: Exception = Exception):
+    def __init__(
+        self,
+        exception_class: Exception = Exception,
+        nodes: Optional[List[MockNode]] = None,
+        edges: Optional[List[Tuple[str, str]]] = None,
+    ):
+        nodes = nodes or []
+        edges = edges or []
+
         self.forced_exception_class = exception_class
         self.did_cancel: bool = False
         super().__init__(args=MockArgs(), config=MockConfig(), manifest=None)
+        self.manifest = make_manifest(nodes=nodes)
+        digraph = nx.DiGraph()
+        for edge in edges:
+            digraph.add_edge(edge[0], edge[1])
+        self.graph = Graph(digraph)
 
     def run_queue(self, pool):
         """Override `run_queue` to raise a system exit"""
@@ -40,13 +63,25 @@ def _cancel_connections(self, pool):
 
     def get_node_selector(self):
         """This is an `abstract_method` on `GraphRunnableTask`, thus we must implement it"""
-        return None
+        selector = ResourceTypeSelector(
+            graph=self.graph,
+            manifest=self.manifest,
+            previous_state=self.previous_state,
+            resource_types=[NodeType.Model],
+            include_empty_nodes=True,
+        )
+        return selector
 
     def defer_to_manifest(self, adapter, selected_uids: AbstractSet[str]):
         """This is an `abstract_method` on `GraphRunnableTask`, thus we must implement it"""
         return None
 
 
+class MockRunnableTaskIndependent(MockRunnableTask):
+    def get_run_mode(self) -> GraphRunnableMode:
+        return GraphRunnableMode.Independent
+
+
 def test_graph_runnable_task_cancels_connection_on_system_exit():
 
     safe_set_invocation_context()
@@ -81,3 +116,36 @@ def test_graph_runnable_task_doesnt_cancel_connection_on_generic_exception():
 
     # If `did_cancel` is True, that means `_cancel_connections` was called
     assert task.did_cancel is False
+
+
+def test_graph_runnable_preserves_edges_by_default():
+    task = MockRunnableTask(
+        nodes=[
+            MockNode("test", "upstream_node", fqn="model.test.upstream_node"),
+            MockNode("test", "downstream_node", fqn="model.test.downstream_node"),
+        ],
+        edges=[("model.test.upstream_node", "model.test.downstream_node")],
+    )
+    assert task.get_run_mode() == GraphRunnableMode.Topological
+    graph_queue = task.get_graph_queue()
+
+    assert graph_queue.queued == {"model.test.upstream_node"}
+    assert graph_queue.inner.queue == [(0, "model.test.upstream_node")]
+
+
+def test_graph_runnable_preserves_edges_false():
+    task = MockRunnableTaskIndependent(
+        nodes=[
+            MockNode("test", "upstream_node", fqn="model.test.upstream_node"),
+            MockNode("test", "downstream_node", fqn="model.test.downstream_node"),
+        ],
+        edges=[("model.test.upstream_node", "model.test.downstream_node")],
+    )
+    assert task.get_run_mode() == GraphRunnableMode.Independent
+    graph_queue = task.get_graph_queue()
+
+    assert graph_queue.queued == {"model.test.downstream_node", "model.test.upstream_node"}
+    assert graph_queue.inner.queue == [
+        (0, "model.test.downstream_node"),
+        (0, "model.test.upstream_node"),
+    ]
@@ -11,6 +11,7 @@
 import pytest
 
 from dbt.config.project import PartialProject
+from dbt.contracts.graph.manifest import Manifest
 from dbt_common.dataclass_schema import ValidationError
 
 
@@ -387,3 +388,17 @@ def replace_config(n, **kwargs):
         config=n.config.replace(**kwargs),
         unrendered_config=dict_replace(n.unrendered_config, **kwargs),
     )
+
+
+def make_manifest(nodes=[], sources=[], macros=[], docs=[]) -> Manifest:
+    return Manifest(
+        nodes={n.unique_id: n for n in nodes},
+        macros={m.unique_id: m for m in macros},
+        sources={s.unique_id: s for s in sources},
+        docs={d.unique_id: d for d in docs},
+        disabled={},
+        files={},
+        exposures={},
+        metrics={},
+        selectors={},
+    )