From e62ca4cfe181911130b6ebb276b455ce8e4bb387 Mon Sep 17 00:00:00 2001
From: tlento
Date: Mon, 30 Oct 2023 20:24:43 -0700
Subject: [PATCH 1/2] Remove capsys.disabled magic from CLI tests

The addition of capsys.disabled calls to the CLI tests was done to work
around an issue with files being opened and closed in the CLI setup
fixtures. When running under high parallelism, the CLI tests would
occasionally be split up across worker instances, and so the file system
operations - which were effectively creating and removing the same
file - would cause failures due to race conditions between the different
test runner fixture executions.

Adding capsys.disabled didn't solve this problem; it merely made it
somewhat less likely to occur, and ultimately replaced the error with a
different one (file not found).

This commit removes the capsys.disabled calls in favor of a follow-up to
stop doing all of this filesystem manipulation.
---
 metricflow/test/cli/test_cli.py | 113 +++++++++++++-------------------
 1 file changed, 45 insertions(+), 68 deletions(-)

diff --git a/metricflow/test/cli/test_cli.py b/metricflow/test/cli/test_cli.py
index e99c2e5e32..732416fae9 100644
--- a/metricflow/test/cli/test_cli.py
+++ b/metricflow/test/cli/test_cli.py
@@ -37,38 +37,30 @@
 # TODO: Use snapshots to compare CLI output for all tests here.
 
 
-def test_query(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(query, args=["--metrics", "bookings", "--group-by", "metric_time"])
+def test_query(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+    resp = cli_runner.run(query, args=["--metrics", "bookings", "--group-by", "metric_time"])
 
     # case insensitive matches are needed for snowflake due to the capitalization thing
     engine_is_snowflake = cli_runner.cli_context.sql_client.sql_engine_type is SqlEngine.SNOWFLAKE
     assert "bookings" in resp.output or ("bookings" in resp.output.lower() and engine_is_snowflake)
     assert resp.exit_code == 0
 
 
-def test_list_dimensions(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(dimensions, args=["--metrics", "bookings"])
+def test_list_dimensions(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+    resp = cli_runner.run(dimensions, args=["--metrics", "bookings"])
 
     assert "ds" in resp.output
     assert resp.exit_code == 0
 
 
-def test_list_metrics(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(metrics)
+def test_list_metrics(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+    resp = cli_runner.run(metrics)
 
     assert "bookings_per_listing: listing__capacity_latest" in resp.output
     assert resp.exit_code == 0
 
 
-def test_get_dimension_values(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(dimension_values, args=["--metrics", "bookings", "--dimension", "booking__is_instant"])
+def test_get_dimension_values(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+    resp = cli_runner.run(dimension_values, args=["--metrics", "bookings", "--dimension", "booking__is_instant"])
 
     actual_output_lines = sorted(resp.output.split("\n"))
     assert ["", "• False", "• True"] == actual_output_lines
 
@@ -84,7 +76,7 @@ def create_directory(directory_path: str) -> Iterator[None]:
     shutil.rmtree(path)
 
 
-def test_validate_configs(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+def test_validate_configs(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
     yaml_contents = textwrap.dedent(
         """\
         semantic_model:
@@ -111,24 +103,20 @@ def test_validate_configs(capsys: pytest.CaptureFixture, cli_runner: MetricFlowC
         manifest_file = target_directory / "semantic_manifest.json"
         manifest_file.write_text(manifest.json())
 
-        # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-        with capsys.disabled():
-            resp = cli_runner.run(validate_configs)
+        resp = cli_runner.run(validate_configs)
 
     assert "ERROR" in resp.output
     assert resp.exit_code == 0
 
 
-def test_health_checks(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(health_checks)
+def test_health_checks(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+    resp = cli_runner.run(health_checks)
 
     assert "SELECT 1: Success!" in resp.output
     assert resp.exit_code == 0
 
 
-def test_tutorial_message(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:
+def test_tutorial_message(cli_runner: MetricFlowCliRunner) -> None:
     """Tests the message output of the tutorial.
 
     The tutorial now essentially compiles a semantic manifest and then asks the user to run dbt seed,
@@ -139,17 +127,14 @@ def test_tutorial_message(capsys: pytest.CaptureFixture, cli_runner: MetricFlowC
     project path overrides it might warrant a more complete test of the semantic manifest building steps
     in the tutorial flow.
     """
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(tutorial, args=["-m"])
+    resp = cli_runner.run(tutorial, args=["-m"])
 
     assert "Please run the following steps" in resp.output
     assert "dbt seed" in resp.output
 
 
-def test_list_entities(capsys: pytest.CaptureFixture, cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+def test_list_entities(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
     # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(entities, args=["--metrics", "bookings"])
+    resp = cli_runner.run(entities, args=["--metrics", "bookings"])
 
     assert "guest" in resp.output
     assert "host" in resp.output
 
@@ -163,11 +148,9 @@ def test_saved_query(  # noqa: D
     cli_runner: MetricFlowCliRunner,
     sql_client: SqlClient,
 ) -> None:
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(
-            query, args=["--saved-query", "p0_booking", "--order", "metric_time__day,listing__capacity_latest"]
-        )
+    resp = cli_runner.run(
+        query, args=["--saved-query", "p0_booking", "--order", "metric_time__day,listing__capacity_latest"]
+    )
 
     assert resp.exit_code == 0
 
 
@@ -188,19 +171,17 @@ def test_saved_query_with_where(  # noqa: D
     cli_runner: MetricFlowCliRunner,
     sql_client: SqlClient,
 ) -> None:
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(
-            query,
-            args=[
-                "--saved-query",
-                "p0_booking",
-                "--order",
-                "metric_time__day,listing__capacity_latest",
-                "--where",
-                "{{ Dimension('listing__capacity_latest') }} > 4",
-            ],
-        )
+    resp = cli_runner.run(
+        query,
+        args=[
+            "--saved-query",
+            "p0_booking",
+            "--order",
+            "metric_time__day,listing__capacity_latest",
+            "--where",
+            "{{ Dimension('listing__capacity_latest') }} > 4",
+        ],
+    )
 
     assert resp.exit_code == 0
 
 
@@ -221,19 +202,17 @@ def test_saved_query_with_limit(  # noqa: D
     cli_runner: MetricFlowCliRunner,
     sql_client: SqlClient,
 ) -> None:
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(
-            query,
-            args=[
-                "--saved-query",
-                "p0_booking",
-                "--order",
-                "metric_time__day,listing__capacity_latest",
-                "--limit",
-                "3",
-            ],
-        )
+    resp = cli_runner.run(
+        query,
+        args=[
+            "--saved-query",
+            "p0_booking",
+            "--order",
+            "metric_time__day,listing__capacity_latest",
+            "--limit",
+            "3",
+        ],
+    )
 
     assert resp.exit_code == 0
 
 
@@ -251,12 +230,10 @@ def test_saved_query_explain(  # noqa: D
     mf_test_session_state: MetricFlowTestSessionState,
     cli_runner: MetricFlowCliRunner,
 ) -> None:
-    # Disabling capsys to resolve error "ValueError: I/O operation on closed file". Better solution TBD.
-    with capsys.disabled():
-        resp = cli_runner.run(
-            query,
-            args=["--explain", "--saved-query", "p0_booking", "--order", "metric_time__day,listing__capacity_latest"],
-        )
+    resp = cli_runner.run(
+        query,
+        args=["--explain", "--saved-query", "p0_booking", "--order", "metric_time__day,listing__capacity_latest"],
+    )
 
     # Currently difficult to compare explain output due to randomly generated IDs.
     assert resp.exit_code == 0

From f9b627f24698d8f517d900ec519142d6289a159d Mon Sep 17 00:00:00 2001
From: tlento
Date: Mon, 30 Oct 2023 20:57:43 -0700
Subject: [PATCH 2/2] Make CLI tests work with parallel test execution

We use pytest-xdist for parallel test execution, which means multiple
worker processes fire up tests that are loosely grouped by class/module.
Unfortunately, there are no guarantees around any of this, so the CLI
tests, which are all in one module, may execute on different workers.

We had a bad fixture layout on the CLI tests which would touch a file in
the CLI directory and then unlink it after the test was done. If every
test ran on the same worker this was fine - only one test would ever be
accessing this path location at a time. But when the tests get split
across workers, the runs race on that file and fail with errors such as
FileNotFoundError.

This change moves us off of having multiple tests create and destroy the
same file at the same path, relying instead on a fixed file that already
exists in a static location in the repository.
---
 metricflow/test/cli/test_cli.py            | 35 +++++++++++++++----
 metricflow/test/fixtures/cli_fixtures.py   | 14 ++++----
 metricflow/test/fixtures/setup_fixtures.py |  9 +++++
 .../test/fixtures/sql_client_fixtures.py   |  5 ++-
 4 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/metricflow/test/cli/test_cli.py b/metricflow/test/cli/test_cli.py
index 732416fae9..dd7fcad4ff 100644
--- a/metricflow/test/cli/test_cli.py
+++ b/metricflow/test/cli/test_cli.py
@@ -15,6 +15,7 @@
 from dbt_semantic_interfaces.parsing.objects import YamlConfigFile
 from dbt_semantic_interfaces.test_utils import base_semantic_manifest_file
 
+from metricflow.cli.cli_context import CLIContext
 from metricflow.cli.main import (
     dimension_values,
     dimensions,
@@ -76,7 +77,19 @@ def create_directory(directory_path: str) -> Iterator[None]:
     shutil.rmtree(path)
 
 
-def test_validate_configs(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
+def test_validate_configs(cli_context: CLIContext) -> None:
+    """Tests config validation from a manifest stored on the filesystem.
+
+    This test is special, because the CLI bypasses the semantic manifest read into the CLIContext and
+    validates the config files on disk. It's not entirely clear why we do this, so we should probably
+    figure that out and, if possible, stop doing it so we can have this test depend on an injectable
+    in-memory semantic manifest instead of whatever is stored in the filesystem.
+
+    At any rate, due to the direct read from disk, we have to store a serialized semantic manifest
+    in a temporary location. In order to spin up the CLI this requires us to ALSO have a dbt_project.yml
+    on the filesystem in the project path. Since we don't want to clobber whatever semantic_manifest.json is
+    in the real filesystem location we do all of this stuff here.
+    """
     yaml_contents = textwrap.dedent(
         """\
         semantic_model:
@@ -98,16 +111,26 @@ def test_validate_configs(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
         apply_transformations=False,
     ).semantic_manifest
 
-    target_directory = Path().absolute() / "target"
-    with create_directory(target_directory.as_posix()):
-        manifest_file = target_directory / "semantic_manifest.json"
-        manifest_file.write_text(manifest.json())
+    project_directory = Path().absolute()
+    # If the dbt_project.yml doesn't exist in this path location the CLI will throw an exception.
+    dummy_project = Path(project_directory, "dbt_project.yml")
+    dummy_project.touch()
 
-        resp = cli_runner.run(validate_configs)
+    try:
+        cli_runner = MetricFlowCliRunner(cli_context=cli_context, project_path=str(project_directory))
+        target_directory = Path(project_directory, "target")
+        with create_directory(target_directory.as_posix()):
+            manifest_file = Path(target_directory, "semantic_manifest.json")
+            manifest_file.write_text(manifest.json())
+
+            resp = cli_runner.run(validate_configs)
 
-    assert "ERROR" in resp.output
-    assert resp.exit_code == 0
+            assert "ERROR" in resp.output
+            assert resp.exit_code == 0
+    finally:
+        dummy_project.unlink()
+
 
 def test_health_checks(cli_runner: MetricFlowCliRunner) -> None:  # noqa: D
     resp = cli_runner.run(health_checks)
diff --git a/metricflow/test/fixtures/cli_fixtures.py b/metricflow/test/fixtures/cli_fixtures.py
index f2c81afbe6..70258ef9e9 100644
--- a/metricflow/test/fixtures/cli_fixtures.py
+++ b/metricflow/test/fixtures/cli_fixtures.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 import pathlib
 import tempfile
 from typing import Generator, Optional, Sequence
@@ -17,6 +18,7 @@
 from metricflow.model.semantic_manifest_lookup import SemanticManifestLookup
 from metricflow.plan_conversion.column_resolver import DunderColumnAssociationResolver
 from metricflow.protocols.sql_client import SqlClient
+from metricflow.test.fixtures.setup_fixtures import dbt_project_dir
 from metricflow.test.time.configurable_time_source import ConfigurableTimeSource
 
 
@@ -99,19 +101,19 @@ def cli_context(  # noqa: D
 class MetricFlowCliRunner(CliRunner):
     """Custom CliRunner class to handle passing context."""
 
-    def __init__(self, cli_context: CLIContext) -> None:  # noqa: D
+    def __init__(self, cli_context: CLIContext, project_path: str) -> None:  # noqa: D
         self.cli_context = cli_context
+        self.project_path = project_path
         super().__init__()
 
     def run(self, cli: click.BaseCommand, args: Optional[Sequence[str]] = None) -> Result:  # noqa: D
-        # TODO: configure CLI to use a dbt_project fixture
-        dummy_dbt_project = pathlib.Path("dbt_project.yml")
-        dummy_dbt_project.touch()
+        current_dir = os.getcwd()
+        os.chdir(self.project_path)
         result = super().invoke(cli, args, obj=self.cli_context)
-        dummy_dbt_project.unlink()
+        os.chdir(current_dir)
         return result
 
 
 @pytest.fixture
 def cli_runner(cli_context: CLIContext) -> MetricFlowCliRunner:  # noqa: D
-    return MetricFlowCliRunner(cli_context=cli_context)
+    return MetricFlowCliRunner(cli_context=cli_context, project_path=dbt_project_dir())
diff --git a/metricflow/test/fixtures/setup_fixtures.py b/metricflow/test/fixtures/setup_fixtures.py
index 8512802b01..9c936d0f22 100644
--- a/metricflow/test/fixtures/setup_fixtures.py
+++ b/metricflow/test/fixtures/setup_fixtures.py
@@ -5,6 +5,7 @@
 import os
 import warnings
 from dataclasses import dataclass
+from pathlib import Path
 
 import _pytest.config
 import pytest
@@ -170,6 +171,14 @@ def dialect_from_url(url: str) -> SqlDialect:
     return SqlDialect(dialect_protocol[0])
 
 
+def dbt_project_dir() -> str:
+    """Return the canonical path string for the dbt project dir in the test package.
+
+    This is necessary for configuring both the dbt adapter for integration tests and the project location for CLI tests.
+    """
+    return os.path.join(os.path.dirname(__file__), Path("dbt_projects", "metricflow_testing"))
+
+
 @pytest.fixture(scope="session", autouse=True)
 def warn_user_about_slow_tests_without_parallelism(  # noqa: D
     request: FixtureRequest,
diff --git a/metricflow/test/fixtures/sql_client_fixtures.py b/metricflow/test/fixtures/sql_client_fixtures.py
index 8a60588b0d..b54e019f4d 100644
--- a/metricflow/test/fixtures/sql_client_fixtures.py
+++ b/metricflow/test/fixtures/sql_client_fixtures.py
@@ -11,7 +11,7 @@
 from dbt.cli.main import dbtRunner
 
 from metricflow.protocols.sql_client import SqlClient
-from metricflow.test.fixtures.setup_fixtures import MetricFlowTestSessionState, dialect_from_url
+from metricflow.test.fixtures.setup_fixtures import MetricFlowTestSessionState, dbt_project_dir, dialect_from_url
 from metricflow.test.fixtures.sql_clients.adapter_backed_ddl_client import AdapterBackedDDLSqlClient
 from metricflow.test.fixtures.sql_clients.common_client import SqlDialect
 from metricflow.test.fixtures.sql_clients.ddl_sql_client import SqlClientWithDDLMethods
@@ -127,8 +127,7 @@ def __initialize_dbt() -> None:
     We use the debug command to initialize the profile and make it accessible. This has the nice property of
     triggering a failure with reasonable error messages in the event the dbt configs are not set up correctly.
     """
-    dbt_dir = os.path.join(os.path.dirname(__file__), "dbt_projects/metricflow_testing/")
-    dbtRunner().invoke(["debug"], project_dir=dbt_dir, PROFILES_DIR=dbt_dir)
+    dbtRunner().invoke(["debug"], project_dir=dbt_project_dir(), PROFILES_DIR=dbt_project_dir())
 
 
 def make_test_sql_client(url: str, password: str, schema: str) -> SqlClientWithDDLMethods:
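
Reviewer note (not part of either patch): the standalone sketch below illustrates the directory-switching pattern the reworked MetricFlowCliRunner.run relies on, namely invoking a Click command from a fixed project directory instead of touching and unlinking dbt_project.yml next to the tests. The run_in_directory helper and the hello command are hypothetical names used only for this illustration; the real code changes directory inline and targets dbt_project_dir().

from __future__ import annotations

import os
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator

import click
from click.testing import CliRunner


@contextmanager
def run_in_directory(path: Path) -> Iterator[None]:
    """Temporarily chdir into `path`, restoring the original working directory afterwards."""
    original_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(original_dir)


@click.command()
def hello() -> None:
    """Toy command standing in for the real MetricFlow CLI entry points."""
    click.echo(f"running from {os.getcwd()}")


if __name__ == "__main__":
    # Stand-in for dbt_project_dir(); any existing directory works for this sketch.
    project_dir = Path(__file__).parent
    runner = CliRunner()
    with run_in_directory(project_dir):
        result = runner.invoke(hello)
    assert result.exit_code == 0
    print(result.output)

Because no per-test files are created or deleted, concurrent pytest-xdist workers invoking commands this way do not race on a shared path.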