From d5feec0125019cf0ff6efa9b3dcffd98e5bb6161 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Tue, 10 Oct 2023 15:59:19 +0100 Subject: [PATCH 01/13] chore: Add a custom formatter for ehrQL Based on Python, but allows us to mark complete ehrQL dataset definitions distinctly. --- mkdocs.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 9a4f319c9..9dec305a0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -71,7 +71,10 @@ markdown_extensions: format: !!python/name:pymdownx.superfences.fence_code_format - toc: permalink: "🔗" - - pymdownx.highlight + - pymdownx.highlight: + extend_pygments_lang: + - name: ehrql + lang: python - pymdownx.superfences - pymdownx.tabbed: alternate_style: true From 3fd3da520f087aa29e5db9dd6a839ca18af389c3 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 2 Nov 2023 18:11:49 +0000 Subject: [PATCH 02/13] chore: Remove duplicate key in `mkdocs.yml` --- mkdocs.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/mkdocs.yml b/mkdocs.yml index 9dec305a0..71898819c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,7 +75,6 @@ markdown_extensions: extend_pygments_lang: - name: ehrql lang: python - - pymdownx.superfences - pymdownx.tabbed: alternate_style: true - pymdownx.keys From 9c8b8dbe1365a55722aedbd5f19c432b0fb4f2f3 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 2 Nov 2023 19:25:40 +0000 Subject: [PATCH 03/13] test: Add code to test the documentation examples This code has been subject to considerable work to get it into this form. However, it did not seem useful to retain the various approaches and versions of the code before this state. A quick guide to this code: * It finds any Markdown files in `docs/`. * It uses the SuperFences extension, as we do in the MkDocs configuration, to extract Markdown code blocks labelled with `ehrql` syntax. These are assumed to be self-contained dataset definitions. 
* The code blocks that will be tested should appear as code blocks in the documentation, by default (provided the CSS isn't changed to modify the appearance of code blocks somehow, which shouldn't be the case, because why would you?). They are identified in the parametrized tests by their ordinal fence number in the source file. * It finds any Python modules indicated by a `.py` extension. Python modules are assumed to be self-contained dataset definitions. * The found dataset definitions are run to generate a dataset, and the output is checked to see if it's a CSV. There is some monkeypatching necessary to make this work: * `codelist_from_csv()` relies on having CSV data available, and the checks on valid codelist codes are patched out. Without further work, we don't have any direct way of including data for inline dataset definitions in Markdown source, or of specifying which mock CSV data to use, as there is no established convention for examples to follow. #1697 proposes ideas to reduce this monkeypatching further. * The sandboxing code is monkeypatched out to use "unsafe" loading of dataset definitions. Without doing so, it is not possible to monkeypatch any other ehrQL code: the ehrQL is run in a subprocess otherwise. For more details and discussion, see the related PR for this code (#1648) and the previous PR (#1475), which this approach replaces. 
--- tests/docs/__init__.py | 0 tests/docs/test_complete_examples.py | 292 +++++++++++++++++++++++++++ 2 files changed, 292 insertions(+) create mode 100644 tests/docs/__init__.py create mode 100644 tests/docs/test_complete_examples.py diff --git a/tests/docs/__init__.py b/tests/docs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/docs/test_complete_examples.py b/tests/docs/test_complete_examples.py new file mode 100644 index 000000000..48247dddb --- /dev/null +++ b/tests/docs/test_complete_examples.py @@ -0,0 +1,292 @@ +import csv +import inspect +import typing +import unittest.mock +import uuid +from collections.abc import Generator, Iterator +from dataclasses import dataclass +from pathlib import Path + +import markdown +import mkdocs.config +import pytest + +import ehrql.main + + +@dataclass +class MarkdownFence: + """Represents a Markdown fence.""" + + source: str + language: str + + +class MarkdownFenceExtractor: + """Extracts fences from a markdown.Markdown object using the SuperFences extension. + + See https://facelessuser.github.io/pymdown-extensions/extensions/superfences/ + """ + + def __init__(self, content: str) -> None: + self.fences: list[MarkdownFence] = [] + self.extension_configs: typing.Mapping[ + str, typing.Any + ] = self._configure_superfences() + self._extract_fences(content) + + def _fence_null_format( + # Argument types are taken from mkdocs-code-validator, + # which has an MIT license, + # but they will be removed before merge. + self, + src: str, + language: str, + css_class: str | None, + options: typing.Mapping[str, typing.Any], + md: markdown.Markdown, + **kwargs, + ) -> str: + """Extract the fences in the same way + as the SuperFences extension does, and make them accessible to the test code. + + Returns an empty string. + + This null formatter exists only for this purpose. 
+ See https://facelessuser.github.io/pymdown-extensions/extensions/superfences/#formatters + + "All formatters should return a string as HTML." + + We don't require the formatted text, + only that this method is run and we can access the source + and language.""" + self.fences.append(MarkdownFence(source=src, language=language)) + return "" + + def _configure_superfences(self): + """Retrieves the existing extensions settings from the mkdocs.yml + configuration, replacing any custom SuperFences fences with a special + test custom fence to extract all fences.""" + config_path = Path(__file__).parents[2] / "mkdocs.yml" + config = mkdocs.config.load_config(config_file_path=str(config_path)) + assert "pymdownx.superfences" in config["markdown_extensions"] + config["mdx_configs"]["pymdownx.superfences"]["custom_fences"] = [ + { + # "name" specifies fences to extract. + "name": "*", + "class": "test", + "format": self._fence_null_format, + }, + ] + return config["mdx_configs"] + + def _extract_fences(self, content: str) -> None: + markdown.Markdown( + extensions=["pymdownx.superfences"], + extension_configs=self.extension_configs, + ).convert(content) + + +@dataclass +class DatasetDefinitionExample: + """Stores details of a complete ehrQL dataset definition example. + + The origin of such an example may be a Markdown fence, + or a standalone Python module.""" + + path: Path + # This fence number count includes all fences, + # not just the ehrQL fences. + # Standalone Python modules are not given a fence number. 
+ fence_number: int | None + source: str + + def relative_path(self) -> Path: + """Return the relative path of the dataset definition source file + to the source code root.""" + source_code_path = Path(__file__).parents[2] + return self.path.relative_to(source_code_path) + + +def discover_paths(glob_string: str) -> Generator[Path, None, None]: + """Generate a list of matching files for a glob in the documentation source path.""" + docs_path = Path(__file__).parents[2] / "docs" + return docs_path.glob(glob_string) + + +def find_complete_ehrql_examples_in_markdown( + file: typing.TextIO, +) -> Iterator[DatasetDefinitionExample]: + """Yields extracted code blocks labelled as ```ehrql from a Markdown file. + + Incomplete ehrQL dataset definitions should be labelled as ```python, + and not with ```ehrql.""" + f = MarkdownFenceExtractor(file.read()) + + for fence_number, fence in enumerate(f.fences, start=1): + if fence.language == "ehrql": + example = DatasetDefinitionExample( + path=Path(file.name), + source=fence.source, + fence_number=fence_number, + ) + yield example + + +def generate_complete_ehrql_examples() -> ( + Generator[DatasetDefinitionExample, None, None] +): + """Yields all complete ehrQL DatasetDefinitionExamples from the Markdown documentation.""" + markdown_paths = list(discover_paths("**/*.md")) + assert len(markdown_paths) > 0, "No Markdown files found" + + for p in markdown_paths: + with open(p) as f: + yield from find_complete_ehrql_examples_in_markdown(f) + + dataset_definition_source_paths = list(discover_paths("**/*.py")) + assert len(dataset_definition_source_paths) > 0, "No .py files found" + + for p in dataset_definition_source_paths: + with open(p) as f: + content = f.read() + assert len(content) > 0 + yield DatasetDefinitionExample( + path=Path(f.name), + fence_number=None, + source=content, + ) + + +def create_example_test_case_id(example: DatasetDefinitionExample) -> str: + """Returns a test case ID for pytest from a specific 
DatasetDefinitionExample.""" + test_id = f"{example.relative_path()}" + if example.fence_number is not None: + test_id += f"; fence {example.fence_number}" + return test_id + + +def validate_dataset_output(dataset_path: Path) -> None: + """Validates that an output dataset file is a CSV.""" + with open(dataset_path) as f: + csv_content = f.readlines() + + # If the dataset definition works, we should have a valid CSV. + assert len(csv_content) > 0, "CSV is empty for dataset" + + # Check we can read the CSV content. + csv_reader = csv.DictReader(csv_content) + for row in csv_reader: + pass + + +class DatasetDefinitionTestError(Exception): + pass + + +@pytest.mark.parametrize( + "example", + generate_complete_ehrql_examples(), + ids=create_example_test_case_id, +) +def test_ehrql_generate_dataset_example( + tmp_path: Path, example: DatasetDefinitionExample +) -> None: + tmp_filename_base = str(uuid.uuid4()) + + tmp_dataset_definition_path = tmp_path / (tmp_filename_base + ".py") + tmp_dataset_definition_path.write_text(example.source) + + tmp_dataset_path = tmp_path / (tmp_filename_base + ".csv") + + code_column_name = "code" + category_column_name = "category" + tmp_codelist_path = tmp_path / (tmp_filename_base + "_codelist.csv") + tmp_codelist_path.write_text( + f"{code_column_name},{category_column_name}\n" + "not_a_real_code!,not_a_real_category!" + ) + + codelist_fn = ehrql.codelist_from_csv + + def wrapped_codelist_from_csv(*args, **kwargs): + """Returns the result from ehrql.codelist_from_csv. + + This is used to monkeypatch the real ehrql.codelist_from_csv + so that we can use a mock CSV, but it: + + * validates the function arguments + * calls the real function with the mock CSV data + + Different documentation examples may refer to different CSV columns. + Because of this, we change the arguments passed to codelist_from_csv(). 
+ """ + codelist_fn_signature = inspect.signature(codelist_fn) + try: + codelist_fn_signature.bind(*args, **kwargs) + except TypeError as e: + e.add_note("codelist_from_csv() given incorrect arguments") + raise e + + return codelist_fn( + filename=tmp_codelist_path, + column=code_column_name, + category_column=category_column_name, + ) + + def wrapped_load_dataset_definition(definition_file, user_args, _): + """Wraps ehrql.load_dataset_definition to use the unsafe version + that runs the dataset definition in the same process, + without sandboxing. + + This is to remove the additional environ argument that is not used in + load_dataset_definition_unsafe.""" + return ehrql.loaders.load_dataset_definition_unsafe(definition_file, user_args) + + formatted_example = f"\nEXAMPLE FILENAME {example.path}\nEXAMPLE START\n{example.source}\nEXAMPLE END" + + with ( + # Patch out the sandbox for now to use the unsafe loader. + # This allows the subsequent monkeypatching of codelist_from_csv. + # By patching load_dataset_definition, + # we can still use the existing ehrql.main.generate_dataset function. + unittest.mock.patch( + "ehrql.main.load_dataset_definition", + wraps=wrapped_load_dataset_definition, + ), + unittest.mock.patch( + "ehrql.codelist_from_csv", + wraps=wrapped_codelist_from_csv, + ), + # There is no codelist code that satisfies constraints of all code systems, + # so patch out the validity check and just pass in a fake codelist. + unittest.mock.patch( + "ehrql.codes.BaseCode.__post_init__", + return_value=None, + ), + ): + try: + # No name needed to store a value: + # the output CSV gets written to a temporary file. 
+ ehrql.main.generate_dataset( + tmp_dataset_definition_path, + tmp_dataset_path, + dsn=None, + backend_class=None, + query_engine_class=None, + dummy_tables_path=None, + dummy_data_file=None, + environ={}, + user_args=(), + ) + except Exception as e: + raise DatasetDefinitionTestError( + f"generate_dataset failed for example: {formatted_example}" + ) from e + + try: + validate_dataset_output(tmp_dataset_path) + except Exception as e: + raise DatasetDefinitionTestError( + f"Check of output dataset CSV failed for example: \n{formatted_example}" + ) from e From eba714b8f51455f28ebbddcab495552b4ba9e06c Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 2 Nov 2023 22:54:36 +0000 Subject: [PATCH 04/13] test: Test the documentation example testing code The code used to test the documentation examples is perhaps a little more complicated than other tests, and uses a third-party extension to extract the Markdown fences. These tests: * check that the documentation test code that runs `generate_dataset()` works * check that the fence extraction code continues to behave as we expect; it's the pymdownx extension that does this * document the existing known behaviour of the fences extraction --- tests/docs/test_find_docs_examples.py | 197 ++++++++++++++++++ .../docs/test_run_generate_dataset_example.py | 91 ++++++++ 2 files changed, 288 insertions(+) create mode 100644 tests/docs/test_find_docs_examples.py create mode 100644 tests/docs/test_run_generate_dataset_example.py diff --git a/tests/docs/test_find_docs_examples.py b/tests/docs/test_find_docs_examples.py new file mode 100644 index 000000000..42db38923 --- /dev/null +++ b/tests/docs/test_find_docs_examples.py @@ -0,0 +1,197 @@ +import textwrap +from io import StringIO +from pathlib import Path + +import pytest + +from . import test_complete_examples + + +# The SuperFences extension that we use has its own test suite. 
+# The tests below cover only the most common cases +# and are to help document and catch changes in behaviour +# that we may be interested in. +@pytest.mark.parametrize( + "fence,expected_dataset_definition_example", + [ + pytest.param( + textwrap.dedent( + """\ + ```ehrql + ``` + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), + fence_number=1, + source="", + ), + ], + id="fence with no lines", + ), + pytest.param( + textwrap.dedent( + """\ + ```ehrql + some code + ``` + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), + fence_number=1, + source="some code", + ), + ], + id="fence with one line", + ), + pytest.param( + textwrap.dedent( + """\ + ```ehrql + some code + more code + ``` + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), + fence_number=1, + source="some code\nmore code", + ), + ], + id="fence with multiple lines", + ), + pytest.param( + textwrap.dedent( + """\ + ```ehrql + some code + ```ehrql + more code + ``` + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), + fence_number=1, + source="some code\n```ehrql\nmore code", + ), + ], + id="open fence", + ), + pytest.param( + textwrap.dedent( + """\ + some text + ```ehrql + some code + ``` + more text + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), + fence_number=1, + source="some code", + ), + ], + id="fence between text", + ), + pytest.param( + textwrap.dedent( + """\ + ```ehrql + some code + ``` + some text + ```ehrql + more code + ``` + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), fence_number=1, source="some code" + ), + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), fence_number=2, source="more code" + ), + ], + id="multiple fences", + ), + pytest.param( + textwrap.dedent( + """\ + > ```ehrql + some code + ``` + """ + ), + [ + 
test_complete_examples.DatasetDefinitionExample( + path=Path("test"), fence_number=1, source="some code" + ), + ], + id="fence in quote", + ), + # List marker must be followed by a blank line with at least one space, + # and the fence starts on next line. + # Use an explicit newline character so that we can: + # * keep the desired formatting + # * avoid complaints from tooling about trailing whitespace + pytest.param( + textwrap.dedent( + """\ + * \n + ```ehrql + some code + ``` + """ + ), + [ + test_complete_examples.DatasetDefinitionExample( + path=Path("test"), fence_number=1, source="some code" + ), + ], + id="fence in list", + ), + pytest.param( + textwrap.dedent( + """\ + ```ehrql + some code + """ + ), + [], + id="fence at end of file", + ), + pytest.param( + textwrap.dedent( + """\ + ```python + some code + ``` + """ + ), + [], + id="fence with non-matching syntax", + ), + ], +) +def test_find_docs_examples(fence, expected_dataset_definition_example): + example = StringIO(fence) + # Unlike file objects, StringIO objects do not have a name. + # In the relevant code being tested, + # we access the file's name to save somewhat redundantly passing the name. + example.name = "test" + + result = list( + test_complete_examples.find_complete_ehrql_examples_in_markdown(example) + ) + assert result == expected_dataset_definition_example diff --git a/tests/docs/test_run_generate_dataset_example.py b/tests/docs/test_run_generate_dataset_example.py new file mode 100644 index 000000000..6ff3af66b --- /dev/null +++ b/tests/docs/test_run_generate_dataset_example.py @@ -0,0 +1,91 @@ +import textwrap +import unittest + +import pytest + +from . 
import test_complete_examples + + +def test_run_generate_dataset_example(tmp_path): + example = test_complete_examples.DatasetDefinitionExample( + path="test", + fence_number=1, + source=textwrap.dedent( + """\ + from ehrql import create_dataset + from ehrql.tables.beta.tpp import patients + + dataset = create_dataset() + dataset.define_population(patients.exists_for_patient()) + """ + ), + ) + test_complete_examples.test_ehrql_generate_dataset_example(tmp_path, example) + + +def test_run_generate_dataset_example_failing(tmp_path): + example = test_complete_examples.DatasetDefinitionExample( + path="test", + fence_number=1, + source=textwrap.dedent( + """\ + from ehrql import create_dataset + + dataset = create_dataset() + dataset.define_population(not_a_function()) + """ + ), + ) + with pytest.raises(test_complete_examples.DatasetDefinitionTestError) as exc_info: + test_complete_examples.test_ehrql_generate_dataset_example(tmp_path, example) + assert type(exc_info.value) is test_complete_examples.DatasetDefinitionTestError + + +def test_run_generate_dataset_example_failing_codelist_from_csv_call(tmp_path): + example = test_complete_examples.DatasetDefinitionExample( + path="test", + fence_number=1, + source=textwrap.dedent( + """\ + from ehrql import codelist_from_csv, create_dataset + from ehrql.tables.beta.tpp import patients + + codes = codelist_from_csv() + + dataset = create_dataset() + dataset.define_population(patients.exists_for_patient()) + """ + ), + ) + + with pytest.raises( + test_complete_examples.DatasetDefinitionTestError, + match=r"generate_dataset failed for example", + ) as exc_info: + test_complete_examples.test_ehrql_generate_dataset_example(tmp_path, example) + assert type(exc_info.value) is test_complete_examples.DatasetDefinitionTestError + + +def test_run_generate_dataset_example_gives_unreadable_csv(tmp_path): + example = test_complete_examples.DatasetDefinitionExample( + path="test", + fence_number=1, + source=textwrap.dedent( + """\ + 
from ehrql import create_dataset + from ehrql.tables.beta.tpp import patients + + dataset = create_dataset() + dataset.define_population(patients.exists_for_patient()) + """ + ), + ) + with unittest.mock.patch("ehrql.main.generate_dataset", return_value=None): + with pytest.raises( + test_complete_examples.DatasetDefinitionTestError, + match=r"Check of output dataset CSV failed for example", + ) as exc_info: + test_complete_examples.test_ehrql_generate_dataset_example( + tmp_path, example + ) + assert type(exc_info.value) is test_complete_examples.DatasetDefinitionTestError From fac9dd666de4255b1c2ae45f306887a442344747 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Wed, 18 Oct 2023 15:59:11 +0100 Subject: [PATCH 05/13] chore: Add `just` recipe for testing docs examples --- Justfile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Justfile b/Justfile index 5ff19cd50..7f829c486 100644 --- a/Justfile +++ b/Justfile @@ -180,6 +180,10 @@ test-backend-validation *ARGS: devenv test-docker *ARGS: devenv $BIN/python -m pytest tests/docker {{ ARGS }} +# Run the docs examples tests only. Optional args are passed to pytest. +test-docs-examples *ARGS: devenv + $BIN/python -m pytest tests/docs {{ ARGS }} + # Run the integration tests only. Optional args are passed to pytest. test-integration *ARGS: devenv $BIN/python -m pytest tests/integration {{ ARGS }} From daa964e8358984887b6507586410925385bd0f77 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Tue, 10 Oct 2023 16:00:18 +0100 Subject: [PATCH 06/13] docs: Use `ehrql` in complete dataset definitions In the examples; to enable tests to run for them. 
--- docs/explanation/running-ehrql.md | 2 +- docs/how-to/examples.md | 74 +++++++++---------- docs/index.md | 2 +- .../writing-a-dataset-definition/index.md | 2 +- 4 files changed, 40 insertions(+), 40 deletions(-) diff --git a/docs/explanation/running-ehrql.md b/docs/explanation/running-ehrql.md index 2cc38adaf..f6cdbb6b1 100644 --- a/docs/explanation/running-ehrql.md +++ b/docs/explanation/running-ehrql.md @@ -187,7 +187,7 @@ to produce an output file that you can inspect. into a new file called `dataset_definition.py` and save it in your `learning-ehrql` directory: -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients, medications diff --git a/docs/how-to/examples.md b/docs/how-to/examples.md index ecfca6188..c60b42b17 100644 --- a/docs/how-to/examples.md +++ b/docs/how-to/examples.md @@ -48,7 +48,7 @@ You can see an example of [how to access these categories within your dataset de ### Finding each patient's age -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients @@ -58,7 +58,7 @@ dataset.age = patients.age_on("2023-01-01") Alternatively, using a native Python `date`: -```python +```ehrql from datetime import date from ehrql import create_dataset from ehrql.tables.beta.core import patients @@ -69,7 +69,7 @@ dataset.age = patients.age_on(date(2023, 1, 1)) Or using an `index_date` variable: -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients @@ -80,7 +80,7 @@ dataset.age = patients.age_on(index_date) ### Assigning each patient an age band -```python +```ehrql from ehrql import create_dataset, case, when from ehrql.tables.beta.core import patients @@ -98,7 +98,7 @@ dataset.age_band = case( ### Finding each patient's date of birth -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients @@ -108,7 +108,7 @@ dataset.date_of_birth = patients.date_of_birth ### Finding each patient's date of death 
in their primary care record -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients @@ -125,7 +125,7 @@ By contrast, cause of death is often not accurate in the primary care record so ### Finding each patient's date, place, and cause of death from ONS records -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import ons_deaths @@ -140,7 +140,7 @@ dataset.cause_of_death = last_ons_death.cause_of_death_01 ### Finding each patient's sex -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients @@ -154,7 +154,7 @@ The possible values are "female", "male", "intersex", and "unknown". Ethnicity can be defined using a codelist. There are a lot of individual codes that can used to indicate a patients' fine-grained ethnicity. To make analysis more manageable, ethnicity is therefore commonly grouped into higher level categories. Above, we described how you can [import codelists that have a category column](#some-examples-using-codelist_from_csv). 
You can use a codelist with a category column to map clinical event codes for ethnicity to higher level categories as in this example: -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import clinical_events from ehrql import codelist_from_csv @@ -183,7 +183,7 @@ latest_ethnicity_group = dataset.latest_ethnicity_code.to_category( ### Finding each patient's IMD rank -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import addresses @@ -198,7 +198,7 @@ See [this code comment](https://github.com/opensafely-core/ehrql/blob/d29ff8ab2c ### Calculating each patient's IMD quintile -```python +```ehrql from ehrql import create_dataset, case, when from ehrql.tables.beta.tpp import addresses @@ -216,7 +216,7 @@ dataset.imd_quintile = case( ### Finding each patient's rural/urban classification -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import addresses @@ -262,7 +262,7 @@ dataset.msoa = address.msoa ### Finding each patient's practice's pseudonymised identifier -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import practice_registrations @@ -272,7 +272,7 @@ dataset.practice = practice_registrations.for_patient_on("2023-01-01").practice_ ### Finding each patient's practice's STP -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import practice_registrations @@ -307,7 +307,7 @@ dataset.region = registration.nuts1_region_name ### Does each patient have a clinical event matching a code in a codelist? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -321,7 +321,7 @@ dataset.has_had_asthma_diagnosis = clinical_events.where( ### Does each patient have a clinical event matching a code in a codelist in a time period? 
-```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -337,7 +337,7 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ### Does each patient have a medication event matching some criteria? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import medications @@ -353,7 +353,7 @@ dataset.has_recent_statin_prescription = medications.where( ### Does each patient have a hospitalisation event matching some criteria? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.tpp import hospital_admissions @@ -369,7 +369,7 @@ dataset.has_recent_cardiac_admission = hospital_admissions.where( ## How many events does each patient have matching some criteria? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import medications @@ -390,7 +390,7 @@ Frames can be sorted by calling the `sort_by()` method with the column to sort t ### What is the earliest/latest clinical event matching some criteria? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -406,7 +406,7 @@ dataset.first_asthma_diagnosis_date = clinical_events.where( ).first_for_patient().date ``` -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -424,7 +424,7 @@ dataset.last_asthma_diagnosis_date = clinical_events.where( ### What is the earliest/latest medication event matching some criteria? 
-```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import medications @@ -440,7 +440,7 @@ dataset.first_statin_prescription_date = medications.where( ).first_for_patient().date ``` -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import medications @@ -474,7 +474,7 @@ dataset.first_cardiac_hospitalisation_date = hospital_admissions.where( ).first_for_patient().date ``` -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import medications @@ -492,7 +492,7 @@ dataset.last_cardiac_hospitalisation_date = medications.where( ### What is the clinical event, matching some criteria, with the least/greatest value? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -517,7 +517,7 @@ dataset.date_of_max_hba1c_observed = clinical_events.where(clinical_events.snome ### What is the code of the first/last clinical event matching some criteria? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -535,7 +535,7 @@ dataset.first_asthma_diagnosis_date = clinical_events.where( ### What is the date of the first/last clinical event matching some criteria? -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -553,7 +553,7 @@ dataset.first_asthma_diagnosis_date = clinical_events.where( ### What is the code and date of the first/last clinical event matching some criteria? 
-```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -576,7 +576,7 @@ dataset.first_asthma_diagnosis_date = first_asthma_diagnosis.date ### Finding the code of the first medication after the first clinical event matching some criteria -```python +```ehrql from ehrql import create_dataset, codelist_from_csv, weeks from ehrql.tables.beta.core import clinical_events, medications @@ -603,7 +603,7 @@ dataset.count_ics_prescriptions_2wks_post_diagnosis = medications.where( ### Finding the mean observed value of clinical events matching some criteria -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -645,7 +645,7 @@ clinical_events.where(clinical_events.snomedct_code.is_in(hba1c_codelist) ### Finding events within a fixed date range -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -661,7 +661,7 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ### Finding events within a date range plus a constant -```python +```ehrql from ehrql import create_dataset, codelist_from_csv, weeks from ehrql.tables.beta.core import clinical_events @@ -679,7 +679,7 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ### Finding events within a dynamic date range -```python +```ehrql from ehrql import create_dataset, codelist_from_csv, months from ehrql.tables.beta.core import clinical_events @@ -704,7 +704,7 @@ dataset.count_of_hba1c_tests_6mo_post_first_diabetes_code = clinical_events.wher Data quality issues with many sources may result in events apparently happening in future dates (e.g. 9999-01-01), it is useful to filter these from your analysis. 
-```python +```ehrql from datetime import date from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -726,7 +726,7 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ### Finding the year an event occurred -```python +```ehrql from datetime import date from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events @@ -744,7 +744,7 @@ dataset.year_of_first = clinical_events.where( ### Finding prescriptions made in particular months of the year -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import medications @@ -762,7 +762,7 @@ dataset.winter_amoxicillin_count = medications.where( ### Finding the number of weeks between two events -```python +```ehrql from ehrql import create_dataset, codelist_from_csv from ehrql.tables.beta.core import clinical_events diff --git a/docs/index.md b/docs/index.md index 9bbde79aa..829770222 100644 --- a/docs/index.md +++ b/docs/index.md @@ -50,7 +50,7 @@ Slack channel. The following dataset definition selects the date and the code of each patient's most recent asthma medication, for all patients born on or before 31 December 1999. -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients, medications diff --git a/docs/tutorial/writing-a-dataset-definition/index.md b/docs/tutorial/writing-a-dataset-definition/index.md index 25eb92497..9b509b77f 100644 --- a/docs/tutorial/writing-a-dataset-definition/index.md +++ b/docs/tutorial/writing-a-dataset-definition/index.md @@ -2,7 +2,7 @@ In this section, you will write the following dataset definition. It selects the date and the code of each patient's most recent asthma medication, for all patients born on or before 31 December 1999. 
-```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.core import patients, medications From 56e6a2ebc4ab585f9f0c4f0fc564f0460bf331b0 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Wed, 18 Oct 2023 12:32:52 +0100 Subject: [PATCH 07/13] docs: Fix table attribute names in examples --- docs/how-to/examples.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/how-to/examples.md b/docs/how-to/examples.md index c60b42b17..6180bdd8c 100644 --- a/docs/how-to/examples.md +++ b/docs/how-to/examples.md @@ -237,25 +237,25 @@ The meaning of this value is as follows: ### Finding each patient's MSOA -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import addresses dataset = create_dataset() -dataset.msoa = addresses.for_patient_on("2023-01-01").msoa +dataset.msoa_code = addresses.for_patient_on("2023-01-01").msoa_code ``` ### Finding multiple attributes of each patient's address -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import addresses dataset = create_dataset() address = addresses.for_patient_on("2023-01-01") -dataset.imd = address.imd +dataset.imd_rounded = address.imd_rounded dataset.rural_urban_classification = address.rural_urban_classification -dataset.msoa = address.msoa +dataset.msoa_code = address.msoa_code ``` ## Finding attributes related to each patient's GP practice as of a given date @@ -282,17 +282,17 @@ dataset.stp = practice_registrations.for_patient_on("2023-01-01").practice_stp ### Finding each patient's practice's region -```python +```ehrql from ehrql import create_dataset from ehrql.tables.beta.tpp import practice_registrations dataset = create_dataset() -dataset.region = practice_registrations.for_patient_on("2023-01-01").nuts1_region_name +dataset.region = practice_registrations.for_patient_on("2023-01-01").practice_nuts1_region_name ``` ### Finding multiple attributes of each patient's practice -```python +```ehrql from 
ehrql import create_dataset from ehrql.tables.beta.tpp import practice_registrations @@ -300,7 +300,7 @@ dataset = create_dataset() registration = practice_registrations.for_patient_on("2023-01-01") dataset.practice = registration.practice_pseudo_id dataset.stp = registration.practice_stp -dataset.region = registration.nuts1_region_name +dataset.region = registration.practice_nuts1_region_name ``` ## Does each patient have an event matching some criteria? From f543a09f7097d0ddbcec947a5905aeb760ec7c96 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 26 Oct 2023 13:03:52 +0100 Subject: [PATCH 08/13] docs: Fix another example slightly This doesn't fail testing, but is vestigial as it's written: the variable `latest_ethnicity_group` isn't actually used. --- docs/how-to/examples.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how-to/examples.md b/docs/how-to/examples.md index 6180bdd8c..5fc4dec96 100644 --- a/docs/how-to/examples.md +++ b/docs/how-to/examples.md @@ -174,7 +174,7 @@ dataset.latest_ethnicity_code = ( .last_for_patient() .snomedct_code ) -latest_ethnicity_group = dataset.latest_ethnicity_code.to_category( +dataset.latest_ethnicity_group = dataset.latest_ethnicity_code.to_category( ethnicity_codelist ) ``` From 0f85f1fc742bafc45c81a93dda98a7244f89778c Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 26 Oct 2023 13:26:09 +0100 Subject: [PATCH 09/13] docs: Add `define_population()` to examples This makes these examples complete and working. 
--- docs/how-to/examples.md | 118 +++++++++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 39 deletions(-) diff --git a/docs/how-to/examples.md b/docs/how-to/examples.md index 5fc4dec96..27640f062 100644 --- a/docs/how-to/examples.md +++ b/docs/how-to/examples.md @@ -54,6 +54,7 @@ from ehrql.tables.beta.core import patients dataset = create_dataset() dataset.age = patients.age_on("2023-01-01") +dataset.define_population(patients.exists_for_patient()) ``` Alternatively, using a native Python `date`: @@ -65,6 +66,7 @@ from ehrql.tables.beta.core import patients dataset = create_dataset() dataset.age = patients.age_on(date(2023, 1, 1)) +dataset.define_population(patients.exists_for_patient()) ``` Or using an `index_date` variable: @@ -76,6 +78,7 @@ from ehrql.tables.beta.core import patients index_date = "2023-01-01" dataset = create_dataset() dataset.age = patients.age_on(index_date) +dataset.define_population(patients.exists_for_patient()) ``` ### Assigning each patient an age band @@ -94,6 +97,7 @@ dataset.age_band = case( when(age >= 80).then("80+"), default="missing", ) +dataset.define_population(patients.exists_for_patient()) ``` ### Finding each patient's date of birth @@ -104,6 +108,7 @@ from ehrql.tables.beta.core import patients dataset = create_dataset() dataset.date_of_birth = patients.date_of_birth +dataset.define_population(patients.exists_for_patient()) ``` ### Finding each patient's date of death in their primary care record @@ -114,6 +119,7 @@ from ehrql.tables.beta.core import patients dataset = create_dataset() dataset.date_of_death = patients.date_of_death +dataset.define_population(patients.exists_for_patient()) ``` :notepad_spiral: This value comes from the patient's EHR record. 
@@ -127,13 +133,14 @@ By contrast, cause of death is often not accurate in the primary care record so ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.core import ons_deaths +from ehrql.tables.beta.core import ons_deaths, patients dataset = create_dataset() last_ons_death = ons_deaths.sort_by(ons_deaths.date).last_for_patient() dataset.date_of_death = last_ons_death.date dataset.place_of_death = last_ons_death.place dataset.cause_of_death = last_ons_death.cause_of_death_01 +dataset.define_population(patients.exists_for_patient()) ``` :notepad_spiral: There are currently [multiple](https://github.com/opensafely-core/ehrql/blob/d29ff8ab2cebf3522258c408f8225b7a76f7b6f2/ehrql/tables/beta/core.py#L78-L92) cause of death fields. We aim to resolve these to a single feature in the future. @@ -146,6 +153,7 @@ from ehrql.tables.beta.core import patients dataset = create_dataset() dataset.sex = patients.sex +dataset.define_population(patients.exists_for_patient()) ``` The possible values are "female", "male", "intersex", and "unknown". @@ -156,7 +164,7 @@ Ethnicity can be defined using a codelist. 
There are a lot of individual codes t ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients from ehrql import codelist_from_csv dataset = create_dataset() @@ -177,6 +185,7 @@ dataset.latest_ethnicity_code = ( dataset.latest_ethnicity_group = dataset.latest_ethnicity_code.to_category( ethnicity_codelist ) +dataset.define_population(patients.exists_for_patient()) ``` ## Finding attributes related to each patient's address as of a given date @@ -185,10 +194,11 @@ dataset.latest_ethnicity_group = dataset.latest_ethnicity_code.to_category( ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import addresses +from ehrql.tables.beta.tpp import addresses, patients dataset = create_dataset() dataset.imd = addresses.for_patient_on("2023-01-01").imd_rounded +dataset.define_population(patients.exists_for_patient()) ``` The original IMD ranking is rounded to the nearest 100. @@ -200,7 +210,7 @@ See [this code comment](https://github.com/opensafely-core/ehrql/blob/d29ff8ab2c ```ehrql from ehrql import create_dataset, case, when -from ehrql.tables.beta.tpp import addresses +from ehrql.tables.beta.tpp import addresses, patients dataset = create_dataset() imd = addresses.for_patient_on("2023-01-01").imd_rounded @@ -212,16 +222,18 @@ dataset.imd_quintile = case( when(imd < int(32844 * 5 / 5)).then("5 (least deprived)"), default="unknown" ) +dataset.define_population(patients.exists_for_patient()) ``` ### Finding each patient's rural/urban classification ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import addresses +from ehrql.tables.beta.tpp import addresses, patients dataset = create_dataset() dataset.rural_urban = addresses.for_patient_on("2023-01-01").rural_urban_classification +dataset.define_population(patients.exists_for_patient()) ``` The meaning of this value is as follows: @@ -239,23 +251,25 @@ The meaning of this value is as 
follows: ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import addresses +from ehrql.tables.beta.tpp import addresses, patients dataset = create_dataset() dataset.msoa_code = addresses.for_patient_on("2023-01-01").msoa_code +dataset.define_population(patients.exists_for_patient()) ``` ### Finding multiple attributes of each patient's address ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import addresses +from ehrql.tables.beta.tpp import addresses, patients dataset = create_dataset() address = addresses.for_patient_on("2023-01-01") dataset.imd_rounded = address.imd_rounded dataset.rural_urban_classification = address.rural_urban_classification dataset.msoa_code = address.msoa_code +dataset.define_population(patients.exists_for_patient()) ``` ## Finding attributes related to each patient's GP practice as of a given date @@ -264,43 +278,47 @@ dataset.msoa_code = address.msoa_code ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import practice_registrations +from ehrql.tables.beta.tpp import practice_registrations, patients dataset = create_dataset() dataset.practice = practice_registrations.for_patient_on("2023-01-01").practice_pseudo_id +dataset.define_population(patients.exists_for_patient()) ``` ### Finding each patient's practice's STP ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import practice_registrations +from ehrql.tables.beta.tpp import practice_registrations, patients dataset = create_dataset() dataset.stp = practice_registrations.for_patient_on("2023-01-01").practice_stp +dataset.define_population(patients.exists_for_patient()) ``` ### Finding each patient's practice's region ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import practice_registrations +from ehrql.tables.beta.tpp import practice_registrations, patients dataset = create_dataset() dataset.region = practice_registrations.for_patient_on("2023-01-01").practice_nuts1_region_name 
+dataset.define_population(patients.exists_for_patient()) ``` ### Finding multiple attributes of each patient's practice ```ehrql from ehrql import create_dataset -from ehrql.tables.beta.tpp import practice_registrations +from ehrql.tables.beta.tpp import practice_registrations, patients dataset = create_dataset() registration = practice_registrations.for_patient_on("2023-01-01") dataset.practice = registration.practice_pseudo_id dataset.stp = registration.practice_stp dataset.region = registration.practice_nuts1_region_name +dataset.define_population(patients.exists_for_patient()) ``` ## Does each patient have an event matching some criteria? @@ -309,7 +327,7 @@ dataset.region = registration.practice_nuts1_region_name ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -317,13 +335,14 @@ dataset = create_dataset() dataset.has_had_asthma_diagnosis = clinical_events.where( clinical_events.snomedct_code.is_in(asthma_codelist) ).exists_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Does each patient have a clinical event matching a code in a codelist in a time period? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -333,13 +352,14 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ).where( clinical_events.date.is_on_or_between("2022-07-01", "2023-01-01") ).exists_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Does each patient have a medication event matching some criteria? 
```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import medications +from ehrql.tables.beta.core import medications, patients statin_medications = codelist_from_csv("XXX", column="YYY") @@ -349,13 +369,14 @@ dataset.has_recent_statin_prescription = medications.where( ).where( medications.date.is_on_or_between("2022-07-01", "2023-01-01") ).exists_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Does each patient have a hospitalisation event matching some criteria? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.tpp import hospital_admissions +from ehrql.tables.beta.tpp import hospital_admissions, patients cardiac_diagnosis_codes = codelist_from_csv("XXX", column="YYY") @@ -365,13 +386,14 @@ dataset.has_recent_cardiac_admission = hospital_admissions.where( ).where( hospital_admissions.admission_date.is_on_or_between("2022-07-01", "2023-01-01") ).exists_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ## How many events does each patient have matching some criteria? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import medications +from ehrql.tables.beta.core import medications, patients statin_medications = codelist_from_csv("XXX", column="YYY") @@ -381,6 +403,7 @@ dataset.number_of_statin_prescriptions_in_last_year = medications.where( ).where( medications.date.is_on_or_between("2022-01-01", "2023-01-01") ).count_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ## What is the first/last event matching some criteria? 
@@ -392,7 +415,7 @@ Frames can be sorted by calling the `sort_by()` method with the column to sort t ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -404,11 +427,12 @@ dataset.first_asthma_diagnosis_date = clinical_events.where( ).sort_by( clinical_events.date ).first_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -420,13 +444,14 @@ dataset.last_asthma_diagnosis_date = clinical_events.where( ).sort_by( clinical_events.date ).last_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ### What is the earliest/latest medication event matching some criteria? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import medications +from ehrql.tables.beta.core import medications, patients statin_medications = codelist_from_csv("XXX", column="YYY") @@ -438,11 +463,12 @@ dataset.first_statin_prescription_date = medications.where( ).sort_by( medications.date ).first_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import medications +from ehrql.tables.beta.core import medications, patients statin_medications = codelist_from_csv("XXX", column="YYY") @@ -454,13 +480,14 @@ dataset.last_statin_prescription_date = medications.where( ).sort_by( medications.date ).last_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ### What is the earliest/latest hospitalisation event matching some criteria? 
```python from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.tpp import hospital_admissions +from ehrql.tables.beta.tpp import hospital_admissions, patients cardiac_diagnosis_codes = codelist_from_csv("XXX", column="YYY") @@ -472,11 +499,12 @@ dataset.first_cardiac_hospitalisation_date = hospital_admissions.where( ).sort_by( hospital_admissions.date ).first_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import medications +from ehrql.tables.beta.core import medications, patients cardiac_diagnosis_codes = codelist_from_csv("XXX", column="YYY") @@ -488,13 +516,14 @@ dataset.last_cardiac_hospitalisation_date = medications.where( ).sort_by( medications.date ).last_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ### What is the clinical event, matching some criteria, with the least/greatest value? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients hba1c_codelist = codelist_from_csv("XXX", column="YYY") @@ -511,6 +540,7 @@ dataset.date_of_max_hba1c_observed = clinical_events.where(clinical_events.snome ).sort_by( clinical_events.date ).last_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ## Getting properties of an event matching some criteria @@ -519,7 +549,7 @@ dataset.date_of_max_hba1c_observed = clinical_events.where(clinical_events.snome ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -531,13 +561,14 @@ dataset.first_asthma_diagnosis_date = clinical_events.where( ).sort_by( clinical_events.date ).first_for_patient().snomedct_code 
+dataset.define_population(patients.exists_for_patient()) ``` ### What is the date of the first/last clinical event matching some criteria? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -549,13 +580,14 @@ dataset.first_asthma_diagnosis_date = clinical_events.where( ).sort_by( clinical_events.date ).first_for_patient().date +dataset.define_population(patients.exists_for_patient()) ``` ### What is the code and date of the first/last clinical event matching some criteria? ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -569,7 +601,7 @@ first_asthma_diagnosis = clinical_events.where( ).first_for_patient() dataset.first_asthma_diagnosis_code = first_asthma_diagnosis.snomedct_code dataset.first_asthma_diagnosis_date = first_asthma_diagnosis.date - +dataset.define_population(patients.exists_for_patient()) ``` ## Finding events occuring close in time to another event @@ -578,7 +610,7 @@ dataset.first_asthma_diagnosis_date = first_asthma_diagnosis.date ```ehrql from ehrql import create_dataset, codelist_from_csv, weeks -from ehrql.tables.beta.core import clinical_events, medications +from ehrql.tables.beta.core import clinical_events, medications, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") inhaled_corticosteroid_codelist = codelist_from_csv("XXX", column="YYY") @@ -597,6 +629,7 @@ dataset.count_ics_prescriptions_2wks_post_diagnosis = medications.where( ).where( medications.date.is_on_or_between(first_asthma_diagnosis_date,first_asthma_diagnosis_date + weeks(2)) ).count_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ## Performing arithmetic on numeric 
values of clinical events @@ -605,7 +638,7 @@ dataset.count_ics_prescriptions_2wks_post_diagnosis = medications.where( ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients hba1c_codelist = codelist_from_csv("XXX", column="YYY") @@ -615,13 +648,14 @@ dataset.mean_hba1c = clinical_events.where( ).where( clinical_events.date.is_on_or_after("2022-07-01") ).numeric_value.mean_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Finding the observed value of clinical events matching some criteria expressed relative to another value ```python from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients hba1c_codelist = codelist_from_csv("XXX", column="YYY") @@ -639,6 +673,7 @@ clinical_events.where(clinical_events.snomedct_code.is_in(hba1c_codelist) ).sort_by( clinical_events.date ).numeric_value.mean_for_patient()) +dataset.define_population(patients.exists_for_patient()) ``` ## Finding events within a date range @@ -647,7 +682,7 @@ clinical_events.where(clinical_events.snomedct_code.is_in(hba1c_codelist) ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -657,13 +692,14 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ).where( clinical_events.date.is_on_or_between("2022-07-01", "2023-01-01") ).exists_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Finding events within a date range plus a constant ```ehrql from ehrql import create_dataset, codelist_from_csv, weeks -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = 
codelist_from_csv("XXX", column="YYY") @@ -675,13 +711,14 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ).where( clinical_events.date.is_on_or_between(index_date, index_date + weeks(2)) ).exists_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Finding events within a dynamic date range ```ehrql from ehrql import create_dataset, codelist_from_csv, months -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients diabetes_codelist = codelist_from_csv("XXX", column="YYY") hba1c_codelist = codelist_from_csv("XXX", column="YYY") @@ -698,6 +735,7 @@ dataset.count_of_hba1c_tests_6mo_post_first_diabetes_code = clinical_events.wher ).where( clinical_events.date.is_on_or_between(first_diabetes_code_date, first_diabetes_code_date + months(6)) ).count_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Excluding events which have happened in the future @@ -707,7 +745,7 @@ Data quality issues with many sources may result in events apparently happening ```ehrql from datetime import date from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -719,7 +757,7 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ).where( clinical_events.date < date.today() ).exists_for_patient() - +dataset.define_population(patients.exists_for_patient()) ``` ## Extracting parts of dates and date differences @@ -729,7 +767,7 @@ dataset.has_recent_asthma_diagnosis = clinical_events.where( ```ehrql from datetime import date from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") @@ -739,14 +777,14 @@ dataset.year_of_first = 
clinical_events.where( ).sort_by( clinical_events.date ).first_for_patient().date.year - +dataset.define_population(patients.exists_for_patient()) ``` ### Finding prescriptions made in particular months of the year ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import medications +from ehrql.tables.beta.core import medications, patients amoxicillin_codelist = codelist_from_csv("XXX", column="YYY") @@ -758,13 +796,14 @@ dataset.winter_amoxicillin_count = medications.where( ).where( medications.date.month.is_in(winter_months) ).count_for_patient() +dataset.define_population(patients.exists_for_patient()) ``` ### Finding the number of weeks between two events ```ehrql from ehrql import create_dataset, codelist_from_csv -from ehrql.tables.beta.core import clinical_events +from ehrql.tables.beta.core import clinical_events, patients asthma_codelist = codelist_from_csv("XXX", column="YYY") asthma_review_codelist = codelist_from_csv("XXX", column="YYY") @@ -781,4 +820,5 @@ first_asthma_review_date = clinical_events.where( ).sort_by(clinical_events.date).first_for_patient().date dataset.weeks_between_diagnosis_and_review = (first_asthma_review_date - first_asthma_diagnosis_date).weeks +dataset.define_population(patients.exists_for_patient()) ``` From a4f5481eb023f81e54f9e4ac303069b42aed8416 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 26 Oct 2023 13:27:04 +0100 Subject: [PATCH 10/13] docs: Explain `define_population` in examples Briefly; to give the lines more context. It is less likely a real user would want to do what we've done to satisfy the tests. But it does make the examples simpler than adding arbitrary population constraints would.
--- docs/how-to/examples.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/how-to/examples.md b/docs/how-to/examples.md index 27640f062..9804981ef 100644 --- a/docs/how-to/examples.md +++ b/docs/how-to/examples.md @@ -8,6 +8,18 @@ to see a list of the examples, and then jump to a specific example of interest. +## Understanding these examples + +### The populations defined with `define_population()` + +In each of these examples, +we specify that the population is **all patients** +via `dataset.define_population(patients.exists_for_patient())`. + +In practice, +you will likely want to adapt an example to filter to a specific population of interest. +Refer to the [`define_population()` documentation](https://docs.opensafely.org/ehrql/reference/language/#Dataset.define_population). + ### Some examples using `codelist_from_csv()` :warning: Some examples refer to CSV codelists using the From 01dbf8636b043aeb0bb8a98e36f903444bb23de2 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Tue, 17 Oct 2023 13:59:31 +0100 Subject: [PATCH 11/13] docs: Add docs testing info to `DEVELOPERS.md` Briefly document how the documentation examples are tested. --- DEVELOPERS.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 5eafb7c9d..7e86cca8a 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -26,6 +26,7 @@ Tests are divided into the following categories.
acceptance
tests which demonstrate how ehrQL is used and check compatibility with real studies
integration
tests of detailed code logic that require a database
docker
tests of the ehrQL docker image
+
docs
tests of the documentation examples
Each category lives in its own directory (for example `tests/unit`) and has its own `just` command to run it (for @@ -299,6 +300,52 @@ generated markdown files. It is a developer's responsibility to update the gener their PR if required. There is also a CI step that will check that the documentation is up to date. +### Testing dataset definitions included in the documentation + +All of the example tests can be run with: + + just test-docs-examples + +* Examples to be tested are run with `generate_dataset()`. +* Dataset definitions may be included inline in Markdown files in `docs/`, + labelled as code blocks with the `ehrql` syntax, + or as Python `.py` files in `docs/`. + +#### Examples using `codelist_from_csv()` + +For testing examples, +`codelist_from_csv()` is currently patched out to work without any CSV, +and codelist codes are not validated. + +The function signature of `codelist_from_csv()` calls from examples *is* checked. + +This may be improved in future to make the testing more rigorous; +see #1694. + +#### Inline code blocks (Markdown fences) + +Examples in the documentation Markdown source will be tested as part of the test suite +if you place complete examples in a code block with the `ehrql` syntax label: `` ```ehrql `` + +This will still highlight the code as if it were Python. + +:warning: The `ehrql` syntax label is for inline and complete ehrQL blocks only. + +We use the SuperFences extension for extracting Markdown fences. +Refer to the [SuperFences documentation](https://facelessuser.github.io/pymdown-extensions/extensions/superfences/#nested-fence-format) for more details of the fence format. + +#### Dataset definitions as included Python files + +Python files in the `docs/` directory are assumed to be working dataset definitions. + +They are also tested in the test suite. + +If included in the documentation using the snippet syntax, +they must be used with a `python` syntax label.
+(If they were labelled as `ehrql`, +the snippet line itself would be extracted from the Markdown, +and treated as a dataset definition.) + ### Updating the main OpenSAFELY documentation repository Merges to the main branch in this repo trigger a [deployment of the main OpenSAFELY documentation via a Github Action](https://github.com/opensafely-core/ehrql/actions/workflows/deploy-documentation.yml). From 39285bd5f6a39feb742407a81774977734fe3d98 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Thu, 2 Nov 2023 20:22:37 +0000 Subject: [PATCH 12/13] chore: Remove type annotations In code review, it was asked why these were included. The guidelines in `DEVELOPERS.md` say: > And developers should feel free to use them wherever this aids clarity > vs a docstring or a comment. I personally find them useful, but if that's the wider preference, then this is OK. --- tests/docs/test_complete_examples.py | 45 ++++++++++------------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/tests/docs/test_complete_examples.py b/tests/docs/test_complete_examples.py index 48247dddb..412196ada 100644 --- a/tests/docs/test_complete_examples.py +++ b/tests/docs/test_complete_examples.py @@ -1,9 +1,7 @@ import csv import inspect -import typing import unittest.mock import uuid -from collections.abc import Generator, Iterator from dataclasses import dataclass from pathlib import Path @@ -28,23 +26,18 @@ class MarkdownFenceExtractor: See https://facelessuser.github.io/pymdown-extensions/extensions/superfences/ """ - def __init__(self, content: str) -> None: - self.fences: list[MarkdownFence] = [] - self.extension_configs: typing.Mapping[ - str, typing.Any - ] = self._configure_superfences() + def __init__(self, content): + self.fences = [] + self.extension_configs = self._configure_superfences() self._extract_fences(content) def _fence_null_format( - # Argument types are taken from mkdocs-code-validator, - # which has an MIT license, - # but they will be removed before merge. 
self, - src: str, - language: str, - css_class: str | None, - options: typing.Mapping[str, typing.Any], - md: markdown.Markdown, + src, + language, + css_class, + options, + md, **kwargs, ) -> str: """Extract the fences in the same way @@ -80,7 +73,7 @@ def _configure_superfences(self): ] return config["mdx_configs"] - def _extract_fences(self, content: str) -> None: + def _extract_fences(self, content): markdown.Markdown( extensions=["pymdownx.superfences"], extension_configs=self.extension_configs, @@ -101,22 +94,20 @@ class DatasetDefinitionExample: fence_number: int | None source: str - def relative_path(self) -> Path: + def relative_path(self): """Return the relative path of the dataset definition source file to the source code root.""" source_code_path = Path(__file__).parents[2] return self.path.relative_to(source_code_path) -def discover_paths(glob_string: str) -> Generator[Path, None, None]: +def discover_paths(glob_string): """Generate a list of matching files for a glob in the documentation source path.""" docs_path = Path(__file__).parents[2] / "docs" return docs_path.glob(glob_string) -def find_complete_ehrql_examples_in_markdown( - file: typing.TextIO, -) -> Iterator[DatasetDefinitionExample]: +def find_complete_ehrql_examples_in_markdown(file): """Yields extracted code blocks labelled as ```ehrql from a Markdown file. 
Incomplete ehrQL dataset definitions should be labelled as ```python, @@ -133,9 +124,7 @@ def find_complete_ehrql_examples_in_markdown( yield example -def generate_complete_ehrql_examples() -> ( - Generator[DatasetDefinitionExample, None, None] -): +def generate_complete_ehrql_examples(): """Yields all complete ehrQL DatasetDefinitionExamples from the Markdown documentation.""" markdown_paths = list(discover_paths("**/*.md")) assert len(markdown_paths) > 0, "No Markdown files found" @@ -158,7 +147,7 @@ def generate_complete_ehrql_examples() -> ( ) -def create_example_test_case_id(example: DatasetDefinitionExample) -> str: +def create_example_test_case_id(example): """Returns a test case ID for pytest from a specific DatasetDefinitionExample.""" test_id = f"{example.relative_path()}" if example.fence_number is not None: @@ -166,7 +155,7 @@ def create_example_test_case_id(example: DatasetDefinitionExample) -> str: return test_id -def validate_dataset_output(dataset_path: Path) -> None: +def validate_dataset_output(dataset_path): """Validates that an output dataset file is a CSV.""" with open(dataset_path) as f: csv_content = f.readlines() @@ -189,9 +178,7 @@ class DatasetDefinitionTestError(Exception): generate_complete_ehrql_examples(), ids=create_example_test_case_id, ) -def test_ehrql_generate_dataset_example( - tmp_path: Path, example: DatasetDefinitionExample -) -> None: +def test_ehrql_generate_dataset_example(tmp_path, example): tmp_filename_base = str(uuid.uuid4()) tmp_dataset_definition_path = tmp_path / (tmp_filename_base + ".py") From ec7f61e5aa07120a31247b0fba006e7523e7d9e5 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Tue, 7 Nov 2023 11:32:37 +0000 Subject: [PATCH 13/13] refactor: Retrieve MkDocs configuration once This change applies to the documentation example tests. It avoids running the configuration-loading code multiple times, once for each test, and removes the possibility of strange things happening if the configuration file changes between parametrized tests.
--- tests/docs/test_complete_examples.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tests/docs/test_complete_examples.py b/tests/docs/test_complete_examples.py index 412196ada..54b681200 100644 --- a/tests/docs/test_complete_examples.py +++ b/tests/docs/test_complete_examples.py @@ -1,3 +1,4 @@ +import copy import csv import inspect import unittest.mock @@ -12,6 +13,21 @@ import ehrql.main +def get_markdown_extension_configuration(): + """Returns a dictionary representing the mkdocs.yml Markdown extension + configuration. + + It should only be required to run this function once, + at the start of these tests.""" + config_path = Path(__file__).parents[2] / "mkdocs.yml" + config = mkdocs.config.load_config(config_file_path=str(config_path)) + assert "pymdownx.superfences" in config["markdown_extensions"] + return config["mdx_configs"] + + +MARKDOWN_EXTENSION_CONFIGURATION = get_markdown_extension_configuration() + + @dataclass class MarkdownFence: """Represents a Markdown fence.""" @@ -60,18 +76,18 @@ def _configure_superfences(self): """Retrieves the existing extensions settings from the mkdocs.yml configuration, replacing any custom SuperFences fences with a special test custom fence to extract all fences.""" - config_path = Path(__file__).parents[2] / "mkdocs.yml" - config = mkdocs.config.load_config(config_file_path=str(config_path)) - assert "pymdownx.superfences" in config["markdown_extensions"] - config["mdx_configs"]["pymdownx.superfences"]["custom_fences"] = [ + config = copy.deepcopy(MARKDOWN_EXTENSION_CONFIGURATION) + config["pymdownx.superfences"]["custom_fences"] = [ { # "name" specifies fences to extract. + # "*" indicates fences unhandled by other custom fences; + # as we have no other custom fences, "*" processes all fences. "name": "*", "class": "test", "format": self._fence_null_format, }, ] - return config["mdx_configs"] + return config def _extract_fences(self, content): markdown.Markdown(