diff --git a/interactive_templates/templates/v2/analysis/code_variables.py b/interactive_templates/templates/v2/analysis/code_variables.py new file mode 100644 index 00000000..3a3cc03a --- /dev/null +++ b/interactive_templates/templates/v2/analysis/code_variables.py @@ -0,0 +1,43 @@ +from cohortextractor import codelist, patients + + +def generate_code_variables( + code_list_1, codelist_1_type, code_list_2, codelist_2_type, start_date, end_date +): + def make_variable(code, codelist_type, start_date, end_date): + if codelist_type == "event": + return { + f"count_{code}": ( + patients.with_these_clinical_events( + codelist([code], system="snomed"), + between=[start_date, end_date], + returning="number_of_matches_in_period", + return_expectations={ + "incidence": 0.1, + "int": {"distribution": "normal", "mean": 3, "stddev": 1}, + }, + ) + ) + } + elif codelist_type == "medication": + return { + f"count_{code}": ( + patients.with_these_medications( + codelist([code], system="snomed"), + between=[start_date, end_date], + returning="number_of_matches_in_period", + return_expectations={ + "incidence": 0.1, + "int": {"distribution": "normal", "mean": 3, "stddev": 1}, + }, + ) + ) + } + + variables = {} + for code in code_list_1: + variables.update(make_variable(code, codelist_1_type, start_date, end_date)) + + for code in code_list_2: + variables.update(make_variable(code, codelist_2_type, start_date, end_date)) + return variables diff --git a/interactive_templates/templates/v2/analysis/event_variables.py b/interactive_templates/templates/v2/analysis/event_variables.py index 6511c00b..9db45d35 100644 --- a/interactive_templates/templates/v2/analysis/event_variables.py +++ b/interactive_templates/templates/v2/analysis/event_variables.py @@ -1,5 +1,4 @@ from cohortextractor import patients -from report_utils import generate_expectations_codes def clinical_event(codelist, date_range, event_name, ever=False): @@ -27,17 +26,6 @@ def clinical_event(codelist, date_range, 
event_name, ever=False): return_expectations={"incidence": 0.5}, ) ), - f"{event_name}_code": ( - patients.with_these_clinical_events( - codelist=codelist, - **date_kwargs, - returning="code", - return_expectations={ - "rate": "universal", - "category": {"ratios": generate_expectations_codes(codelist)}, - }, - ) - ), f"{event_name}_date": ( patients.with_these_clinical_events( codelist=codelist, @@ -82,17 +70,6 @@ def medication_event(codelist, date_range, event_name, ever=False): return_expectations={"incidence": 0.5}, ) ), - f"{event_name}_code": ( - patients.with_these_medications( - codelist=codelist, - **date_kwargs, - returning="code", - return_expectations={ - "rate": "universal", - "category": {"ratios": generate_expectations_codes(codelist)}, - }, - ) - ), f"{event_name}_date": ( patients.with_these_medications( codelist=codelist, diff --git a/interactive_templates/templates/v2/analysis/measures.py b/interactive_templates/templates/v2/analysis/measures.py index a2449974..c0d1b75e 100644 --- a/interactive_templates/templates/v2/analysis/measures.py +++ b/interactive_templates/templates/v2/analysis/measures.py @@ -164,7 +164,7 @@ def main(): args = parse_args() breakdowns = args.breakdowns - breakdowns.extend(["practice", "event_1_code", "event_2_code"]) + breakdowns.extend(["practice"]) measure_df = pd.DataFrame( columns=["date", "event_measure", "population", "group", "group_value"] diff --git a/interactive_templates/templates/v2/analysis/report_template.html b/interactive_templates/templates/v2/analysis/report_template.html index 4442aba9..d69e6d0c 100644 --- a/interactive_templates/templates/v2/analysis/report_template.html +++ b/interactive_templates/templates/v2/analysis/report_template.html @@ -52,7 +52,7 @@

Measure description

This measure is calculated for {{ population }} using the OpenSAFELY-TPP - dataset, which covers ~40% of England. The monthly rate of this measure is shown below. + dataset, which covers ~40% of England. The monthly rate of this measure per 1000 patients is shown below. {% if breakdowns|length >0 %} A breakdown of this measure by {% for b in breakdowns %} @@ -102,7 +102,7 @@

Analysis limitations

  • If the chosen codelists represent vaccinations, referrals, maternity records, - or activities not directly occurring in primary care , note that some events + or activities not directly occurring in primary care, note that some events may be recorded in ways that are not fully captured by this analysis, and/or are subject to limited recording in primary care.
  • @@ -119,27 +119,28 @@

    Analysis limitations

    Measure summary

    The table below shows the total number of times the measure of interest - occurred and the number of unique patients experiencing the event between - {{ start_date }} and {{ end_date }}. The number of events in the latest - complete month and latest week is also shown. An event is defined as a + occurred and the number of unique patients with the measure between + {{ start_date }} and {{ end_date }}. The number of patients with the measure of interest + in the latest complete month and latest week is also shown. The measure is defined as a patient having a code recorded from both codelist 1 and codelist 2 that - satisfies the specified measure logic. A patient can have multiple events, - but a maximum of 1 event per patient is counted each month. + satisfies the specified measure logic. The measure can be satisfied multiple times by + the same patient, but here a maximum of 1 instance per patient is counted each month.

    - - - - - + + + + + @@ -161,12 +162,12 @@

    Population level rate

    patients for the measure described above.

    - {{ display_image(population_plot.path, population_plot.data) }}
    Figure 1. The monthly rate per 1000 registered patients in the selected population for the specified measure between {{ start_date }} and {{ end_date }}.
    + {{ display_image(population_plot.path, population_plot.data) }}
    @@ -180,7 +181,8 @@

    Practice level variation

    For each month, the practices are then ranked by their rate. From this, the median (5th decile) practice level rate is calculated as well as the rate for the 10th percentile etc. The wider the gap between the deciles, the more practice level - variability there is. You can read more about how we use deciles here. + variability there is. You can read more about how we use deciles + here.

    @@ -193,12 +195,12 @@

    Practice level variation

    - {{ display_image(decile.path, decile.data) }}
    Figure 2. Practice level decile chart showing practice level variation in the rate per 1000 registered patients who satisfy the specified measure between {{ start_date }} and {{ end_date }}.
    + {{ display_image(decile.path, decile.data) }}
    @@ -208,9 +210,8 @@

    Most common codes

    The tables below show the most common codes recorded within both the {{ codelist_1_name }} and the {{ codelist_2_name }} codelists. For each code within each codelist, the number of times it was recorded in the period between {{ start_date }} and {{ end_date }} is calculated. For each code, the percentage - makeup of the total number of events is then calculated. Note that the code recorded is the latest code - recorded for a given patient in the given period. Where a patient has had multiple codes recorded from - the specified codelists, only the latest code will contribute to the counts in this table. + makeup is then calculated. Note that this includes instances of these codes + that may not contribute to the measure of interest.

    Table 1. Summary table showing the total number of patients who meet the measure criteria at some point during the study period, the total - number of event throughout the study period (up to one event, per patient, per month), - and the number of events in the latest complete month and complete week. + number of instances the measure is recorded throughout the study period (up to one + instance, per patient, per month), and the number of times the measure was recorded + in the latest complete month and complete week.
    Total eventsTotal patientsTotal patients with eventsEvents in latest month ({{ summary_table_data.latest_month }})Events in latest week ({{ summary_table_data.latest_week }})Total instances of measureTotal patients in study populationTotal patients with measurePatients with measure in latest month ({{ summary_table_data.latest_month }})Patients with measure in latest week ({{ summary_table_data.latest_week }})
    @@ -292,13 +293,13 @@

    Breakdown by {{ b.title }}

    {% if b.figure.exists %}
    - {{ display_image(b.figure.path, b.figure.data) }}
    Figure {{ i.value }}. The rate per 1000 patients in the selected population for the measure of interest, broken down by {{ b.title }}
    + {{ display_image(b.figure.path, b.figure.data) }} {% set i.value = i.value +1 %}
    {% else %} diff --git a/interactive_templates/templates/v2/analysis/report_utils.py b/interactive_templates/templates/v2/analysis/report_utils.py index 519c62b2..6a1eed42 100644 --- a/interactive_templates/templates/v2/analysis/report_utils.py +++ b/interactive_templates/templates/v2/analysis/report_utils.py @@ -45,16 +45,6 @@ def relabel_sex(df): return df -def generate_expectations_codes(codelist, incidence=0.5): - if len(codelist) >= 10: - expectations = {str(x): (1 - incidence) / 10 for x in codelist[0:10]} - else: - expectations = {str(x): (1 - incidence) / len(codelist) for x in codelist} - - expectations[None] = incidence - return expectations - - def save_to_json(d, filename: str): """Saves dictionary to json file""" with open(filename, "w") as f: @@ -106,7 +96,7 @@ def plot_measures( if category: df[category] = df[category].fillna("Missing") - _, ax = plt.subplots(figsize=(15, 8)) + _, ax = plt.subplots(figsize=(15, 12)) palette = sns.color_palette("tab10") if category: @@ -121,6 +111,7 @@ def plot_measures( palette=palette, ax=ax, label=unique_category, + linewidth=1.5, ) else: @@ -134,10 +125,11 @@ def plot_measures( palette=palette, ax=ax, label=unique_category, + linewidth=1.5, ) else: - ax.plot(df["date"], df[column_to_plot]) + ax.plot(df["date"], df[column_to_plot], linewidth=1.5) ax.set( ylabel=y_label, @@ -146,7 +138,7 @@ def plot_measures( 0, 1000 if df[column_to_plot].isnull().values.all() - else df[column_to_plot].max(), + else df[column_to_plot].max() * 1.1, ), ) @@ -158,9 +150,10 @@ def plot_measures( if category: ax.legend( - bbox_to_anchor=(1.04, 1), - loc="upper left", + bbox_to_anchor=(0.5, 1.2), + loc="upper center", fontsize=20, + ncol=len(df[category]) if len(df[category]) < 4 else 4, ) ax.margins(x=0) @@ -263,7 +256,7 @@ def deciles_chart(df, filename, period_column=None, column=None, title="", ylabe ylabel: the label of the y-axis of the chart """ - fig, ax = plt.subplots(figsize=(15, 8)) + fig, ax = plt.subplots(figsize=(15, 12)) 
linestyles = { "decile": {"line": "b--", "linewidth": 1, "label": "Decile"}, @@ -325,12 +318,13 @@ def deciles_chart(df, filename, period_column=None, column=None, title="", ylabe plt.xticks(sorted(df[period_column].unique()), rotation=90) ax.xaxis.set_major_locator(mdates.MonthLocator(interval=2)) ax.legend( - bbox_to_anchor=(1.1, 0.8), - loc="center left", - ncol=1, + bbox_to_anchor=(0.5, 1.2), + loc="upper center", + ncol=3, fontsize=20, borderaxespad=0.0, ) + plt.tight_layout() plt.savefig(filename) plt.clf() diff --git a/interactive_templates/templates/v2/analysis/study_definition.py b/interactive_templates/templates/v2/analysis/study_definition.py index 47473621..879d4d31 100644 --- a/interactive_templates/templates/v2/analysis/study_definition.py +++ b/interactive_templates/templates/v2/analysis/study_definition.py @@ -110,6 +110,7 @@ ), ) + measures = [ Measure( id="event_rate", @@ -117,18 +118,6 @@ denominator="population", group_by=["practice"], ), - Measure( - id="event_code_1_rate", - numerator="event_1", - denominator="population", - group_by=["event_1_code"], - ), - Measure( - id="event_code_2_rate", - numerator="event_2", - denominator="population", - group_by=["event_2_code"], - ), ] if breakdowns: diff --git a/interactive_templates/templates/v2/analysis/study_definition_ethnicity.py b/interactive_templates/templates/v2/analysis/study_definition_end.py similarity index 78% rename from interactive_templates/templates/v2/analysis/study_definition_ethnicity.py rename to interactive_templates/templates/v2/analysis/study_definition_end.py index a3eb9b4a..0ff4b3c5 100644 --- a/interactive_templates/templates/v2/analysis/study_definition_ethnicity.py +++ b/interactive_templates/templates/v2/analysis/study_definition_end.py @@ -1,3 +1,4 @@ +from code_variables import generate_code_variables from cohortextractor import StudyDefinition, codelist_from_csv, patients from config import CONFIG @@ -9,9 +10,22 @@ category_column="Grouping_6", ) - +codelist_1_path = 
CONFIG["codelist_1"]["path"] +codelist_1_type = CONFIG["codelist_1"]["type"] +codelist_2_path = CONFIG["codelist_2"]["path"] +codelist_2_type = CONFIG["codelist_2"]["type"] +start_date = CONFIG["start_date"] end_date = CONFIG["end_date"] +codelist_1 = codelist_from_csv(codelist_1_path, system="snomed", column="code") + +codelist_2 = codelist_from_csv( + codelist_2_path, + system="snomed", + column="code", +) + + study = StudyDefinition( index_date=end_date, default_expectations={ @@ -64,4 +78,7 @@ }, ), ), + **generate_code_variables( + codelist_1, codelist_1_type, codelist_2, codelist_2_type, start_date, end_date + ), ) diff --git a/interactive_templates/templates/v2/analysis/top_5.py b/interactive_templates/templates/v2/analysis/top_5.py index 80e580ce..7a403644 100644 --- a/interactive_templates/templates/v2/analysis/top_5.py +++ b/interactive_templates/templates/v2/analysis/top_5.py @@ -66,64 +66,88 @@ def round_values(x, base=5): return rounded -def create_top_5_code_table( - df, code_df, code_column, term_column, low_count_threshold, rounding_base, nrows=5 -): - """Creates a table of the top 5 codes recorded with the number of events and % makeup of each code. - Args: - df: A measure table. - code_df: A codelist table. - code_column: The name of the code column in the codelist table. - term_column: The name of the term column in the codelist table. - measure: The measure ID. - low_count_threshold: Value to use as threshold for disclosure control. - rounding_base: Base to round to. - nrows: The number of rows to display. - Returns: - A table of the top `nrows` codes. 
- """ - - event_counts = group_low_values(df, "num", code_column, low_count_threshold) - - # round - +def apply_rounding(event_counts, rounding_base): event_counts["num"] = event_counts["num"].apply( lambda x: round_values(x, rounding_base) ) + return event_counts + - # calculate % makeup of each code +def calculate_proportion(event_counts): total_events = event_counts["num"].sum() - event_counts["Proportion of codes (%)"] = round( - (event_counts["num"] / total_events) * 100, 2 - ) - # Gets the human-friendly description of the code for the given row - # e.g. "Systolic blood pressure". + # ensure total events is not 0 + if total_events == 0: + event_counts["Proportion of codes (%)"] = np.nan + + else: + event_counts["Proportion of codes (%)"] = round( + (event_counts["num"] / total_events) * 100, 2 + ) + return event_counts + + +def add_description(event_counts, code_df, code_column, term_column): + if code_df.empty: + event_counts["Description"] = "-" + return event_counts + code_df = code_df.set_index(code_column).rename( columns={term_column: "Description"} ) event_counts = event_counts.set_index(code_column).join(code_df).reset_index() - - # set description of "Other column" to something readable event_counts.loc[event_counts[code_column] == "Other", "Description"] = "-" - # Rename the code column to something consistent + # For codes that did not find a match in code_df set a default value + event_counts["Description"].fillna("-", inplace=True) + + return event_counts + + +def handle_edge_case_percentages(event_counts): + total_events = event_counts["num"].sum() + + zero_condition = (event_counts["Proportion of codes (%)"] == 0) & ( + event_counts["num"] > 0 + ) + hundred_condition = (event_counts["Proportion of codes (%)"] == 100) & ( + event_counts["num"] < total_events + ) + + event_counts.loc[zero_condition, "Proportion of codes (%)"] = "<0.001" + event_counts.loc[hundred_condition, "Proportion of codes (%)"] = ">99.99" + + return event_counts + + +def 
create_top_5_code_table( + df, code_df, code_column, term_column, low_count_threshold, rounding_base, nrows=5 +): + """Creates a table of the top 5 codes recorded with the number of events and % makeup of each code.""" + + event_counts = group_low_values(df, "num", code_column, low_count_threshold) + event_counts = event_counts.copy() + + event_counts = apply_rounding(event_counts, rounding_base) + event_counts = calculate_proportion(event_counts) + event_counts = add_description(event_counts, code_df, code_column, term_column) event_counts.rename(columns={code_column: "Code"}, inplace=True) + event_counts = handle_edge_case_percentages(event_counts) - # sort by proportion of codes - event_counts = event_counts.sort_values( + event_counts_sorted = event_counts.sort_values( ascending=False, by="Proportion of codes (%)" ) - event_counts_with_counts = event_counts.copy() - - # drop events column - event_counts = event_counts.loc[ + event_counts_with_counts = event_counts_sorted.copy() + event_counts = event_counts_sorted.loc[ :, ["Code", "Description", "Proportion of codes (%)"] ] - # return top n rows - return event_counts.head(5), event_counts_with_counts + event_counts_with_counts = event_counts_with_counts.loc[ + :, ["Code", "num", "Description", "Proportion of codes (%)"] + ] + + return event_counts.head(nrows), event_counts_with_counts def parse_args(): @@ -147,56 +171,43 @@ def main(): args = parse_args() codelist_1_path = args.codelist_1_path codelist_2_path = args.codelist_2_path - measure_df = pd.read_csv(args.output_dir / "measure_all.csv") - - code_df = measure_df.loc[measure_df["group"] == "event_1_code", :] - codelist = pd.read_csv(codelist_1_path, dtype={"code": str}) - - events_per_code = ( - code_df.groupby("group_value")[["event_measure"]].sum().reset_index() - ) - events_per_code.columns = ["code", "num"] - - top_5_code_table, top_5_code_table_with_counts = create_top_5_code_table( - df=events_per_code, - code_df=codelist, - code_column="code", - 
term_column="term", - low_count_threshold=7, - rounding_base=7, - ) - top_5_code_table.to_csv(args.output_dir / "top_5_code_table_1.csv", index=False) Path(args.output_dir / "for_checking").mkdir(parents=True, exist_ok=True) - top_5_code_table_with_counts.to_csv( - args.output_dir / "for_checking/top_5_code_table_with_counts_1.csv", - index=False, - ) - - code_df_2 = measure_df.loc[measure_df["group"] == "event_2_code", :] - - # TODO: support vpids? - codelist_2 = pd.read_csv(codelist_2_path, dtype={"code": str}) - events_per_code = ( - code_df_2.groupby("group_value")[["event_measure"]].sum().reset_index() - ) - events_per_code.columns = ["code", "num"] - - top_5_code_table, top_5_code_table_with_counts = create_top_5_code_table( - df=events_per_code, - code_df=codelist_2, - code_column="code", - term_column="term", - low_count_threshold=7, - rounding_base=7, - ) + input_df = pd.read_feather(args.output_dir / "input_end.feather") - top_5_code_table.to_csv(args.output_dir / "top_5_code_table_2.csv", index=False) - top_5_code_table_with_counts.to_csv( - args.output_dir / "for_checking" / "top_5_code_table_with_counts_2.csv", - index=False, - ) + use_cols = [col for col in input_df.columns if col.startswith("count")] + [ + "patient_id" + ] + input_df = input_df.loc[:, use_cols] + code_counts = input_df.sum().reset_index() + code_counts.columns = ["code", "num"] + + code_counts["code"] = code_counts["code"].str.replace("count_", "") + + for codelist_path in [codelist_1_path, codelist_2_path]: + codelist = pd.read_csv(codelist_path, dtype={"code": str}) + codes = codelist["code"].to_list() + code_counts_subset = code_counts.loc[code_counts["code"].isin(codes), :] + + top_5_code_table, top_5_code_table_with_counts = create_top_5_code_table( + df=code_counts_subset, + code_df=codelist, + code_column="code", + term_column="term", + low_count_threshold=7, + rounding_base=7, + ) + codelist_number = codelist_path.split("/")[-1].split(".")[0].split("_")[-1] + 
top_5_code_table.to_csv( + args.output_dir / f"top_5_code_table_{codelist_number}.csv", index=False + ) + + top_5_code_table_with_counts.to_csv( + args.output_dir + / f"for_checking/top_5_code_table_with_counts_{codelist_number}.csv", + index=False, + ) if __name__ == "__main__": diff --git a/interactive_templates/templates/v2/project.yaml.tmpl b/interactive_templates/templates/v2/project.yaml.tmpl index b334b947..8dde9f2b 100644 --- a/interactive_templates/templates/v2/project.yaml.tmpl +++ b/interactive_templates/templates/v2/project.yaml.tmpl @@ -5,13 +5,13 @@ expectations: actions: - generate_study_population_ethnicity_{{ id }}: + generate_study_population_end_{{ id }}: run: cohortextractor:latest generate_cohort - --study-definition study_definition_ethnicity + --study-definition study_definition_end --output-dir output/{{ id }} --output-format=feather outputs: highly_sensitive: - cohort: output/{{ id }}/input_ethnicity.feather + cohort: output/{{ id }}/input_end.feather generate_study_population_weekly_{{ id }}: run: cohortextractor:latest generate_cohort @@ -40,9 +40,9 @@ actions: run: > cohort-joiner:v0.0.38 --lhs output/{{ id }}/input_20*.feather - --rhs output/{{ id }}/input_ethnicity.feather + --rhs output/{{ id }}/input_end.feather --output-dir output/{{ id }}/joined - needs: [generate_study_population_{{ id }}, generate_study_population_ethnicity_{{ id }}] + needs: [generate_study_population_{{ id }}, generate_study_population_end_{{ id }}] outputs: highly_sensitive: cohort: output/{{ id }}/joined/input_20*.feather @@ -68,7 +68,7 @@ actions: --codelist-1-path="{{ codelist_1.path }}" --codelist-2-path="{{ codelist_2.path }}" --output-dir="output/{{ id }}" - needs: [generate_measures_{{ id }}] + needs: [generate_study_population_end_{{ id }}] outputs: moderately_sensitive: table_1: output/{{ id }}/top_5_code_table_1.csv diff --git a/interactive_templates/templates/v2/tests/test_top_5.py b/interactive_templates/templates/v2/tests/test_top_5.py new file 
mode 100644 index 00000000..a0e63f07 --- /dev/null +++ b/interactive_templates/templates/v2/tests/test_top_5.py @@ -0,0 +1,198 @@ +import numpy as np +import pandas as pd +from analysis.top_5 import ( + add_description, + apply_rounding, + calculate_proportion, + create_top_5_code_table, + group_low_values, + handle_edge_case_percentages, + round_values, +) +from hypothesis import given +from hypothesis import strategies as st +from hypothesis.extra.pandas import column, data_frames + + +codes_strategy = st.text(min_size=1, max_size=1).map(str) +count_strategy = st.integers(min_value=0, max_value=100) + +df_strategy = data_frames( + [ + column("code", elements=codes_strategy, unique=True), + column("num", elements=count_strategy), + ] +) + +description_strategy = st.text(min_size=1, max_size=20).map(str) + +code_df_strategy = data_frames( + [ + column("code", elements=codes_strategy, unique=True), + column("term", elements=description_strategy), + ] +) + + +class TestGroupLowValues: + @given(df=df_strategy) + def test_values_above_threshold(self, df): + result = group_low_values(df, "num", "code", 5) + assert ( + result["num"] > 5 + ).all(), "Values below the threshold were not redacted." + + @given(df=df_strategy) + def test_redacted_rows_grouped_into_other(self, df): + result = group_low_values(df, "num", "code", 5) + if (result["code"] == "Other").any(): + assert ( + result.loc[result["code"] == "Other", "num"].sum() >= 5 + ), "'Other' row sum is below threshold." + + @given(df=df_strategy) + def test_all_zero_values_are_suppressed(self, df): + if df["num"].sum() == 0: + result = group_low_values(df, "num", "code", 5) + assert result.empty, "Zero values were not suppressed." + + @given(df=df_strategy) + def test_no_redaction_when_all_values_above_threshold(self, df): + if df["num"].all() > 5: + result = group_low_values(df, "num", "code", 5) + assert result.equals( + df + ), "Redaction happened when all values were above the threshold." 
+ + +class TestRoundValues: + @given(x=st.floats(allow_nan=True, allow_infinity=False), base=st.integers(1, 10)) + def test_rounding_floats(self, x, base): + result = round_values(x, base) + + if np.isnan(x): + assert np.isnan(result), f"Expected NaN but got {result} for input {x}" + else: + expected = int(base * round(x / base)) + assert ( + result == expected + ), f"Expected {expected} but got {result} for input {x}" + + @given(x=st.integers(min_value=0, max_value=100_000_000), base=st.integers(1, 10)) + def test_rounding_integers(self, x, base): + result = round_values(x, base) + expected = int(base * round(x / base)) + assert result == expected, f"Expected {expected} but got {result} for input {x}" + + @given(x=st.text()) + def test_non_numeric_input(self, x): + result = round_values(x) + assert result == x, f"Expected {x} but got {result} for non-numeric input" + + +@given(df=df_strategy, rounding_base=st.integers(1, 10)) +def test_apply_rounding(df, rounding_base): + result_df = apply_rounding(df.copy(), rounding_base) + + # All numbers should be rounded to the nearest multiple of rounding_base + for num in result_df["num"]: + assert num % rounding_base == 0, f"{num} not rounded to nearest {rounding_base}" + + +@given(df=df_strategy) +def test_calculate_proportion(df): + result_df = calculate_proportion(df.copy()) + + total = result_df["num"].sum() + + if total == 0: + assert all( + pd.isna(result_df["Proportion of codes (%)"]) + ), "Proportion should be NaN when total is 0." 
+ else: + for _, row in result_df.iterrows(): + expected_proportion = round((row["num"] / total) * 100, 2) + assert ( + row["Proportion of codes (%)"] == expected_proportion + ), f"Expected {expected_proportion} but got {row['Proportion of codes (%)']} for count {row['num']}" + + +@given(event_counts=df_strategy, code_df=code_df_strategy) +def test_add_description(event_counts, code_df): + result = add_description(event_counts, code_df, "code", "term") + + # Ensure that the Description column exists + assert "Description" in result.columns + + # Ensure that the 'Description' column is filled correctly + for _, row in result.iterrows(): + if row["code"] == "Other": + assert row["Description"] == "-" + elif row["code"] in code_df["code"].values: + assert ( + row["Description"] + == code_df[code_df["code"] == row["code"]]["term"].iloc[0] + ) + else: + assert row["Description"] == "-" + + # Ensure that no rows were lost + assert len(result) == len(event_counts) + + +@given(df=df_strategy) +def test_handle_edge_case_percentages(df): + df_with_proportions = calculate_proportion(df.copy()) + result_df = handle_edge_case_percentages(df_with_proportions.copy()) + + for _, row in result_df.iterrows(): + if (row["Proportion of codes (%)"] == 0) and (row["num"] > 0): + assert ( + row["Proportion of codes (%)"] == "<0.001" + ), f"Expected '<0.001' but got {row['Proportion of codes (%)']} for num {row['num']}" + + if (row["Proportion of codes (%)"] == 100) and (row["num"] < df["num"].sum()): + assert ( + row["Proportion of codes (%)"] == ">99.99" + ), f"Expected '>99.99' but got {row['Proportion of codes (%)']} for num {row['num']}" + + +@given( + df=df_strategy, + code_df=code_df_strategy, + code_column=st.just("code"), + term_column=st.just("term"), + low_count_threshold=st.integers(min_value=1, max_value=10), + rounding_base=st.integers(min_value=1, max_value=10), + nrows=st.integers(min_value=1, max_value=10), +) +def test_create_top_5_code_table( + df, code_df, code_column, 
term_column, low_count_threshold, rounding_base, nrows +): + top_5, top_5_with_counts = create_top_5_code_table( + df, code_df, code_column, term_column, low_count_threshold, rounding_base, nrows + ) + + assert len(top_5) <= nrows + + # Make sure that the order is correct based on proportion + if not top_5.empty: + assert list(top_5["Proportion of codes (%)"]) == sorted( + top_5["Proportion of codes (%)"], reverse=True + ) + + # Ensure the 'complete_counts' contains all rows or the max rows whichever is smaller + assert len(top_5_with_counts) <= len(df) + + # The two results should share the same sorted order based on "Proportion of codes (%)" + if not top_5.empty and not top_5_with_counts.empty: + assert list(top_5["Code"]) == list(top_5_with_counts["Code"].head(len(top_5))) + + # Ensure the have expected columns + assert list(top_5.columns) == ["Code", "Description", "Proportion of codes (%)"] + assert list(top_5_with_counts.columns) == [ + "Code", + "num", + "Description", + "Proportion of codes (%)", + ]
    Table 2. The top 5 most common codes recorded for codelist 1 ({{ codelist_1_name }}).