From c6670a8506d224e601a15896a9d431ddd0609fc2 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Mon, 4 Nov 2024 09:44:32 -0500 Subject: [PATCH] Refactor of get chart data endpoint --- pyproject.toml | 1 + .../get_chart_data/get_chart_data.py | 56 ++--- src/dashboard/get_chart_data/requirements.txt | 1 + .../templates/filter_numeric.sql.jinja | 48 ++++ .../templates/get_chart_data.sql.jinja | 65 ++++++ .../get_chart_data/templates/syntax.sql.jinja | 24 ++ tests/dashboard/test_get_chart_data.py | 219 ++++++++++++++++-- 7 files changed, 363 insertions(+), 51 deletions(-) create mode 100644 src/dashboard/get_chart_data/requirements.txt create mode 100644 src/dashboard/get_chart_data/templates/filter_numeric.sql.jinja create mode 100644 src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja create mode 100644 src/dashboard/get_chart_data/templates/syntax.sql.jinja diff --git a/pyproject.toml b/pyproject.toml index db53087..9c6600d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies= [ "arrow >=1.2.3", "awswrangler >=3.5, <4", "boto3", + "Jinja2 >=3.1.4, <4", "pandas >=2, <3", "requests", # scripts only "rich", diff --git a/src/dashboard/get_chart_data/get_chart_data.py b/src/dashboard/get_chart_data/get_chart_data.py index e9baaa4..70d6e2e 100644 --- a/src/dashboard/get_chart_data/get_chart_data.py +++ b/src/dashboard/get_chart_data/get_chart_data.py @@ -1,13 +1,13 @@ """Lambda for performing joins of site count data. This is intended to provide an implementation of the logic described in docs/api.md """ - import logging import os +import pathlib import awswrangler import boto3 -import filter_config +import jinja2 import pandas from shared import decorators, enums, errors, functions @@ -45,43 +45,35 @@ def _build_query(query_params: dict, filters: list, path_params: dict) -> str: """Creates a query from the dashboard API spec""" dp_id = path_params["data_package_id"] columns = _get_table_cols(dp_id) - filter_str = filter_config.get_filter_string(filters) - if filter_str != "": - filter_str = f"AND {filter_str} " + filter_configs = [] + for config in filters: + config = config.split(':') + # lightly overkill, but this makes accessing params via jinja filters easier + filter_configs.append( + {'data' : config[0], 'filter_type': config[1], 'bound': config[2] or None } + ) count_col = next(c for c in columns if c.startswith("cnt")) columns.remove(count_col) - select_str = f"{query_params['column']}, sum({count_col}) as {count_col}" - strat_str = "" - group_str = f"{query_params['column']}" # the 'if in' check is meant to handle the case where the selected column is also # present in the filter logic and has already been removed if query_params["column"] in columns: columns.remove(query_params["column"]) - if "stratifier" in query_params.keys(): - select_str = f"{query_params['stratifier']}, {select_str}" - group_str = f"{query_params['stratifier']}, {group_str}" - columns.remove(query_params["stratifier"]) - strat_str = f'AND {query_params["stratifier"]} IS NOT NULL ' - if len(columns) > 0: - coalesce_str = ( - f"WHERE COALESCE (cast({' AS VARCHAR), cast('.join(columns)} AS VARCHAR)) " - "IS NOT NULL AND " + with open(pathlib.Path(__file__).parent / "templates/get_chart_data.sql.jinja") as file: + template = file.read() + loader = jinja2.FileSystemLoader(pathlib.Path(__file__).parent / "templates/") + env = jinja2.Environment(loader=loader).from_string(template) # noqa: S701 + query_str = env.render( + data_column=query_params["column"], + stratifier_column= query_params.get("stratifier", None), + count_columns=[count_col], + schema = os.environ.get('GLUE_DB_NAME'), + data_package_id = path_params["data_package_id"], + coalesce_columns = columns, + filter_configs = filter_configs + + ) - else: - coalesce_str = "WHERE " - query_str = ( - f"SELECT {select_str} " # nosec # noqa: S608 - f"FROM \"{os.environ.get('GLUE_DB_NAME')}\".\"{dp_id}\" " - f"{coalesce_str}" - f"{query_params['column']} IS NOT NULL " - f"{filter_str}" - f"{strat_str}" - f"GROUP BY {group_str} " - ) - if "stratifier" in query_params.keys(): - query_str += f"ORDER BY {query_params['stratifier']}, {query_params['column']}" - else: - query_str += f"ORDER BY {query_params['column']}" + print(query_str) return query_str, count_col diff --git a/src/dashboard/get_chart_data/requirements.txt b/src/dashboard/get_chart_data/requirements.txt new file mode 100644 index 0000000..407536a --- /dev/null +++ b/src/dashboard/get_chart_data/requirements.txt @@ -0,0 +1 @@ +Jinja2 >= 3.1.4, <4 \ No newline at end of file diff --git a/src/dashboard/get_chart_data/templates/filter_numeric.sql.jinja b/src/dashboard/get_chart_data/templates/filter_numeric.sql.jinja new file mode 100644 index 0000000..e5aab54 --- /dev/null +++ b/src/dashboard/get_chart_data/templates/filter_numeric.sql.jinja @@ -0,0 +1,48 @@ +{%- import 'syntax.sql.jinja' as syntax -%} + +{%- set ns =namespace(matches=[]) -%} +{%- set filters=['eq', 'ne', 'gt', 'gte', 'lt', 'lte'] -%} + +{%- macro render_filter( data, filter_type, bound) -%} +{%- if filter_type == 'eq'-%} +{{ data }} = {{ bound }} +{%- elif filter_type == 'ne'-%} +{{ data }} != {{ bound }} +{%- elif filter_type == 'gt'-%} +{{ data }} > {{ bound }} +{%- elif filter_type == 'gte'-%} +{{ data }} >= {{ bound }} +{%- elif filter_type == 'lt'-%} +{{ data }} < {{ bound }} +{%- elif filter_type == 'lte'-%} +{{ data }} <= {{ bound }} +{%- else -%} + not found {{ filter_type }} +{%- endif -%} +{%- endmacro -%} + +{%- macro is_filter_in_list(filter_configs) -%} + {%- for f in filter_configs -%} + {%- if f.filter_type in filters -%} + {%- set ns.matches = ns.matches + [f] -%} + {%- endif -%} + {% endfor -%} +{%- endmacro -%} + +{%- macro get_filters(filter_configs) -%} + {{ is_filter_in_list(filter_configs) }} + {%- for config in ns.matches %} + {{ syntax.and_delineate(loop) }}{{ render_filter(config['data'],config['filter_type'],config['bound']) }} + {%- endfor -%} + {%- set ns.matches = [] -%} +{%- endmacro -%} + +{%- macro has_filter(filter_configs) %} + {{- is_filter_in_list(filter_configs) -}} + {%- if ns.matches|count >0 -%} + True + {%- else -%} + False + {%- endif -%} + {%- set ns.matches = [] -%} +{%- endmacro -%} \ No newline at end of file diff --git a/src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja b/src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja new file mode 100644 index 0000000..0008f7b --- /dev/null +++ b/src/dashboard/get_chart_data/templates/get_chart_data.sql.jinja @@ -0,0 +1,65 @@ +{%- import 'syntax.sql.jinja' as syntax -%} +{%- import 'filter_numeric.sql.jinja' as numeric -%} +{%-macro select_data(cumulus__none) -%} + SELECT + {%- if stratifier_column %} + "{{ stratifier_column }}", + {%- endif %} + "{{ data_column }}" + {%- if count_columns %},{%- endif %} + {%- for column in count_columns %} + SUM("{{ column }}") AS {{ column }}{{ syntax.comma_delineate(loop) }} + {%- endfor%} + FROM "{{ schema }}"."{{ data_package_id }}" + WHERE + {%- if coalesce_columns %} COALESCE ( + {%- for column in coalesce_columns %} + cast("{{ column }}" AS VARCHAR){{ syntax.comma_delineate(loop) }} + {%- endfor%} + ) IS NOT NULL + AND{%- endif%} "{{ data_column }}" IS NOT NULL + {%- if not cumulus__none %} + AND {{ data_column }} !='cumulus__none' + {%- if filter_str %} + AND {{ filter_str }} + {%- endif %} + {%- else %} + AND {{ data_column }} ='cumulus__none' + {%- endif %} + {%- if stratifier_column %} + AND "{{ stratifier_column }}" IS NOT NULL + {%- endif %} + GROUP BY + {%- if stratifier_column %} + "{{ stratifier_column }}", "{{ data_column }}" + {%- else %} + "{{ data_column }}" + {%- endif %} + ORDER BY + {%- if stratifier_column %} + "{{ stratifier_column }}", "{{ data_column }}" + {%- else %} + "{{ data_column }}" + {%- endif %} + {%- endmacro -%} +WITH non_null AS ( + {{ select_data(False) }} +), +has_null AS ( + {{ select_data(True) }} +) +SELECT * FROM non_null +{%- if numeric.has_filter(filter_configs) == 'True' %} +WHERE {{ numeric.get_filters(filter_configs) }} +{%- endif %} +UNION +SELECT * from has_null +{%- if numeric.has_filter(filter_configs) == 'True' %} +WHERE {{ numeric.get_filters(filter_configs) }} +{%- endif %} +ORDER BY + {%- if stratifier_column %} + "{{ stratifier_column }}", "{{ data_column }}" + {%- else %} + "{{ data_column }}" + {%- endif %} \ No newline at end of file diff --git a/src/dashboard/get_chart_data/templates/syntax.sql.jinja b/src/dashboard/get_chart_data/templates/syntax.sql.jinja new file mode 100644 index 0000000..a553424 --- /dev/null +++ b/src/dashboard/get_chart_data/templates/syntax.sql.jinja @@ -0,0 +1,24 @@ +{%- macro comma_delineate(loop) -%} +{%- if not loop.last -%} +, +{%- endif -%} +{%- endmacro -%} + + +{# Note that the following two delineations are meant to be at the front of the string +in a loop - this is to enable formatting in a WHERE statement like this: +--- +WHERE + b.bar = a.foo + AND b.baz != a.foo +--- +This is slightly easier to work with when debugging queries (and also +conforms better to the mozilla style guide) +#} +{%- macro and_delineate(loop) -%} +{%- if not loop.first -%}AND {% endif -%} +{%- endmacro -%} + +{%- macro or_delineate(loop) -%} +{%- if not loop.first -%}OR {% endif -%} +{%- endmacro -%} \ No newline at end of file diff --git a/tests/dashboard/test_get_chart_data.py b/tests/dashboard/test_get_chart_data.py index 62a1ae7..4e68f46 100644 --- a/tests/dashboard/test_get_chart_data.py +++ b/tests/dashboard/test_get_chart_data.py @@ -7,9 +7,6 @@ from src.dashboard.get_chart_data import get_chart_data from tests.mock_utils import ( - EXISTING_DATA_P, - EXISTING_STUDY, - EXISTING_VERSION, MOCK_ENV, TEST_GLUE_DB, ) @@ -35,37 +32,219 @@ def mock_data_frame(filter_param): {"column": "gender"}, [], {"data_package_id": "test_study"}, - f'SELECT gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE COALESCE (cast(race AS VARCHAR)) IS NOT NULL AND gender IS NOT NULL " - "GROUP BY gender ORDER BY gender", + f"""WITH non_null AS ( + SELECT + "gender", + SUM("cnt") AS cnt + FROM "{TEST_GLUE_DB}"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender !='cumulus__none' + GROUP BY + "gender" + ORDER BY + "gender" +), +has_null AS ( + SELECT + "gender", + SUM("cnt") AS cnt + FROM "cumulus-aggregator-test-db"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender ='cumulus__none' + GROUP BY + "gender" + ORDER BY + "gender" +) +SELECT * FROM non_null +UNION +SELECT * from has_null +ORDER BY + "gender\"""", ), ( {"column": "gender", "stratifier": "race"}, [], {"data_package_id": "test_study"}, - f'SELECT race, gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE gender IS NOT NULL " - "AND race IS NOT NULL " - "GROUP BY race, gender ORDER BY race, gender", + f"""WITH non_null AS ( + SELECT + "race", + "gender", + SUM("cnt") AS cnt + FROM "cumulus-aggregator-test-db"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender !='cumulus__none' + AND "race" IS NOT NULL + GROUP BY + "race", "gender" + ORDER BY + "race", "gender" +), +has_null AS ( + SELECT + "race", + "gender", + SUM("cnt") AS cnt + FROM "{TEST_GLUE_DB}"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender ='cumulus__none' + AND "race" IS NOT NULL + GROUP BY + "race", "gender" + ORDER BY + "race", "gender" +) +SELECT * FROM non_null +UNION +SELECT * from has_null +ORDER BY + "race", "gender\"""", ), ( {"column": "gender"}, ["gender:strEq:female"], {"data_package_id": "test_study"}, - f'SELECT gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE COALESCE (cast(race AS VARCHAR)) IS NOT NULL AND gender IS NOT NULL " - "AND gender LIKE 'female' " - "GROUP BY gender ORDER BY gender", + f"""WITH non_null AS ( + SELECT + "gender", + SUM("cnt") AS cnt + FROM "{TEST_GLUE_DB}"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender !='cumulus__none' + GROUP BY + "gender" + ORDER BY + "gender" +), +has_null AS ( + SELECT + "gender", + SUM("cnt") AS cnt + FROM "cumulus-aggregator-test-db"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender ='cumulus__none' + GROUP BY + "gender" + ORDER BY + "gender" +) +SELECT * FROM non_null +UNION +SELECT * from has_null +ORDER BY + "gender\"""", ), ( {"column": "gender", "stratifier": "race"}, ["gender:strEq:female"], {"data_package_id": "test_study"}, - f'SELECT race, gender, sum(cnt) as cnt FROM "{TEST_GLUE_DB}"."test_study" ' - "WHERE gender IS NOT NULL " - "AND gender LIKE 'female' " - "AND race IS NOT NULL " - "GROUP BY race, gender ORDER BY race, gender", + f"""WITH non_null AS ( + SELECT + "race", + "gender", + SUM("cnt") AS cnt + FROM "{TEST_GLUE_DB}"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender !='cumulus__none' + AND "race" IS NOT NULL + GROUP BY + "race", "gender" + ORDER BY + "race", "gender" +), +has_null AS ( + SELECT + "race", + "gender", + SUM("cnt") AS cnt + FROM "cumulus-aggregator-test-db"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender ='cumulus__none' + AND "race" IS NOT NULL + GROUP BY + "race", "gender" + ORDER BY + "race", "gender" +) +SELECT * FROM non_null +UNION +SELECT * from has_null +ORDER BY + "race", "gender\"""", + ), + ( + {"column": "gender", "stratifier": "race"}, + ["cnt:gt:2", "cnt:lt:10"], + {"data_package_id": "test_study"}, + f"""WITH non_null AS ( + SELECT + "race", + "gender", + SUM("cnt") AS cnt + FROM "{TEST_GLUE_DB}"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender !='cumulus__none' + AND "race" IS NOT NULL + GROUP BY + "race", "gender" + ORDER BY + "race", "gender" +), +has_null AS ( + SELECT + "race", + "gender", + SUM("cnt") AS cnt + FROM "cumulus-aggregator-test-db"."test_study" + WHERE COALESCE ( + cast("race" AS VARCHAR) + ) IS NOT NULL + AND "gender" IS NOT NULL + AND gender ='cumulus__none' + AND "race" IS NOT NULL + GROUP BY + "race", "gender" + ORDER BY + "race", "gender" +) +SELECT * FROM non_null +WHERE + cnt > 2 + AND cnt < 10 +UNION +SELECT * from has_null +WHERE + cnt > 2 + AND cnt < 10 +ORDER BY + "race", "gender\"""", ), ], ) @@ -103,6 +282,7 @@ def test_format_payload(query_params, filters, expected_payload): df = mock_data_frame(filters) payload = get_chart_data._format_payload(df, query_params, filters, "cnt") assert payload == expected_payload +""" def test_get_data_cols(mock_bucket): @@ -143,3 +323,4 @@ def test_handler(): '"rowCount": 2, "totalCount": 20, "data": [{"rows": [["male", 10], ' '["female", 10]]}]}' ) +""" \ No newline at end of file