From 857b4dd85ef1e03d7077b41c9bf0a54d7f92ddce Mon Sep 17 00:00:00 2001 From: Diego Quintana Date: Wed, 7 Feb 2024 17:06:51 +0100 Subject: [PATCH] feature: build models matching dset gaps analysis (et/somenergia-jardiner!89) * ci: make dbt build run incremental models on pre at merge requests * chore: bump naming and style * chore: format and presentation * change: update references to and conditions in test use materialized model instead of view and present results by date in descending order * new: sql to analyze gaps for a specific day given a report from dset Co-authored-by: Roger * chore: move tests to tests/dset and update yml config remove gaps tests that are no longer being used and keep just one for simplicity * chore: format query * chore: format query using fully qualified references * feature: add new test that checks incoming samples on a daily basis * change: limit gap test to last 6 months relative to current_date only instead of all time * change: refactor test so that ignores null values from api --- .gitlab-ci.yml | 5 +- .../analyses/dset_gaps_preview_20240110.sql | 39 ++++++++ .../intermediate/dset/_int_dset__tests.yaml | 4 +- .../int_dset_responses__spined_metadata.sql | 1 + ...t_active_power__missing_phase_last_day.sql | 0 ...set_gaps_per_day_and_signal_last_month.sql | 88 +++++++++++++++++++ ...have_references_in_signal_denormalized.sql | 0 ...set_raw_uuids_not_empty_or_null_3_days.sql | 0 .../test_dset_signals_receiver_all_time.sql | 0 .../test_dset_signals_receiver_last_hour.sql | 38 ++++++++ .../{ => dset}/test_gapfill_5min_nrows.sql | 0 .../test_repeated_device_uuid_errors.sql | 0 ...gnal_is_correctly_converted_to_numeric.sql | 0 .../test_solargis_temperature_registry.sql | 0 .../tests/{ => dset}/test_spine_hourly.sql | 0 ...set_gaps_per_month_and_signal_all_time.sql | 77 ---------------- ...et_gaps_per_month_and_signal_last_hour.sql | 77 ---------------- .../test_dset_signals_receiver_last_hour.sql | 35 -------- 18 files changed, 172 insertions(+), 192 deletions(-) create mode 100644 dbt_jardiner/analyses/dset_gaps_preview_20240110.sql rename dbt_jardiner/tests/{ => dset}/test_dset_active_power__missing_phase_last_day.sql (100%) create mode 100644 dbt_jardiner/tests/dset/test_dset_gaps_per_day_and_signal_last_month.sql rename dbt_jardiner/tests/{ => dset}/test_dset_incoming_uuids_have_references_in_signal_denormalized.sql (100%) rename dbt_jardiner/tests/{ => dset}/test_dset_raw_uuids_not_empty_or_null_3_days.sql (100%) rename dbt_jardiner/tests/{ => dset}/test_dset_signals_receiver_all_time.sql (100%) create mode 100644 dbt_jardiner/tests/dset/test_dset_signals_receiver_last_hour.sql rename dbt_jardiner/tests/{ => dset}/test_gapfill_5min_nrows.sql (100%) rename dbt_jardiner/tests/{ => dset}/test_repeated_device_uuid_errors.sql (100%) rename dbt_jardiner/tests/{ => dset}/test_signal_is_correctly_converted_to_numeric.sql (100%) rename dbt_jardiner/tests/{ => dset}/test_solargis_temperature_registry.sql (100%) rename dbt_jardiner/tests/{ => dset}/test_spine_hourly.sql (100%) delete mode 100644 dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_all_time.sql delete mode 100644 dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_last_hour.sql delete mode 100644 dbt_jardiner/tests/test_dset_signals_receiver_last_hour.sql diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 61520af0..c73f2562 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -135,6 +135,7 @@ dbt-build: DBUSER: $SOM_JARDINER_DB_USER DBPASSWORD: $SOM_JARDINER_DB_PASSWORD DBNAME: $SOM_JARDINER_DB_DBNAME + DBT_MODELS_SELECTOR: "--models state:modified+" DBT_MANIFEST_ARTIFACT_URL: "https://$CI_SERVER_HOST/api/v4/projects/$CI_PROJECT_ID/jobs/artifacts/${CI_DEFAULT_BRANCH}/download?job=pages&job_token=$CI_JOB_TOKEN" image: ${SOM_HARBOR_DADES_URL}/${SOM_PROJECT_NAME}-dbt-docs:latest script: @@ -145,7 +146,7 @@ dbt-build: - curl --location --output /tmp/artifacts.zip ${DBT_MANIFEST_ARTIFACT_URL} - unzip -o /tmp/artifacts.zip -d /tmp/artifacts - cp /tmp/artifacts/public/dbt_docs/manifest.json ${CI_PROJECT_DIR}/${DBT_PROJECT_DIR_NAME}/state/prod/manifest.json - - dbt build --target ${DBT_TARGET_NAME} --store-failures --threads 4 --models state:modified+ --state state/prod + - dbt build --target ${DBT_TARGET_NAME} --store-failures --threads 4 ${DBT_MODELS_SELECTOR} --state state/prod tags: - somenergia-et rules: @@ -153,12 +154,14 @@ dbt-build: when: always variables: DBT_TARGET_NAME: prod + DBT_MODELS_SELECTOR: "--models state:modified+" changes: paths: *dbt-build-changes-paths allow_failure: false - if: $CI_PIPELINE_SOURCE == "merge_request_event" when: always variables: + DBT_MODELS_SELECTOR: "--models state:modified+ tag:dset_responses_fresh" DBT_TARGET_NAME: pre DBT_FAIL_FAST: "True" changes: diff --git a/dbt_jardiner/analyses/dset_gaps_preview_20240110.sql b/dbt_jardiner/analyses/dset_gaps_preview_20240110.sql new file mode 100644 index 00000000..8980df64 --- /dev/null +++ b/dbt_jardiner/analyses/dset_gaps_preview_20240110.sql @@ -0,0 +1,39 @@ +with + som_uuids as ( + select distinct group_name, signal_code, signal_uuid + from {{ ref("int_dset_responses__materialized") }} + where ts >= '2024-01-10' and ts < '2024-01-11' + order by signal_uuid desc), + + dset_uuids as ( + select a.gru_codi, a.gru_nom, a.sen_codi, a.sen_descripcio, a.esperats_frequencia, a.trobats_senyal, b.signal_uuid + from analytics.se_forats_hornsby_dades_dia_10 as a + left join som_uuids as b + on a.sen_codi = b.signal_code + and a.gru_nom = b.group_name), + + som_count as ( + select + signal_uuid, count(*) as cnt + from {{ ref("int_dset_responses__materialized") }} + where ts >= '2024-01-10' and ts < '2024-01-11' + and signal_value is not null + group by signal_uuid + order by signal_uuid desc, cnt desc), + + summary as ( + select b.*, a.cnt as som_trobats_senyal from som_count as a + left join dset_uuids as b + on a.signal_uuid = b.signal_uuid + order by b.trobats_senyal desc, a.cnt desc + ), + + final as ( + select *, + esperats_frequencia - trobats_senyal as n_forats_dset, + 288 - som_trobats_senyal as n_forats_som + from summary + ) + +select * from final +order by gru_codi, gru_nom, sen_codi diff --git a/dbt_jardiner/models/jardiner/intermediate/dset/_int_dset__tests.yaml b/dbt_jardiner/models/jardiner/intermediate/dset/_int_dset__tests.yaml index 56d0a5bf..23ceeadf 100644 --- a/dbt_jardiner/models/jardiner/intermediate/dset/_int_dset__tests.yaml +++ b/dbt_jardiner/models/jardiner/intermediate/dset/_int_dset__tests.yaml @@ -43,9 +43,9 @@ tests: config: severity: warning - - name: test_dset_gaps_per_month_and_signal_all_time + - name: test_dset_gaps_per_day_and_signal_last_month description: > - Taula que recull el número de forats per mes i senyal, de tota la història de senyals que hem rebut + Taula que recull el número de forats per dia i senyal, del últim mes de dades en curs config: severity: warning diff --git a/dbt_jardiner/models/jardiner/intermediate/dset/int_dset_responses__spined_metadata.sql b/dbt_jardiner/models/jardiner/intermediate/dset/int_dset_responses__spined_metadata.sql index bb9a77d2..cb5fcd36 100644 --- a/dbt_jardiner/models/jardiner/intermediate/dset/int_dset_responses__spined_metadata.sql +++ b/dbt_jardiner/models/jardiner/intermediate/dset/int_dset_responses__spined_metadata.sql @@ -29,6 +29,7 @@ dset_from_december_2023 as ( {# if we don't limit queried_at the planner shits the bed #} and queried_at > '2023-12-01' ), + spined_dset as ( select spined.ts, diff --git a/dbt_jardiner/tests/test_dset_active_power__missing_phase_last_day.sql b/dbt_jardiner/tests/dset/test_dset_active_power__missing_phase_last_day.sql similarity index 100% rename from dbt_jardiner/tests/test_dset_active_power__missing_phase_last_day.sql rename to dbt_jardiner/tests/dset/test_dset_active_power__missing_phase_last_day.sql diff --git a/dbt_jardiner/tests/dset/test_dset_gaps_per_day_and_signal_last_month.sql b/dbt_jardiner/tests/dset/test_dset_gaps_per_day_and_signal_last_month.sql new file mode 100644 index 00000000..4b610138 --- /dev/null +++ b/dbt_jardiner/tests/dset/test_dset_gaps_per_day_and_signal_last_month.sql @@ -0,0 +1,88 @@ +{{ config(severity="warn") }} + +with + window_observed as ( + select + signal_value, + group_name, + signal_code, + signal_id, + signal_device_type, + signal_uuid, + queried_at, + ts as current_ts, + signal_frequency::interval as signal_frequency + from {{ ref("int_dset_responses__materialized") }} + where current_date - interval '1 month' < ts + and signal_value is not null + ), + + window_lagged as ( + select + *, + lag(current_ts) over ( + partition by signal_uuid + order by current_ts asc) as previous_ts + from window_observed + ), + + gaps_observed as ( + select + *, + current_ts::date as "date", + current_ts - previous_ts as gap + from window_lagged + where current_ts - previous_ts > signal_frequency + ), + + gaps_summarized as ( + select + "date", + group_name, + signal_code, + signal_id, + signal_device_type, + signal_uuid, + gap, + signal_frequency, + count(signal_uuid) as n_gaps + from gaps_observed + group by + "date", + group_name, + signal_uuid, + signal_code, + signal_id, + signal_device_type, + gap, + signal_frequency + order by + "date" desc, + count(signal_uuid) desc, + gap desc, + group_name asc, + signal_code asc, + signal_id asc, + signal_device_type asc + ), + + gaps_converted_to_n_missing_samples as ( + select + *, + -- n_gaps * ceiling(gap/frequency - 1) as n_missing_samples. + -- The -1 is because the starting point in the gap can't be counted as missing + n_gaps * ceil(extract(epoch from gap) / extract(epoch from signal_frequency) - 1) as n_missing_samples, + -- 24*60 are the minutes in a day + (24 * 60) / (extract(epoch from signal_frequency) / 60) as n_samples_per_day + from gaps_summarized + ), + + gaps_ratio as ( + select + *, + n_missing_samples / n_samples_per_day as ratio_missing_samples + from gaps_converted_to_n_missing_samples + ) + +select * +from gaps_ratio diff --git a/dbt_jardiner/tests/test_dset_incoming_uuids_have_references_in_signal_denormalized.sql b/dbt_jardiner/tests/dset/test_dset_incoming_uuids_have_references_in_signal_denormalized.sql similarity index 100% rename from dbt_jardiner/tests/test_dset_incoming_uuids_have_references_in_signal_denormalized.sql rename to dbt_jardiner/tests/dset/test_dset_incoming_uuids_have_references_in_signal_denormalized.sql diff --git a/dbt_jardiner/tests/test_dset_raw_uuids_not_empty_or_null_3_days.sql b/dbt_jardiner/tests/dset/test_dset_raw_uuids_not_empty_or_null_3_days.sql similarity index 100% rename from dbt_jardiner/tests/test_dset_raw_uuids_not_empty_or_null_3_days.sql rename to dbt_jardiner/tests/dset/test_dset_raw_uuids_not_empty_or_null_3_days.sql diff --git a/dbt_jardiner/tests/test_dset_signals_receiver_all_time.sql b/dbt_jardiner/tests/dset/test_dset_signals_receiver_all_time.sql similarity index 100% rename from dbt_jardiner/tests/test_dset_signals_receiver_all_time.sql rename to dbt_jardiner/tests/dset/test_dset_signals_receiver_all_time.sql diff --git a/dbt_jardiner/tests/dset/test_dset_signals_receiver_last_hour.sql b/dbt_jardiner/tests/dset/test_dset_signals_receiver_last_hour.sql new file mode 100644 index 00000000..ea97c8e3 --- /dev/null +++ b/dbt_jardiner/tests/dset/test_dset_signals_receiver_last_hour.sql @@ -0,0 +1,38 @@ +{{ config(error_if=">500") }} +{# error limit is set on half the number of signal uuids available #} + +with +uuids_received_recently as ( + select + signal_uuid, + true as is_received_recently, + max(ts) as last_received_ts + from {{ ref("int_dset_responses__materialized") }} + where ts >= (now() - interval '2 hours') + group by signal_uuid + {# interval used of two hours is depending on the natural delay of dset data + materialization cycle -#} +), + +uuids_expected as ( + select + s.plant_uuid, + s.plant_name, + s.signal_name, + s.signal_uuid, + s.device_name, + s.device_type, + s.device_uuid, + coalesce(r.is_received_recently, false) as received_from_dset + from {{ ref("raw_gestio_actius__signal_denormalized") }} as s + left join uuids_received_recently as r + on s.signal_uuid = r.signal_uuid + order by s.plant_name +), + +uuids_not_received as ( + select * + from uuids_expected + where received_from_dset is false +) + +select * from uuids_not_received diff --git a/dbt_jardiner/tests/test_gapfill_5min_nrows.sql b/dbt_jardiner/tests/dset/test_gapfill_5min_nrows.sql similarity index 100% rename from dbt_jardiner/tests/test_gapfill_5min_nrows.sql rename to dbt_jardiner/tests/dset/test_gapfill_5min_nrows.sql diff --git a/dbt_jardiner/tests/test_repeated_device_uuid_errors.sql b/dbt_jardiner/tests/dset/test_repeated_device_uuid_errors.sql similarity index 100% rename from dbt_jardiner/tests/test_repeated_device_uuid_errors.sql rename to dbt_jardiner/tests/dset/test_repeated_device_uuid_errors.sql diff --git a/dbt_jardiner/tests/test_signal_is_correctly_converted_to_numeric.sql b/dbt_jardiner/tests/dset/test_signal_is_correctly_converted_to_numeric.sql similarity index 100% rename from dbt_jardiner/tests/test_signal_is_correctly_converted_to_numeric.sql rename to dbt_jardiner/tests/dset/test_signal_is_correctly_converted_to_numeric.sql diff --git a/dbt_jardiner/tests/test_solargis_temperature_registry.sql b/dbt_jardiner/tests/dset/test_solargis_temperature_registry.sql similarity index 100% rename from dbt_jardiner/tests/test_solargis_temperature_registry.sql rename to dbt_jardiner/tests/dset/test_solargis_temperature_registry.sql diff --git a/dbt_jardiner/tests/test_spine_hourly.sql b/dbt_jardiner/tests/dset/test_spine_hourly.sql similarity index 100% rename from dbt_jardiner/tests/test_spine_hourly.sql rename to dbt_jardiner/tests/dset/test_spine_hourly.sql diff --git a/dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_all_time.sql b/dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_all_time.sql deleted file mode 100644 index a0ff2505..00000000 --- a/dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_all_time.sql +++ /dev/null @@ -1,77 +0,0 @@ -{{ config(severity="warn") }} - -with - window_observed as ( - select - group_name, - signal_code, - signal_id, - signal_device_type, - signal_uuid, - queried_at, - ts as current_ts, - signal_frequency::interval as signal_frequency, - lag(ts) over (partition by signal_uuid order by ts asc) as previous_ts - from {{ ref("int_dset_responses__materialized") }} - ), - - gaps as ( - select - *, - current_ts - previous_ts as gap, - extract('year' from current_ts) as "year", - extract('month' from current_ts) as "month" - from window_observed - where current_ts - previous_ts > signal_frequency - and current_date - interval '1 day' < current_ts - ), - - summarized as ( - select - group_name, - signal_code, - signal_id, - signal_device_type, - signal_uuid, - gap, - signal_frequency, - "year", - "month", - max(queried_at) as last_queried_at, - max(current_ts) as last_current_ts, - max(queried_at - current_ts) as max_waiting_time, - min(queried_at - current_ts) as min_waiting_time, - count(signal_uuid) as n_gaps - from gaps - group by - group_name, - signal_uuid, - signal_code, - signal_id, - signal_device_type, - gap, - signal_frequency, - "year", - "month" - order by - count(signal_uuid) desc, - gap desc, - group_name asc, - signal_code asc, - signal_id asc, - signal_device_type asc, - "year" desc, - "month" asc - ), - - gap_converted as ( - select - *, - {#- n_gaps * ceiling(gap/frequency - 1) as n_missing_samples. - The -1 is because the starting point in the gap can't be counted as missing #} - n_gaps * ceil(extract(epoch from gap) / extract(epoch from signal_frequency) - 1) as n_missing_samples - from summarized - ) - -select * -from gap_converted diff --git a/dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_last_hour.sql b/dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_last_hour.sql deleted file mode 100644 index 93baebdc..00000000 --- a/dbt_jardiner/tests/test_dset_gaps_per_month_and_signal_last_hour.sql +++ /dev/null @@ -1,77 +0,0 @@ -{{ config(severity="warn") }} - -with - window_observed as ( - select - group_name, - signal_code, - signal_id, - signal_device_type, - signal_uuid, - queried_at, - ts as current_ts, - signal_frequency::interval as signal_frequency, - lag(ts) over (partition by signal_uuid order by ts asc) as previous_ts - from {{ ref("int_dset_responses__view_current_hour") }} - ), - - gaps as ( - select - *, - current_ts - previous_ts as gap, - extract('year' from current_ts) as "year", - extract('month' from current_ts) as "month" - from window_observed - where current_ts - previous_ts > signal_frequency - and current_date - interval '1 day' < current_ts - ), - - summarized as ( - select - group_name, - signal_code, - signal_id, - signal_device_type, - signal_uuid, - gap, - signal_frequency, - "year", - "month", - max(queried_at) as last_queried_at, - max(current_ts) as last_current_ts, - max(queried_at - current_ts) as max_waiting_time, - min(queried_at - current_ts) as min_waiting_time, - count(signal_uuid) as n_gaps - from gaps - group by - group_name, - signal_uuid, - signal_code, - signal_id, - signal_device_type, - gap, - signal_frequency, - "year", - "month" - order by - count(signal_uuid) desc, - gap desc, - group_name asc, - signal_code asc, - signal_id asc, - signal_device_type asc, - "year" desc, - "month" asc - ), - - gap_converted as ( - select - *, - {#- n_gaps * ceiling(gap/frequency - 1) as n_missing_samples. - The -1 is because the starting point in the gap can't be counted as missing #} - n_gaps * ceil(extract(epoch from gap) / extract(epoch from signal_frequency) - 1) as n_missing_samples - from summarized - ) - -select * -from gap_converted diff --git a/dbt_jardiner/tests/test_dset_signals_receiver_last_hour.sql b/dbt_jardiner/tests/test_dset_signals_receiver_last_hour.sql deleted file mode 100644 index 38adcaa2..00000000 --- a/dbt_jardiner/tests/test_dset_signals_receiver_last_hour.sql +++ /dev/null @@ -1,35 +0,0 @@ -{{ config(error_if=">500") }} -{# error limit is set on half the number of signal uuids available #} - -with -valors as ( - select distinct - signal_uuid, - true as received_from_dset_pre - from {{ ref("int_dset_responses__materialized") }} - {# interval used of two hours is depending on the natural delay of dset data + materialization cycle -#} - where ts >= (now() - interval '2 hours') -), - -joined as ( - select - signals.plant_uuid, - signals.plant_name, - signals.signal_name, - signals.signal_uuid, - signals.device_name, - signals.device_type, - signals.device_uuid, - coalesce(valors.received_from_dset_pre, false) as received_from_dset - from {{ ref("raw_gestio_actius__signal_denormalized") }} as signals - left join valors on signals.signal_uuid = valors.signal_uuid - order by signals.plant_name -), - -filtered as ( - select * - from joined - where received_from_dset is false -) - -select * from filtered