From 179b9aa2529536f405e4f70c9f3f67a6401cf604 Mon Sep 17 00:00:00 2001 From: Peter Inglesby Date: Mon, 20 Nov 2023 16:32:50 +0000 Subject: [PATCH] feat: Handle duplicate records in ons_deaths in EMIS The implementation matches the implementation in TPP --- cohortextractor/emis_backend.py | 12 +++++--- tests/test_emis_backend.py | 50 +++++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/cohortextractor/emis_backend.py b/cohortextractor/emis_backend.py index a8b236fe..f9bd0cb7 100644 --- a/cohortextractor/emis_backend.py +++ b/cohortextractor/emis_backend.py @@ -1530,15 +1530,18 @@ def patients_with_these_codes_on_death_certificate( ) else: code_conditions = "1 = 1" + # The ONS deaths data contains some duplicate patient IDs. In most + # cases these are exact duplicate rows, but in some cases the same + # patient appears twice with different dates of death or a different + # underlying cause of death. We handle this by (arbitrarily) taking the + # earliest date of death or the lexically smallest ICD-10 code. if returning == "binary_flag": column_definition = "1" elif returning == "date_of_death": # Yes, we're converting an integer to a string to a timestamp to a date. 
- column_definition = ( - "CAST(date_parse(CAST(o.reg_stat_dod AS VARCHAR), '%Y%m%d') AS date)" - ) + column_definition = "MIN(CAST(date_parse(CAST(o.reg_stat_dod AS VARCHAR), '%Y%m%d') AS date))" elif returning == "underlying_cause_of_death": - column_definition = "o.icd10u" + column_definition = "MIN(o.icd10u)" else: raise ValueError(f"Unsupported `returning` value: {returning}") # ONS_TABLE is updated with each release of data from ONS, so we need to @@ -1554,6 +1557,7 @@ def patients_with_these_codes_on_death_certificate( WHERE ({code_conditions}) AND {date_condition} AND o.upload_date = (SELECT MAX(upload_date) FROM {ONS_TABLE}) + GROUP BY p.registration_id, p.hashed_organisation """ def patients_died_from_any_cause( diff --git a/tests/test_emis_backend.py b/tests/test_emis_backend.py index fb43408b..29dce129 100644 --- a/tests/test_emis_backend.py +++ b/tests/test_emis_backend.py @@ -1566,8 +1566,15 @@ def test_patients_with_these_codes_on_death_certificate(): nhs_no="bbb", ONSDeath=[ ONSDeaths( - reg_stat_dod=20210101, icd10u=code, upload_date="2020-04-01" - ) + reg_stat_dod=20210101, + icd10u=code, + upload_date="2020-03-01", + ), + ONSDeaths( + reg_stat_dod=20210101, + icd10u=code, + upload_date="2020-04-01", + ), ], ), # Died of something else @@ -1575,20 +1582,39 @@ def test_patients_with_these_codes_on_death_certificate(): nhs_no="ccc", ONSDeath=[ ONSDeaths( - reg_stat_dod=20200201, icd10u="MI", upload_date="2020-04-01" - ) + reg_stat_dod=20200201, + icd10u="MI", + upload_date="2020-03-01", + ), + ONSDeaths( + reg_stat_dod=20200201, + icd10u="MI", + upload_date="2020-04-01", + ), ], ), # Covid underlying cause Patient( nhs_no="ddd", ONSDeath=[ + ONSDeaths( + reg_stat_dod=20200201, + icd10u=code, + icd10014="MI", + upload_date="2020-03-01", + ), ONSDeaths( reg_stat_dod=20200201, icd10u=code, icd10014="MI", upload_date="2020-04-01", - ) + ), + ONSDeaths( + reg_stat_dod=20200202, + icd10u=code, + icd10014="MJ", + upload_date="2020-04-01", + ), ], ), # 
Covid not underlying cause @@ -1600,7 +1626,19 @@ def test_patients_with_these_codes_on_death_certificate(): icd10u="MI", icd10014=code, upload_date="2020-04-01", - ) + ), + ONSDeaths( + reg_stat_dod=20200302, + icd10u="MJ", + icd10014=code, + upload_date="2020-04-01", + ), + ONSDeaths( + reg_stat_dod=20200301, + icd10u="MI", + icd10014=code, + upload_date="2020-04-01", + ), ], ), ]