From 34c61ba7a03f9982817fdfc56c19856480c8e48d Mon Sep 17 00:00:00 2001 From: Marta Alcalde-Herraiz Date: Tue, 5 Dec 2023 16:06:02 +0000 Subject: [PATCH] Moving across cohort matching --- DESCRIPTION | 1 + NAMESPACE | 1 + R/generateMatchedCohortSet.R | 435 ++++++++++++++++++ man/generateMatchedCohortSet.Rd | 43 ++ .../testthat/test-generateMatchedCohortSet.R | 273 +++++++++++ 5 files changed, 753 insertions(+) create mode 100644 R/generateMatchedCohortSet.R create mode 100644 man/generateMatchedCohortSet.Rd create mode 100644 tests/testthat/test-generateMatchedCohortSet.R diff --git a/DESCRIPTION b/DESCRIPTION index 0f346897..c046c00e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Imports: tidyr Suggests: DBI, + DrugUtilisation, duckdb, testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index b57d2ec2..72989855 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(generateCombinationCohortSet) +export(generateMatchedCohortSet) export(getIdentifier) export(joinOverlap) export(requireAge) diff --git a/R/generateMatchedCohortSet.R b/R/generateMatchedCohortSet.R new file mode 100644 index 00000000..7576c2e7 --- /dev/null +++ b/R/generateMatchedCohortSet.R @@ -0,0 +1,435 @@ +#' Generate a new cohort matched cohort from a preexisting target cohort. The +#' new cohort will contain individuals not included in the target cohort with +#' same year of birth (matchYearOfBirth = TRUE) and same sex (matchSex = TRUE). +#' +#' @param cdm A cdm reference object. +#' @param name Name of the new generated cohort set. +#' @param targetCohortName Name of the target cohort to match. +#' @param targetCohortId Cohort definition id to match from the target cohort. +#' If NULL all the cohort definition id present in the target cohort will be +#' matched. +#' @param matchSex Whether to match in sex. +#' @param matchYearOfBirth Whether to match in year of birth. +#' @param ratio Number of allowed matches per individual in the target cohort. +#' +#' @return A cdm reference object that contains the new generated cohort set. +#' +#' @export +#' +#' +generateMatchedCohortSet <- function(cdm, + name, + targetCohortName, + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 1){ + # validate initial input + validateInput( + cdm = cdm, name = name, targetCohortName = targetCohortName, + targetCohortId = targetCohortId, matchSex = matchSex, + matchYearOfBirth = matchYearOfBirth, ratio = ratio + ) + + # table prefix + #tablePrefix <- randomPrefix() + + # get the number of cohorts + n <- getNumberOfCohorts(cdm, targetCohortName) + + # get target cohort id + targetCohortId <- getTargetCohortId(cdm, targetCohortId, targetCohortName) + + # Create the cohort name with cases and controls of the targetCohortId + cdm <- getNewCohort(cdm, name, targetCohortName, targetCohortId, n) + + # Exclude cases from controls + cdm <- excludeCases(cdm, name, targetCohortId, n) + + # get matched tables + matchCols <- getMatchCols(matchSex, matchYearOfBirth) + + if(!is.null(matchCols)){ + # Exclude individuals without any match + cdm <- excludeNoMatchedIndividuals(cdm, name, matchCols, n) + + # Match as ratio was infinite + cdm <- infiniteMatching(cdm, name, targetCohortId) + + # Delete controls that are not in observation + cdm <- checkObservationPeriod(cdm, name, targetCohortId, n) + + # Check ratio + cdm <- checkRatio(cdm, name, ratio, targetCohortId, n) + + # Check cohort set ref + cdm <- checkCohortSetRef(cdm, name, matchSex, matchYearOfBirth, targetCohortId, n) + } + # Return + return(cdm) +} + +#' @noRd +validateInput <- function(cdm, + name, + targetCohortName, + targetCohortId, + matchSex, + matchYearOfBirth, + ratio) { + errorMessage <- checkmate::makeAssertCollection() + # Check cdm class + data_check <- any("cdm_reference" == class(cdm)) + checkmate::assertTRUE(data_check, add = errorMessage) + if(!isTRUE(data_check)){ + errorMessage$push(glue::glue("- cdm input must be a cdm object")) + } + # Check if targetCohortName is a character + targetCohortName_format_check <- any(class(targetCohortName) %in% c("character")) + checkmate::assertTRUE(targetCohortName_format_check, add = errorMessage) + if(!isTRUE(targetCohortName_format_check)){ + errorMessage$push(glue::glue("- targetCohortName input must be a string")) + } + # Check if targetCohortName length + targetCohortName_length_check <- length(targetCohortName) == 1 + checkmate::assertTRUE( targetCohortName_length_check, add = errorMessage) + if(!isTRUE( targetCohortName_length_check)){ + errorMessage$push(glue::glue("- targetCohortName input must have length equal to 1")) + } + # Check if targetCohortName is within the cdm object + targetCohortName_check <- targetCohortName %in% names(cdm) + checkmate::assertTRUE(targetCohortName_check, add = errorMessage) + if(!isTRUE(targetCohortName_check)){ + errorMessage$push(glue::glue("- cdm input has not table named {targetCohortName}")) + } + # Check if observation period is within the cdm object + observation_period_check <- "observation_period" %in% names(cdm) + checkmate::assertTRUE(observation_period_check , add = errorMessage) + if(!isTRUE(observation_period_check)){ + errorMessage$push(glue::glue("- cdm input has not table named 'observation_period'")) + } + # Check if targetCohortId is a numeric value + if(!is.null(targetCohortId)){ + targetCohortId_format_check <- any(class(targetCohortId) %in% c("numeric","double","integer")) + checkmate::assertTRUE(targetCohortId_format_check, add = errorMessage) + if(!isTRUE(targetCohortId_format_check)){ + errorMessage$push(glue::glue("- targetCohortId input must be numeric")) + } + } + # Check if targetCohortId is in the cohort_definition_id + if(!is.null(targetCohortId)){ + rows <- cdm[[targetCohortName]] %>% dplyr::filter(.data$cohort_definition_id %in% targetCohortId) %>% dplyr::tally() %>% dplyr::pull() + targetCohortId_check <- rows != 0 + checkmate::assertTRUE(targetCohortId_check, add = errorMessage) + if(!isTRUE(targetCohortId_check)){ + errorMessage$push(glue::glue("- {name} table does not containg '{targetCohortId}' as a cohort_definition_id")) + } + } + checkmate::reportAssertions(collection = errorMessage) + return(invisible(TRUE)) +} + + + +randomPrefix <- function(n = 5) { + paste0( + "temp_", paste0(sample(letters, 5, TRUE), collapse = ""), "_", collapse = "" + ) +} + + +getNumberOfCohorts <- function(cdm, targetCohortName){ + # Read number of cohorts + n <- cdm[[targetCohortName]] %>% + dplyr::summarise(v = max(.data$cohort_definition_id)) %>% + dplyr::pull(.data$v) # number of different cohorts + + if(is.na(n)){# Empty table, number of cohorts is 0 + n <- 0 + } + return(n) +} + + +getTargetCohortId <- function(cdm, targetCohortId, targetCohortName){ + if(is.null(targetCohortId)){ + targetCohortId <- CDMConnector::cohortSet(cdm[[targetCohortName]]) %>% + dplyr::arrange(.data$cohort_definition_id) %>% + dplyr::pull("cohort_definition_id") + } + + return(targetCohortId) +} + + +getNewCohort <- function(cdm, name, targetCohortName, targetCohortId, n){ + if(n == 0){ + cdm[[name]] <- CDMConnector::new_generated_cohort_set( + cohort_ref = cdm[[targetCohortName]] %>% CDMConnector::compute_query(schema = attr(cdm, "write_schema"), + temporary = FALSE, + name = name, + overwrite = TRUE), + cohort_attrition_ref = cdm[[targetCohortName]] %>% CDMConnector::cohort_attrition(), + cohort_set_ref = cdm[[targetCohortName]] %>% CDMConnector::cohort_set(), + overwrite = TRUE) + }else{ + # Create controls cohort + controls <- lapply(targetCohortId+n, function(x) { + cdm[["person"]] %>% + dplyr::select("subject_id" = "person_id") %>% + dplyr::mutate("cohort_definition_id" = .env$x) + }) + + # Create table with controls + cases (all cases existing in the cohort, without considering the targetCohortId) + all <- Reduce(dplyr::union_all, controls) %>% + dplyr::mutate("cohort_start_date" = NA, + "cohort_end_date" = NA) %>% + dplyr::union_all( + cdm[[targetCohortName]] %>% + dplyr::filter(.data$cohort_definition_id %in% .env$targetCohortId) + ) %>% + CDMConnector::compute_query(schema = attr(cdm, "write_schema"), + temporary = FALSE, + name = name, + overwrite = TRUE) + + cdm[[name]] <- CDMConnector::new_generated_cohort_set( + cohort_ref = all, + overwrite = TRUE) + + cohort_set_ref <- cdm[[targetCohortName]] %>% + CDMConnector::cohort_set() %>% + dplyr::filter(.data$cohort_definition_id %in% .env$targetCohortId) %>% + dplyr::slice(rep(1:dplyr::n(), times = 2)) %>% + dplyr::group_by(.data$cohort_definition_id) %>% + dplyr::mutate(cohort_name = dplyr::if_else(dplyr::row_number() == 2, paste0(.data$cohort_name,"_matched"), .data$cohort_name)) %>% + dplyr::mutate(cohort_definition_id = dplyr::if_else(dplyr::row_number() == 2, .data$cohort_definition_id+.env$n, .data$cohort_definition_id)) %>% + dplyr::ungroup() + + + cohort_attrition <- CDMConnector::cohort_attrition(cdm[[name]]) %>% + dplyr::mutate(reason = dplyr::if_else(.data$cohort_definition_id %in% c(.env$n+(1:.env$n)), "Subjects in the database", .data$reason)) + + + cdm[[name]] <- CDMConnector::new_generated_cohort_set( + cohort_ref = all, + cohort_attrition_ref = cohort_attrition, + cohort_set_ref = cohort_set_ref, + overwrite = TRUE) + } + return(cdm) +} + +excludeCases <- function(cdm, name, targetCohortId, n){ + # For each target cohort id + for(targetCohortId_i in targetCohortId){ + # Controls + controls <- cdm[[name]] %>% + dplyr::select("subject_id", "cohort_definition_id") %>% + dplyr::filter(.data$cohort_definition_id == targetCohortId_i+.env$n) %>% + dplyr::anti_join( + # Cases + cdm[[name]] %>% + dplyr::select("subject_id","cohort_definition_id") %>% + dplyr::filter(.data$cohort_definition_id == targetCohortId_i) %>% + dplyr::mutate(cohort_definition_id = targetCohortId_i + .env$n), + by = c("subject_id", "cohort_definition_id") + ) + + cdm[[name]] <- cdm[[name]] %>% + # Delete the controls + dplyr::filter(.data$cohort_definition_id != targetCohortId_i + .env$n) %>% + # Add the new controls set + dplyr::union_all( + controls %>% + dplyr::mutate(cohort_start_date = NA, + cohort_end_date = NA) + ) %>% + CDMConnector::computeQuery() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + + } + # Record attrition + cdm[[name]] <- cdm[[name]] %>% + CDMConnector::record_cohort_attrition("Exclude cases", + cohortId = c(targetCohortId+n))%>% + CDMConnector::computeQuery() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + return(cdm) +} + + +getMatchCols <- function(matchSex, matchYearOfBirth){ + # Obtain matched columns + matchCols <- c() + if(matchSex){ + matchCols <- append(matchCols, "gender_concept_id") + } + if(matchYearOfBirth){ + matchCols <- append(matchCols, "year_of_birth") + } + return(matchCols) +} + +excludeNoMatchedIndividuals <- function(cdm, name, matchCols, n){ + cdm[[name]] <- cdm[[name]] %>% + # Append matchcols + dplyr::left_join( + cdm[["person"]] %>% + dplyr::select("subject_id" = "person_id", dplyr::all_of(matchCols)), + by = c("subject_id") + ) %>% + CDMConnector::computeQuery() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + + # Create column group id + cdm[[name]] <- cdm[[name]] %>% + dplyr::inner_join( + cdm[[name]] %>% + dplyr::select(dplyr::all_of(matchCols)) %>% + dplyr::distinct() %>% + dplyr::mutate(group_id = dplyr::row_number()), + by = c(matchCols) + ) %>% + dplyr::select(-dplyr::all_of(matchCols)) %>% + # Create target definition id column + dplyr::mutate(target_definition_id = + dplyr::if_else( + .data$cohort_definition_id <= .env$n, + .data$cohort_definition_id, + .data$cohort_definition_id - .env$n + )) %>% + CDMConnector::compute_query() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + + # Exclude individuals that do not have any match + cdm[[name]] <- cdm[[name]] %>% + dplyr::inner_join( + cdm[[name]] %>% + dplyr::mutate( + "cohort_definition_id" = dplyr::if_else( + .data$target_definition_id == .data$cohort_definition_id, + .data$cohort_definition_id + .env$n, + .data$cohort_definition_id - .env$n + ) + ) %>% + dplyr::select("cohort_definition_id", "target_definition_id", "group_id") %>% + dplyr::distinct(), + by = c("target_definition_id", "group_id", "cohort_definition_id") + ) %>% + CDMConnector::compute_query() %>% + CDMConnector::compute_query(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) %>% + CDMConnector::record_cohort_attrition("Exclude individuals that do not have any match") + + return(cdm) +} + + +infiniteMatching <- function(cdm, name, targetCohortId){ + # Create pair id to perform a random match + cdm[[name]] <- cdm[[name]] %>% + dplyr::mutate(id = dbplyr::sql("random()")) %>% + dplyr::group_by(.data$cohort_definition_id, .data$group_id) %>% + dbplyr::window_order(.data$id) %>% + dplyr::mutate(pair_id = dplyr::row_number()) %>% + dplyr::select(-"id") %>% + dplyr::ungroup() %>% + CDMConnector::computeQuery() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + + cdm[[name]] <- cdm[[name]] %>% + dplyr::inner_join( + # Calculate the maximum number of cases per group + cdm[[name]] %>% + dplyr::filter(.data$cohort_definition_id %in% .env$targetCohortId) %>% + dplyr::group_by(.data$cohort_definition_id, .data$group_id) %>% + dplyr::mutate(max_cases = max(.data$pair_id)) %>% + dplyr::ungroup() %>% + dplyr::select("group_id", "target_definition_id", "max_cases"), + by = c("group_id", "target_definition_id") + ) %>% + # Calculate the maximum ratio per group + dplyr::mutate(id = (.data$pair_id-1) %% .data$max_cases + 1) %>% + dplyr::mutate(pair_id = .data$id) %>% + dplyr::select(-"max_cases", -"id") %>% + CDMConnector::compute_query() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + + + # Perform random matches with ratio 1:Inf + cdm[[name]] <- cdm[[name]] %>% + dplyr::select(-"cohort_start_date", -"cohort_end_date") %>% + dplyr::inner_join( + # Cohort start date and end date of cases + cdm[[name]] %>% + dplyr::filter(!is.na(.data$cohort_start_date), !is.na(.data$cohort_end_date)) %>% + dplyr::select("pair_id", "group_id", "target_definition_id", "cohort_start_date", "cohort_end_date"), + by = c("pair_id", "group_id", "target_definition_id") + ) %>% + dplyr::distinct() %>% + CDMConnector::compute_query() %>% + CDMConnector::computeQuery(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + return(cdm) +} + +checkObservationPeriod <- function(cdm, name, targetCohortId, n){ + cdm[[name]] <- cdm[[name]] %>% + PatientProfiles::addFutureObservation() %>% + dplyr::filter(!is.na(.data$future_observation)) %>% + dplyr::mutate(cohort_end_date = dplyr::if_else( + .data$cohort_definition_id %in% .env$targetCohortId, + .data$cohort_end_date, + as.Date(!!CDMConnector::dateadd("cohort_start_date", "future_observation")) + )) %>% + dplyr::select(-"future_observation") %>% + dplyr::group_by(.data$target_definition_id, .data$group_id, .data$pair_id) %>% + dplyr::filter(dplyr::n() > 1) %>% + dplyr::ungroup() %>% + CDMConnector::compute_query() %>% + CDMConnector::compute_query(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) %>% + CDMConnector::record_cohort_attrition("Exclude individuals that are not in observation", cohortId = targetCohortId + n) %>% + CDMConnector::record_cohort_attrition("Exclude individuals that their only pair is not in observation", cohortId = targetCohortId) + return(cdm) +} + + +checkRatio <- function(cdm, name, ratio, targetCohortId, n){ + if(ratio == Inf){ + cdm[[name]] <- cdm[[name]] %>% + dplyr::select("cohort_definition_id", "subject_id", "cohort_start_date", "cohort_end_date") %>% + CDMConnector::compute_query() %>% + CDMConnector::compute_query(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) + }else{ + cdm[[name]] <- cdm[[name]] %>% + dplyr::group_by(.data$pair_id, .data$group_id, .data$target_definition_id) %>% + dbplyr::window_order(.data$cohort_definition_id) %>% + dplyr::filter(dplyr::row_number() <= .env$ratio+1) %>% + dplyr::ungroup() %>% + dplyr::select("cohort_definition_id", "subject_id", "cohort_start_date", "cohort_end_date") %>% + CDMConnector::compute_query() %>% + CDMConnector::compute_query(name = name, temporary = FALSE, schema = attr(cdm, "write_schema"), overwrite = TRUE) %>% + CDMConnector::record_cohort_attrition("Exclude individuals that do not fulfil the ratio", cohortId = targetCohortId+n) + } + + + return(cdm) +} + + +checkCohortSetRef <- function(cdm, name, matchSex, matchYearOfBirth, targetCohortId, n){ + cohort_set_ref <- cdm[[name]] %>% + CDMConnector::cohort_set() %>% + dplyr::mutate(target_cohort_name = .env$name) %>% + dplyr::mutate(match_sex = .env$matchSex) %>% + dplyr::mutate(match_year_of_birth = .env$matchYearOfBirth) %>% + dplyr::mutate(match_status = dplyr::if_else(.data$cohort_definition_id %in% .env$targetCohortId, "target", "matched")) %>% + dplyr::mutate(target_cohort_id = dplyr::if_else(.data$cohort_definition_id %in% .env$targetCohortId, .data$cohort_definition_id, .data$cohort_definition_id-n)) + + cdm[[name]] <- CDMConnector::new_generated_cohort_set( + cohort_ref = cdm[[name]], + cohort_attrition_ref = cdm[[name]] %>% CDMConnector::cohort_attrition(), + cohort_set_ref = cohort_set_ref, + overwrite = TRUE) + + return(cdm) +} diff --git a/man/generateMatchedCohortSet.Rd b/man/generateMatchedCohortSet.Rd new file mode 100644 index 00000000..baa9aac7 --- /dev/null +++ b/man/generateMatchedCohortSet.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/generateMatchedCohortSet.R +\name{generateMatchedCohortSet} +\alias{generateMatchedCohortSet} +\title{Generate a new cohort matched cohort from a preexisting target cohort. The +new cohort will contain individuals not included in the target cohort with +same year of birth (matchYearOfBirth = TRUE) and same sex (matchSex = TRUE).} +\usage{ +generateMatchedCohortSet( + cdm, + name, + targetCohortName, + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 1 +) +} +\arguments{ +\item{cdm}{A cdm reference object.} + +\item{name}{Name of the new generated cohort set.} + +\item{targetCohortName}{Name of the target cohort to match.} + +\item{targetCohortId}{Cohort definition id to match from the target cohort. +If NULL all the cohort definition id present in the target cohort will be +matched.} + +\item{matchSex}{Whether to match in sex.} + +\item{matchYearOfBirth}{Whether to match in year of birth.} + +\item{ratio}{Number of allowed matches per individual in the target cohort.} +} +\value{ +A cdm reference object that contains the new generated cohort set. +} +\description{ +Generate a new cohort matched cohort from a preexisting target cohort. The +new cohort will contain individuals not included in the target cohort with +same year of birth (matchYearOfBirth = TRUE) and same sex (matchSex = TRUE). +} diff --git a/tests/testthat/test-generateMatchedCohortSet.R b/tests/testthat/test-generateMatchedCohortSet.R new file mode 100644 index 00000000..a7fa3070 --- /dev/null +++ b/tests/testthat/test-generateMatchedCohortSet.R @@ -0,0 +1,273 @@ +test_that("generateMatchedCohortSet runs without errors", { + # Create cdm object + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 200), + conceptSet = list(asthma = 317009), + name = "cases", + end = "observation_period_end_date", + requiredObservation = c(180, 180), + overwrite = TRUE) + + expect_no_error(a <- generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases", + ratio = 2)) + + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 200), + conceptSet = list(asthma = 317009, other = 4141052, other1 = 432526), + name = "cases", + end = "observation_period_end_date", + requiredObservation = c(10,10), + overwrite = TRUE) + + expect_no_error(generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases")) + + expect_no_error(generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases", + ratio = 3)) + + expect_no_error(generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases", + ratio = Inf)) + + expect_no_error(generateMatchedCohortSet(cdm, + name = "new_cohort", + matchSex = FALSE, + matchYearOfBirth = TRUE, + targetCohortName = "cases")) + + expect_no_error(generateMatchedCohortSet(cdm, + name = "new_cohort", + matchSex = TRUE, + matchYearOfBirth = FALSE, + targetCohortName = "cases")) + + expect_no_error(b <- generateMatchedCohortSet(cdm, + name = "new_cohort", + matchSex = FALSE, + matchYearOfBirth = FALSE, + targetCohortName = "cases")) + + expect_no_error(a <- generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases", + targetCohortId = c(1,2), + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 2)) + +}) + + +test_that("generateMatchedCohortSet, no duplicated people within a cohort", { + followback <- 180 + + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 200), + conceptSet = list(asthma = 317009, other = 432526), + name = "cohort", + end = "observation_period_end_date", + requiredObservation = c(followback,followback), + overwrite = TRUE + ) + + a <- generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cohort", + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 1) + + p1 <- a$new_cohort %>% + dplyr::filter(cohort_definition_id == 1) %>% + dplyr::select(subject_id) %>% + dplyr::pull() %>% + length() + expect_true(length(p1) == length(unique(p1))) + + + a <- generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cohort", + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 5) + p1 <- a$new_cohort %>% + dplyr::filter(cohort_definition_id == 2) %>% + dplyr::select(subject_id) %>% + dplyr::pull() %>% + length() + expect_true(length(p1) == length(unique(p1))) + +}) + +test_that("check that we obtain expected result when ratio is 1", { + followback <- 180 + + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 200), + conceptSet = list(c1 = 317009, c2 = 432526, c3 = 4141052), + name = "cohort", + end = "observation_period_end_date", + requiredObservation = c(followback,followback), + overwrite = TRUE + ) + + # Number of counts for the initial cohorts are the same as in the matched cohorts + matched_cohorts <- generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cohort", + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 1) + expect_true( + length(CDMConnector::cohort_count(matched_cohorts[["new_cohort"]]) %>% + dplyr::select("number_records") %>% + dplyr::pull() %>% + unique()) == 3 + ) + + # Everybody has a matched + n <- matched_cohorts[["new_cohort"]] %>% + dplyr::summarise(n = max(.data$cohort_definition_id)/2) %>% + dplyr::pull() + + cohorts <- matched_cohorts[["new_cohort"]] %>% + dplyr::select("person_id" = "subject_id", "cohort_definition_id") %>% + dplyr::inner_join( + matched_cohorts[["person"]] %>% + dplyr::select("person_id", "gender_concept_id", "year_of_birth"), + by = "person_id" + ) + + + expect_true(is.na(nrow(cohorts %>% + dplyr::filter(.data$cohort_definition_id %in% c(1,2,3)) %>% + dplyr::left_join( + cohorts %>% + dplyr::filter(.data$cohort_definition_id %in% c(4,5,6)) %>% + dplyr::mutate("cohort_definition_id" = .data$cohort_definition_id-n), + by = c("cohort_definition_id", "gender_concept_id", "year_of_birth") + ) %>% + dplyr::filter( + is.na(person_id.y) + )))) +}) + + + +test_that("test exactMatchingCohort works if there are no subjects", { + followback <- 180 + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 200), + conceptSet = list(asthma = 317009), + name = "cases", + end = "observation_period_end_date", + requiredObservation = c(followback,followback), + overwrite = TRUE + ) + cdm$cases <- cdm$cases %>% dplyr::filter(subject_id == 0) + expect_no_error( + generateMatchedCohortSet( + cdm, + name = "new_cohort", + targetCohortName = "cases", + ) + ) +}) + + +test_that("test exactMatchingCohort works if one of the cohorts does not have any people", { + followback <- 180 + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 200), + conceptSet = list(c1 = 317009, c2 = 1), + name = "cases", + end = "observation_period_end_date", + requiredObservation = c(followback,followback), + overwrite = TRUE + ) + + + expect_no_error( + generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases", + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 1) + ) +}) + + + +test_that("test exactMatchingCohort with a ratio bigger than 1", { + followback <- 180 + cdm <- DrugUtilisation::generateConceptCohortSet( + cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 1000), + conceptSet = list(c1 = 317009, c2 = 432526), + name = "cases", + end = "observation_period_end_date", + requiredObservation = c(followback,followback), + overwrite = TRUE + ) + + + + expect_no_error( + a <- generateMatchedCohortSet(cdm, + name = "new_cohort", + targetCohortName = "cases", + targetCohortId = NULL, + matchSex = TRUE, + matchYearOfBirth = TRUE, + ratio = 5) + ) +}) + + +test_that("test exactMatchingCohort with a ratio bigger than 1", { + # Generate mock data + # cdm[["person"]] <- tibble::tibble("person_id" ) + # # Generate mock data + # cdm[["person"]] <- tibble::tibble("person_id" = c(1,2)) + # cdm <- DrugUtilisation::generateConceptCohortSet( + # cdm = DrugUtilisation::mockDrugUtilisation(numberIndividuals = 1000), + # conceptSet = list(c1 = 317009, c2 = 432526), + # name = "cases", + # end = "observation_period_end_date", + # requiredObservation = c(followback,followback), + # overwrite = TRUE + # ) + # + # + # + # expect_no_error( + # a <- generateMatchedCohortSet(cdm, + # name = "new_cohort", + # targetCohortName = "cases", + # targetCohortId = NULL, + # matchSex = TRUE, + # matchYearOfBirth = TRUE, + # ratio = 5) + # ) +}) + +# +# +# a[["new_cohort"]] %>% +# dplyr::inner_join(a[["person"]] %>% +# dplyr::select("subject_id" = "person_id", "gender_concept_id", "year_of_birth"), +# by = "subject_id") %>% +# dplyr::filter(cohort_definition_id %in% c(1,3)) %>% +# dplyr::group_by(gender_concept_id, year_of_birth) %>% +# dplyr::mutate(n = dplyr::row_number()) %>% print(n = 100)