From cfe0bb1eac7939fae7db67cac47bac888c8f0e4f Mon Sep 17 00:00:00 2001
From: Gowtham Rao <rao@ohdsi.org>
Date: Wed, 6 Dec 2023 06:28:33 -0500
Subject: [PATCH] U

---
 R/GetConceptRecordCount.R    | 67 +++++++++++++++++++++++++++---------
 man/getConceptRecordCount.Rd | 11 ++++++
 2 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/R/GetConceptRecordCount.R b/R/GetConceptRecordCount.R
index ff2e192..74bb315 100644
--- a/R/GetConceptRecordCount.R
+++ b/R/GetConceptRecordCount.R
@@ -34,6 +34,14 @@
 #'
 #' @param domain                      domains to look for concept id
 #'
+#' @param limitToCohort               Do you wantt to limit to a cohort_definition_id?
+#'
+#' @param cohortDatabaseSchema        Optional
+#'
+#' @param cohortTableName             Optional
+#'
+#' @param cohortDefinitionId          Optional
+#'
 #' @return
 #' Returns a tibble data frame.
 #'
@@ -46,6 +54,9 @@ getConceptRecordCount <- function(conceptIds = NULL,
                                   vocabularyDatabaseSchema = cdmDatabaseSchema,
                                   tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
                                   minCellCount = 0,
+                                  cohortDatabaseSchema = NULL,
+                                  cohortTableName = NULL,
+                                  cohortDefinitionId = NULL,
                                   domainTableName = c(
                                     "drug_exposure",
                                     "condition_occurrence",
@@ -77,6 +88,15 @@ getConceptRecordCount <- function(conceptIds = NULL,
     dplyr::filter(eraTable == FALSE)
   # filtering out ERA tables because they are supposed to be derived tables, and counting them is double counting
   
+  limitToCohort <- FALSE
+  if (all(
+    !is.null(cohortDatabaseSchema),
+    !is.null(cohortTableName),
+    !is.null(cohortDefinitionId)
+  )) {
+    limitToCohort <- TRUE
+  }
+  
   # REASON for many SQL --DISTINCT subject_count cannot be computed from aggregation query of calendar month level data
   sql <- "
 
@@ -95,6 +115,7 @@ getConceptRecordCount <- function(conceptIds = NULL,
                 ON u.concept_id = c.concept_id
           ;
           }
+
           --PERCENTILES are difficult and will need subqueries
           SELECT
               {@include_concept_id} ? {@domain_concept_id} : {0} concept_id,
@@ -111,20 +132,20 @@ getConceptRecordCount <- function(conceptIds = NULL,
           		MIN(@domain_start_date) min_date,
           		MAX(@domain_start_date) max_date,
           		COUNT(DISTINCT @domain_start_date) unique_dates,
-          		AVG(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)) prior_obs_avg,
+          		ROUND(AVG(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)), 2) prior_obs_avg,
           		MIN(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)) prior_obs_min,
           		MAX(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)) prior_obs_max,
-          		STDDEV(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)) prior_obs_std,
+          		ROUND(STDDEV(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)), 2) prior_obs_std,
           		SUM(DATEDIFF(day, op.observation_period_start_date, @domain_start_date)) prior_obs_sum,
-          		AVG(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)) post_obs_avg,
+          		ROUND(AVG(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)), 2) post_obs_avg,
           		MIN(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)) post_obs_min,
           		MAX(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)) post_obs_max,
-          		STDDEV(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)) post_obs_std,
-          		SUM(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)) sum_obs_median,
-          		AVG(DATEPART(yy, @domain_start_date) - year_of_birth) age_avg,
+          		ROUND(STDDEV(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)), 2) post_obs_std,
+          		SUM(DATEDIFF(day, @domain_start_date, op.observation_period_end_date)) post_obs_sum,
+          		ROUND(AVG(DATEPART(yy, @domain_start_date) - year_of_birth), 2) age_avg,
           		MIN(DATEPART(yy, @domain_start_date) - year_of_birth) age_min,
           		MAX(DATEPART(yy, @domain_start_date) - year_of_birth) age_max,
-          		STDDEV(DATEPART(yy, @domain_start_date) - year_of_birth) age_std,
+          		ROUND(STDDEV(DATEPART(yy, @domain_start_date) - year_of_birth), 2) age_std,
           		SUM(DATEPART(yy, @domain_start_date) - year_of_birth) age_sum
           INTO #concept_count_table
           FROM @cdm_database_schema.@domain_table dt
@@ -135,6 +156,15 @@ getConceptRecordCount <- function(conceptIds = NULL,
           INNER JOIN @cdm_database_schema.person p
           ON dt.person_id = p.person_id
 
+          {@limit_to_cohort} ? {
+            INNER JOIN
+            (
+              SELECT subject_id, cohort_start_date, cohort_end_date
+              FROM @cohort_database_schema.@cohort_table_name
+              WHERE cohort_definition_id = @cohort_definition_id
+            )
+          }
+
             {@incidence} ? {
             -- limit to first occurrence of concept id by person_id
             INNER JOIN
@@ -203,7 +233,7 @@ getConceptRecordCount <- function(conceptIds = NULL,
     tidyr::crossing(dplyr::tibble(incidence = c("Y", "N")))
   
   existingOutput <- c()
-  
+
   for (i in (1:nrow(iterations))) {
     rowData <- iterations[i, ]
     
@@ -254,7 +284,11 @@ getConceptRecordCount <- function(conceptIds = NULL,
       use_date_month = (rowData$useDateMonth == 'Y'),
       domain_table_short = rowData$domainTableShort,
       domain_field_short = rowData$domainFieldShort,
-      calendar_type = rowData$calendarType
+      calendar_type = rowData$calendarType,
+      limit_to_cohort = limitToCohort,
+      cohort_database_schema = cohortDatabaseSchema,
+      cohort_table_name = cohortTableName,
+      cohort_definition_id = cohortDefinitionId
     )
     
     # Regular expression to find a comma followed by any whitespace (including line breaks) and a semicolon
@@ -300,15 +334,16 @@ getConceptRecordCount <- function(conceptIds = NULL,
   
   existingOutput <- existingOutput |>
     dplyr::inner_join(
-      domainInformation |>
+      domainInformation$long |>
         dplyr::select(
-          domainTableShort,
-          domainFieldShort,
-          domainTable,
-          domainField
-        )
+          "domainTableShort",
+          "domainFieldShort",
+          "domainTable",
+          "domainField"
+        ),
+      by = c("domainTableShort", "domainFieldShort")
     ) |>
-    dplyr::select(-domainFieldShort,-domainTableShort)
+    dplyr::select(-"domainFieldShort", -"domainTableShort")
   
   if (!is.null(minCellCount)) {
     existingOutput <-
diff --git a/man/getConceptRecordCount.Rd b/man/getConceptRecordCount.Rd
index f8312bf..25a6372 100644
--- a/man/getConceptRecordCount.Rd
+++ b/man/getConceptRecordCount.Rd
@@ -12,6 +12,9 @@ getConceptRecordCount(
   vocabularyDatabaseSchema = cdmDatabaseSchema,
   tempEmulationSchema = getOption("sqlRenderTempEmulationSchema"),
   minCellCount = 0,
+  cohortDatabaseSchema = NULL,
+  cohortTableName = NULL,
+  cohortDefinitionId = NULL,
   domainTableName = c("drug_exposure", "condition_occurrence", "procedure_occurrence",
     "mesaurement", "observation")
 )
@@ -41,7 +44,15 @@ tables, provide a schema with write privileges where temp tables can be created.
 
 \item{minCellCount}{The minimum cell count for fields containing person/subject count.}
 
+\item{cohortDatabaseSchema}{Optional}
+
+\item{cohortTableName}{Optional}
+
+\item{cohortDefinitionId}{Optional}
+
 \item{domain}{domains to look for concept id}
+
+\item{limitToCohort}{Do you wantt to limit to a cohort_definition_id?}
 }
 \value{
 Returns a tibble data frame.