From f2904de79e749616966b8b994029788fcd2f9b6f Mon Sep 17 00:00:00 2001
From: jacobvjk <jacob.kastl@gmail.com>
Date: Tue, 2 Apr 2024 17:48:48 +0200
Subject: [PATCH 1/2] add prepare_abcd script

---
 example.config.yml |   9 ++-
 expected_columns.R |  30 ++++----
 prepare_abcd.R     | 167 +++++++++++++++++++++++++++++++++++++++++++++
 run_matching.R     |  13 ++--
 4 files changed, 197 insertions(+), 22 deletions(-)
 create mode 100644 prepare_abcd.R

diff --git a/example.config.yml b/example.config.yml
index c2936ef3..aac6ef28 100644
--- a/example.config.yml
+++ b/example.config.yml
@@ -9,15 +9,16 @@ default:
     filename_raw: "raw_loanbook_123.csv"
     filename_scenario_tms: "scenarios_2022_tms.csv"
     filename_scenario_sda: "scenarios_2022_sda.csv"
-    filename_abcd: "abcd.csv"
+    filename_abcd: "abcd.xlsx"
+    sheet_abcd: "Company Indicators - PACTA Comp"
   project_parameters:
     scenario_source: "weo_2022"
     scenario_select: "nze_2050"
     region_select: "global"
     # normally the start year should correspond with year of the publication of
     # the scenario in use
-    start_year_select: 2022
-    time_frame_select: 5
+    start_year: 2022
+    time_frame: 5
     # regions must be available for the selected scenario
     benchmark_regions_select: "global,european union"
     remove_inactive_companies: TRUE
@@ -42,6 +43,8 @@ default:
       use_own_sector_classification: FALSE
       dir_own_sector_classification: "path/to/own_sector_classification_folder"
       filename_own_sector_classification: "own_sector_classification.csv"
+  prepare_abcd:
+    remove_inactive_companies: TRUE
 
 
 
diff --git a/expected_columns.R b/expected_columns.R
index daa66362..00ecdb37 100644
--- a/expected_columns.R
+++ b/expected_columns.R
@@ -32,22 +32,22 @@ col_types_scenario_sda <- readr::cols_only(
 col_select_scenario_sda <- names(col_types_scenario_sda[["cols"]])
 
 # expected columns abcd file
-col_types_abcd <- readr::cols_only(
-  company_id = "i",
-  name_company = "c",
-  lei = "c",
-  is_ultimate_owner = "l",
-  sector = "c",
-  technology = "c",
-  plant_location = "c",
-  year = "i",
-  production = "n",
-  production_unit = "c",
-  emission_factor = "n",
-  emission_factor_unit = "c",
-  ald_timestamp = "c"
+cols_abcd <- tibble::tribble(
+  ~col_names_abcd, ~col_types_abcd,
+  "company_id", "numeric",
+  "name_company", "text",
+  "lei", "text",
+  "is_ultimate_owner", "logical",
+  "sector", "text",
+  "technology", "text",
+  "plant_location", "text",
+  "year", "numeric",
+  "production", "numeric",
+  "production_unit", "text",
+  "emission_factor", "numeric",
+  "emission_factor_unit", "text",
+  "ald_timestamp", "text"
 )
-col_select_abcd <- names(col_types_abcd[["cols"]])
 
 # expected columns matched_prioritized_all_groups file
 col_types_matched_prio_all_groups <- readr::cols_only(
diff --git a/prepare_abcd.R b/prepare_abcd.R
new file mode 100644
index 00000000..a4713616
--- /dev/null
+++ b/prepare_abcd.R
@@ -0,0 +1,167 @@
+# set up project and load packages----
+library(dplyr, warn.conflicts = FALSE)
+library(readr)
+library(readxl)
+library(tidyr)
+
+# source helpers----
+source("expected_columns.R")
+
+# load config----
+# read all required config sections first, then derive the values used below
+config_dir <- config::get("directories")
+config_files <- config::get("file_names")
+config_prepare_abcd <- config::get("prepare_abcd")
+config_project_parameters <- config::get("project_parameters")
+
+# location of the ABCD workbook and the name of the sheet to read from it
+path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd)
+sheet_abcd <- config_files$sheet_abcd
+
+# switch for removing inactive companies, and the analysis window parameters
+prepare_abcd_rm_inactive_companies <- config_prepare_abcd$remove_inactive_companies
+project_parameters_start_year <- config_project_parameters$start_year
+project_parameters_time_frame <- config_project_parameters$time_frame
+
+# validate config values----
+# each check stops the script with a message naming the offending value; the
+# class of a value is only inspected once its length check has passed
+if (length(path_abcd) != 1) {
+  stop("Argument path_abcd must be of length 1. Please check your input.")
+} else if (!inherits(path_abcd, "character")) {
+  stop("Argument path_abcd must be of class character. Please check your input.")
+}
+if (length(sheet_abcd) != 1) {
+  stop("Argument sheet_abcd must be of length 1. Please check your input.")
+} else if (!inherits(sheet_abcd, "character")) {
+  stop("Argument sheet_abcd must be of class character. Please check your input.")
+}
+# the inactive-company switch may be NULL; it is only validated when present
+if (!is.null(prepare_abcd_rm_inactive_companies)) {
+  if (length(prepare_abcd_rm_inactive_companies) != 1) {
+    stop("Argument prepare_abcd_rm_inactive_companies must be of length 1. Please check your input.")
+  } else if (!inherits(prepare_abcd_rm_inactive_companies, "logical")) {
+    stop("Argument prepare_abcd_rm_inactive_companies must be of class logical. Please check your input.")
+  }
+}
+# start_year and time_frame must each be a single integer; they are later
+# added together inside rm_inactive_companies() to select the end year
+if (length(project_parameters_start_year) != 1) {
+  stop("Argument project_parameters_start_year must be of length 1. Please check your input.")
+} else if (!inherits(project_parameters_start_year, "integer")) {
+  stop("Argument project_parameters_start_year must be of class integer. Please check your input.")
+}
+if (length(project_parameters_time_frame) != 1) {
+  stop("Argument project_parameters_time_frame must be of length 1. Please check your input.")
+} else if (!inherits(project_parameters_time_frame, "integer")) {
+  stop("Argument project_parameters_time_frame must be of class integer. Please check your input.")
+}
+
+
+# load data----
+if (!file.exists(path_abcd)) {
+  stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!"))
+}
+# readxl applies col_types by position and requires one entry per sheet column,
+# so read the header first and align the expected types by column name
+hdr_abcd <- names(readxl::read_xlsx(path_abcd, sheet = sheet_abcd, n_max = 0))
+if (!all(cols_abcd$col_names_abcd %in% hdr_abcd)) {
+  stop("Columns in abcd do not match expected input names. Please check your input.")
+}
+types_abcd <- cols_abcd$col_types_abcd[match(hdr_abcd, cols_abcd$col_names_abcd)]
+types_abcd[is.na(types_abcd)] <- "skip" # columns not listed in cols_abcd are dropped
+abcd <- readxl::read_xlsx(path_abcd, sheet = sheet_abcd, col_types = types_abcd)
+
+# optional: remove inactive companies----
+
+# remove company-sector combinations where production in t5 = 0 when
+# it was greater than 0 in t0.
+rm_inactive_companies <- function(data,
+                                  start_year,
+                                  time_frame) {
+  data_no_prod_t5 <- data %>%
+    dplyr::filter(
+      year %in% c(.env$start_year, .env$start_year + .env$time_frame)
+    ) %>%
+    dplyr::summarise(
+      sum_production = sum(.data$production, na.rm = TRUE),
+      .by = c("name_company", "sector", "year")
+    ) %>%
+    tidyr::pivot_wider(
+      names_from = "year",
+      names_prefix = "prod_",
+      values_from = "sum_production"
+    ) %>%
+    dplyr::filter(
+      !!rlang::sym(paste0("prod_", start_year)) > 0,
+      !!rlang::sym(paste0("prod_", start_year + time_frame)) == 0
+    )
+
+  comp_sec_no_prod_t5 <- data_no_prod_t5 %>%
+    dplyr::distinct(
+      .data$name_company,
+      .data$sector
+    )
+
+  data_no_prod_t0_to_t5 <- data %>%
+    dplyr::filter(
+      year %in% c(.env$start_year, .env$start_year + .env$time_frame)
+    ) %>%
+    dplyr::summarise(
+      sum_production = sum(.data$production, na.rm = TRUE),
+      .by = c("name_company", "sector")
+    ) %>%
+    dplyr::filter(
+      .data$sum_production == 0
+    )
+
+  comp_sec_no_prod_t0_to_t5 <- data_no_prod_t0_to_t5 %>%
+    dplyr::distinct(
+      .data$name_company,
+      .data$sector
+    )
+
+  data <- data %>%
+    dplyr::anti_join(
+      comp_sec_no_prod_t5,
+      by = c("name_company", "sector")
+    ) %>%
+    dplyr::anti_join(
+      comp_sec_no_prod_t0_to_t5,
+      by = c("name_company", "sector")
+    )
+
+  return(data)
+}
+
+# NULL (option absent from config) passes validation, so treat it like FALSE
+if (isTRUE(prepare_abcd_rm_inactive_companies)) {
+  abcd_keep <- abcd %>%
+    rm_inactive_companies(
+      start_year = project_parameters_start_year,
+      time_frame = project_parameters_time_frame
+    )
+  # complement of abcd_keep; same keys as used inside rm_inactive_companies()
+  abcd_removed <- abcd %>%
+    dplyr::anti_join(
+      abcd_keep,
+      by = c("name_company", "sector")
+    )
+
+  # write removed inactive companies to file for inspection
+  abcd_removed %>%
+    readr::write_csv(
+      file.path(config_dir$dir_abcd, "abcd_removed_inactive_companies.csv"),
+      na = ""
+    )
+
+  abcd <- abcd_keep
+  rm(abcd_keep)
+}
+
+# write the final version of abcd to file for use in the PACTA analysis
+readr::write_csv(
+  x = abcd,
+  file = file.path(config_dir$dir_abcd, "abcd_final.csv"),
+  na = ""
+)
diff --git a/run_matching.R b/run_matching.R
index eb7d62b3..baa52775 100644
--- a/run_matching.R
+++ b/run_matching.R
@@ -3,6 +3,7 @@ library(dplyr, warn.conflicts = FALSE)
 library(r2dii.data)
 library(r2dii.match)
 library(readr)
+library(readxl)
 library(withr)
 
 # source helpers----
@@ -15,6 +16,7 @@ config_files <- config::get("file_names")
 
 dir_raw <- config_dir$dir_raw
 path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd)
+sheet_abcd <- config_files$sheet_abcd
 dir_matched <- config_dir$dir_matched
 
 config_matching <- config::get("matching")
@@ -113,11 +115,14 @@ if (!file.exists(path_abcd)) {
   stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!"))
 }
 
-abcd <- readr::read_csv(
-  file.path(path_abcd),
-  col_types = col_types_abcd,
-  col_select = dplyr::all_of(col_select_abcd)
+# readxl applies col_types by position: align expected types with the header
+hdr_abcd <- names(readxl::read_xlsx(path_abcd, sheet = sheet_abcd, n_max = 0))
+if (!all(cols_abcd$col_names_abcd %in% hdr_abcd)) {
+  stop("Columns in abcd do not match expected input names. Please check your input.")
+}
+types_abcd <- cols_abcd$col_types_abcd[match(hdr_abcd, cols_abcd$col_names_abcd)]
+types_abcd[is.na(types_abcd)] <- "skip" # columns not in cols_abcd are dropped
+abcd <- readxl::read_xlsx(path_abcd, sheet = sheet_abcd, col_types = types_abcd)
 
 ## optionally load own classification system----
 if (matching_use_own_sector_classification) {

From 904d28f06c1f11ce09eb3a9b826287f9f35f5881 Mon Sep 17 00:00:00 2001
From: jacobvjk <jacob.kastl@gmail.com>
Date: Wed, 3 Apr 2024 09:44:45 +0200
Subject: [PATCH 2/2] simplify

---
 prepare_abcd.R | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/prepare_abcd.R b/prepare_abcd.R
index a4713616..79aa0d20 100644
--- a/prepare_abcd.R
+++ b/prepare_abcd.R
@@ -74,12 +74,14 @@ if (!all(cols_abcd$col_names_abcd %in% names(abcd))) {
 
 # optional: remove inactive companies----
 
-# remove company-sector combinations where production in t5 = 0 when
+# (1) remove company-sector combinations where production in t5 = 0 when
 # it was greater than 0 in t0.
+# (2) remove company-sector combinations where production summed over t0 and t5
+# is 0 (only these two years are checked, not the years in between).
 rm_inactive_companies <- function(data,
                                   start_year,
                                   time_frame) {
-  data_no_prod_t5 <- data %>%
+  comp_sec_no_prod_t5 <- data %>%
     dplyr::filter(
       year %in% c(.env$start_year, .env$start_year + .env$time_frame)
     ) %>%
@@ -95,15 +97,13 @@ rm_inactive_companies <- function(data,
     dplyr::filter(
       !!rlang::sym(paste0("prod_", start_year)) > 0,
       !!rlang::sym(paste0("prod_", start_year + time_frame)) == 0
-    )
-
-  comp_sec_no_prod_t5 <- data_no_prod_t5 %>%
+    ) %>%
     dplyr::distinct(
       .data$name_company,
       .data$sector
     )
 
-  data_no_prod_t0_to_t5 <- data %>%
+  comp_sec_no_prod_t0_to_t5 <- data %>%
     dplyr::filter(
       year %in% c(.env$start_year, .env$start_year + .env$time_frame)
     ) %>%
@@ -113,9 +113,7 @@ rm_inactive_companies <- function(data,
     ) %>%
     dplyr::filter(
       .data$sum_production == 0
-    )
-
-  comp_sec_no_prod_t0_to_t5 <- data_no_prod_t0_to_t5 %>%
+    ) %>%
     dplyr::distinct(
       .data$name_company,
       .data$sector