Merge pull request #8 from RMI-PACTA/7-prepare-abcd

add prepare_abcd script
RMI-PACTA · Apr 3, 2024 · 7a601dd · 7a601dd
2 parents 87162d9 + 904d28f
commit 7a601dd
Show file tree

Hide file tree

Showing 4 changed files with 195 additions and 22 deletions.
diff --git a/example.config.yml b/example.config.yml
@@ -9,15 +9,16 @@ default:
     filename_raw: "raw_loanbook_123.csv"
     filename_scenario_tms: "scenarios_2022_tms.csv"
     filename_scenario_sda: "scenarios_2022_sda.csv"
-    filename_abcd: "abcd.csv"
+    filename_abcd: "abcd.xlsx"
+    sheet_abcd: "Company Indicators - PACTA Comp"
   project_parameters:
     scenario_source: "weo_2022"
     scenario_select: "nze_2050"
     region_select: "global"
     # normally the start year should correspond with year of the publication of
     # the scenario in use
-    start_year_select: 2022
-    time_frame_select: 5
+    start_year: 2022
+    time_frame: 5
     # regions must be available for the selected scenario
     benchmark_regions_select: "global,european union"
     remove_inactive_companies: TRUE
@@ -44,3 +45,5 @@ default:
       filename_own_sector_classification: "own_sector_classification.csv"
   match_prioritize:
     priority: NULL
+  prepare_abcd:
+    remove_inactive_companies: TRUE
diff --git a/expected_columns.R b/expected_columns.R
@@ -32,22 +32,22 @@ col_types_scenario_sda <- readr::cols_only(
 col_select_scenario_sda <- names(col_types_scenario_sda[["cols"]])
 
 # expected columns abcd file
-col_types_abcd <- readr::cols_only(
-  company_id = "i",
-  name_company = "c",
-  lei = "c",
-  is_ultimate_owner = "l",
-  sector = "c",
-  technology = "c",
-  plant_location = "c",
-  year = "i",
-  production = "n",
-  production_unit = "c",
-  emission_factor = "n",
-  emission_factor_unit = "c",
-  ald_timestamp = "c"
+cols_abcd <- tibble::tribble(
+  ~col_names_abcd, ~col_types_abcd,
+  "company_id", "numeric",
+  "name_company", "text",
+  "lei", "text",
+  "is_ultimate_owner", "logical",
+  "sector", "text",
+  "technology", "text",
+  "plant_location", "text",
+  "year", "numeric",
+  "production", "numeric",
+  "production_unit", "text",
+  "emission_factor", "numeric",
+  "emission_factor_unit", "text",
+  "ald_timestamp", "text"
 )
-col_select_abcd <- names(col_types_abcd[["cols"]])
 
 # expected columns matched_prioritized_all_groups file
 col_types_matched_prio_all_groups <- readr::cols_only(

diff --git a/prepare_abcd.R b/prepare_abcd.R
@@ -0,0 +1,165 @@
+# set up project and load packages----
+library(dplyr, warn.conflicts = FALSE)
+library(readr)
+library(readxl)
+library(tidyr)
+
+# source helpers----
+source("expected_columns.R")
+
+# load config----
+# read all config sections used by this script up front
+config_dir <- config::get("directories")
+config_files <- config::get("file_names")
+config_prepare_abcd <- config::get("prepare_abcd")
+config_project_parameters <- config::get("project_parameters")
+
+# ABCD input: sheet to read and full path of the workbook
+sheet_abcd <- config_files$sheet_abcd
+path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd)
+
+# switch controlling whether inactive companies are removed
+prepare_abcd_rm_inactive_companies <- config_prepare_abcd$remove_inactive_companies
+
+# time frame of the analysis
+project_parameters_time_frame <- config_project_parameters$time_frame
+project_parameters_start_year <- config_project_parameters$start_year
+
+# validate config values----
+
+# Stop with an informative error unless `value` has length 1 and inherits from
+# `class_name`.
+#
+# Arguments:
+#   value:      the config value to check.
+#   arg_name:   name of the variable, used in the error message.
+#   class_name: class that `value` must inherit from.
+#
+# Returns `value` invisibly when both checks pass.
+validate_config_value <- function(value, arg_name, class_name) {
+  if (length(value) != 1) {
+    stop(
+      "Argument ", arg_name, " must be of length 1. Please check your input.",
+      call. = FALSE
+    )
+  }
+  if (!inherits(value, class_name)) {
+    stop(
+      "Argument ", arg_name, " must be of class ", class_name,
+      ". Please check your input.",
+      call. = FALSE
+    )
+  }
+  invisible(value)
+}
+
+validate_config_value(path_abcd, "path_abcd", "character")
+validate_config_value(sheet_abcd, "sheet_abcd", "character")
+# remove_inactive_companies is optional: only validate it when it is set
+if (!is.null(prepare_abcd_rm_inactive_companies)) {
+  validate_config_value(
+    prepare_abcd_rm_inactive_companies,
+    "prepare_abcd_rm_inactive_companies",
+    "logical"
+  )
+}
+validate_config_value(
+  project_parameters_start_year,
+  "project_parameters_start_year",
+  "integer"
+)
+validate_config_value(
+  project_parameters_time_frame,
+  "project_parameters_time_frame",
+  "integer"
+)
+
+
+# load data----
+# fail early with a clear message when the configured ABCD file is missing
+abcd_file_exists <- file.exists(path_abcd)
+if (!abcd_file_exists) {
+  stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!"))
+}
+
+# read the configured sheet using the column types listed in cols_abcd
+abcd <- readxl::read_xlsx(
+  path = file.path(path_abcd),
+  sheet = sheet_abcd,
+  col_types = cols_abcd$col_types_abcd
+)
+
+# every expected column name must be present in the data that was read
+if (length(setdiff(cols_abcd$col_names_abcd, names(abcd))) > 0) {
+  stop("Columns in abcd do not match expected input names. Please check your input.")
+}
+
+# optional: remove inactive companies----
+
+# Remove inactive company-sector combinations from `data`.
+#
+# A company-sector combination is dropped when either of these holds:
+# (1) summed production is greater than 0 in t0 (`start_year`) but equal to 0
+#     in t5 (`start_year + time_frame`).
+# (2) summed production is 0 for the entire time frame from t0 to t5.
+#
+# Arguments:
+#   data:       data frame with at least the columns `name_company`, `sector`,
+#               `year` and `production`.
+#   start_year: first year (t0) of the time frame.
+#   time_frame: number of years between t0 and t5.
+#
+# Returns `data` without the rows of the inactive company-sector combinations.
+rm_inactive_companies <- function(data,
+                                  start_year,
+                                  time_frame) {
+  end_year <- start_year + time_frame
+
+  # summed production per company-sector in the first and in the last year
+  prod_t0_t5 <- data %>%
+    dplyr::filter(
+      .data$year %in% c(.env$start_year, .env$end_year)
+    ) %>%
+    dplyr::summarise(
+      sum_production = sum(.data$production, na.rm = TRUE),
+      .by = c("name_company", "sector", "year")
+    )
+
+  prod_zero_t5 <- prod_t0_t5 %>%
+    dplyr::filter(
+      .data$year == .env$end_year,
+      .data$sum_production == 0
+    )
+
+  # (1) matching the t0 and t5 subsets on company-sector, rather than pivoting
+  # to one column per year, also works when `data` has no rows for t0 or t5
+  comp_sec_no_prod_t5 <- prod_t0_t5 %>%
+    dplyr::filter(
+      .data$year == .env$start_year,
+      .data$sum_production > 0
+    ) %>%
+    dplyr::distinct(
+      .data$name_company,
+      .data$sector
+    ) %>%
+    dplyr::semi_join(
+      prod_zero_t5,
+      by = c("name_company", "sector")
+    )
+
+  # (2) sum over every year from t0 to t5, not only the two end points, so a
+  # company that produces in between is not treated as inactive
+  comp_sec_no_prod_t0_to_t5 <- data %>%
+    dplyr::filter(
+      .data$year >= .env$start_year,
+      .data$year <= .env$end_year
+    ) %>%
+    dplyr::summarise(
+      sum_production = sum(.data$production, na.rm = TRUE),
+      .by = c("name_company", "sector")
+    ) %>%
+    dplyr::filter(
+      .data$sum_production == 0
+    ) %>%
+    dplyr::distinct(
+      .data$name_company,
+      .data$sector
+    )
+
+  data %>%
+    dplyr::anti_join(
+      comp_sec_no_prod_t5,
+      by = c("name_company", "sector")
+    ) %>%
+    dplyr::anti_join(
+      comp_sec_no_prod_t0_to_t5,
+      by = c("name_company", "sector")
+    )
+}
+
+# remove_inactive_companies is optional in the config: isTRUE() treats a
+# missing (NULL) value as "do not remove" instead of failing inside if ()
+if (isTRUE(prepare_abcd_rm_inactive_companies)) {
+  abcd_keep <- abcd %>%
+    rm_inactive_companies(
+      start_year = project_parameters_start_year,
+      time_frame = project_parameters_time_frame
+    )
+
+  # identify the removed rows with the same keys rm_inactive_companies() uses,
+  # so that abcd_removed and abcd_keep together always make up abcd
+  abcd_removed <- abcd %>%
+    dplyr::anti_join(
+      abcd_keep,
+      by = c("name_company", "sector")
+    )
+
+  # write removed inactive companies to file for inspection
+  abcd_removed %>%
+    readr::write_csv(
+      file.path(config_dir$dir_abcd, "abcd_removed_inactive_companies.csv"),
+      na = ""
+    )
+
+  abcd <- abcd_keep
+
+  rm(abcd_keep)
+}
+
+# write the final version of abcd to "abcd_final.csv" in the ABCD directory
+# for use in the PACTA analysis; missing values are written as empty strings
+abcd %>%
+  readr::write_csv(
+    file.path(config_dir$dir_abcd, "abcd_final.csv"),
+    na = ""
+  )
diff --git a/run_matching.R b/run_matching.R
@@ -3,6 +3,7 @@ library(dplyr, warn.conflicts = FALSE)
 library(r2dii.data)
 library(r2dii.match)
 library(readr)
+library(readxl)
 library(withr)
 
 # source helpers----
@@ -15,6 +16,7 @@ config_files <- config::get("file_names")
 
 dir_raw <- config_dir$dir_raw
 path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd)
+sheet_abcd <- config_files$sheet_abcd
 dir_matched <- config_dir$dir_matched
 
 config_matching <- config::get("matching")
@@ -113,11 +115,14 @@ if (!file.exists(path_abcd)) {
   stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!"))
 }
 
-abcd <- readr::read_csv(
-  file.path(path_abcd),
-  col_types = col_types_abcd,
-  col_select = dplyr::all_of(col_select_abcd)
+abcd <- readxl::read_xlsx(
+  path = file.path(path_abcd),
+  sheet = sheet_abcd,
+  col_types = cols_abcd$col_types_abcd
 )
+if (!all(cols_abcd$col_names_abcd %in% names(abcd))) {
+  stop("Columns in abcd do not match expected input names. Please check your input.")
+}
 
 ## optionally load own classification system----
 if (matching_use_own_sector_classification) {