From f2904de79e749616966b8b994029788fcd2f9b6f Mon Sep 17 00:00:00 2001 From: jacobvjk Date: Tue, 2 Apr 2024 17:48:48 +0200 Subject: [PATCH 1/2] add prepare_abcd script --- example.config.yml | 9 ++- expected_columns.R | 30 ++++---- prepare_abcd.R | 167 +++++++++++++++++++++++++++++++++++++++++++++ run_matching.R | 13 ++-- 4 files changed, 197 insertions(+), 22 deletions(-) create mode 100644 prepare_abcd.R diff --git a/example.config.yml b/example.config.yml index c2936ef3..aac6ef28 100644 --- a/example.config.yml +++ b/example.config.yml @@ -9,15 +9,16 @@ default: filename_raw: "raw_loanbook_123.csv" filename_scenario_tms: "scenarios_2022_tms.csv" filename_scenario_sda: "scenarios_2022_sda.csv" - filename_abcd: "abcd.csv" + filename_abcd: "abcd.xlsx" + sheet_abcd: "Company Indicators - PACTA Comp" project_parameters: scenario_source: "weo_2022" scenario_select: "nze_2050" region_select: "global" # normally the start year should correspond with year of the publication of # the scenario in use - start_year_select: 2022 - time_frame_select: 5 + start_year: 2022 + time_frame: 5 # regions must be available for the selected scenario benchmark_regions_select: "global,european union" remove_inactive_companies: TRUE @@ -42,6 +43,8 @@ default: use_own_sector_classification: FALSE dir_own_sector_classification: "path/to/own_sector_classification_folder" filename_own_sector_classification: "own_sector_classification.csv" + prepare_abcd: + remove_inactive_companies: TRUE diff --git a/expected_columns.R b/expected_columns.R index daa66362..00ecdb37 100644 --- a/expected_columns.R +++ b/expected_columns.R @@ -32,22 +32,22 @@ col_types_scenario_sda <- readr::cols_only( col_select_scenario_sda <- names(col_types_scenario_sda[["cols"]]) # expected columns abcd file -col_types_abcd <- readr::cols_only( - company_id = "i", - name_company = "c", - lei = "c", - is_ultimate_owner = "l", - sector = "c", - technology = "c", - plant_location = "c", - year = "i", - production = "n", 
production_unit = "c", - emission_factor = "n", - emission_factor_unit = "c", - ald_timestamp = "c" +cols_abcd <- tibble::tribble( + ~col_names_abcd, ~col_types_abcd, + "company_id", "numeric", + "name_company", "text", + "lei", "text", + "is_ultimate_owner", "logical", + "sector", "text", + "technology", "text", + "plant_location", "text", + "year", "numeric", + "production", "numeric", + "production_unit", "text", + "emission_factor", "numeric", + "emission_factor_unit", "text", + "ald_timestamp", "text" ) -col_select_abcd <- names(col_types_abcd[["cols"]]) # expected columns matched_prioritized_all_groups file col_types_matched_prio_all_groups <- readr::cols_only( diff --git a/prepare_abcd.R b/prepare_abcd.R new file mode 100644 index 00000000..a4713616 --- /dev/null +++ b/prepare_abcd.R @@ -0,0 +1,167 @@ +# set up project and load packages---- +library(dplyr, warn.conflicts = FALSE) +library(readr) +library(readxl) +library(tidyr) + +# source helpers---- +source("expected_columns.R") + +# load config---- +config_dir <- config::get("directories") +config_files <- config::get("file_names") + +path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd) +sheet_abcd <- config_files$sheet_abcd + +config_prepare_abcd <- config::get("prepare_abcd") + +prepare_abcd_rm_inactive_companies <- config_prepare_abcd$remove_inactive_companies + +config_project_parameters <- config::get("project_parameters") + +project_parameters_start_year <- config_project_parameters$start_year +project_parameters_time_frame <- config_project_parameters$time_frame + +# validate config values---- +if (!length(path_abcd) == 1) { + stop("Argument path_abcd must be of length 1. Please check your input.") +} +if (!inherits(path_abcd, "character")) { + stop("Argument path_abcd must be of class character. Please check your input.") +} +if (!length(sheet_abcd) == 1) { + stop("Argument sheet_abcd must be of length 1. 
Please check your input.") +} +if (!inherits(sheet_abcd, "character")) { + stop("Argument sheet_abcd must be of class character. Please check your input.") +} +if (!is.null(prepare_abcd_rm_inactive_companies)) { + if (!length(prepare_abcd_rm_inactive_companies) == 1) { + stop("Argument prepare_abcd_rm_inactive_companies must be of length 1. Please check your input.") + } + if (!inherits(prepare_abcd_rm_inactive_companies, "logical")) { + stop("Argument prepare_abcd_rm_inactive_companies must be of class logical. Please check your input.") + } +} +if (!length(project_parameters_start_year) == 1) { + stop("Argument project_parameters_start_year must be of length 1. Please check your input.") +} +if (!inherits(project_parameters_start_year, "integer")) { + stop("Argument project_parameters_start_year must be of class integer. Please check your input.") +} +if (!length(project_parameters_time_frame) == 1) { + stop("Argument project_parameters_time_frame must be of length 1. Please check your input.") +} +if (!inherits(project_parameters_time_frame, "integer")) { + stop("Argument project_parameters_time_frame must be of class integer. Please check your input.") +} + + +# load data---- +if (!file.exists(path_abcd)) { + stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!")) +} + +abcd <- readxl::read_xlsx( + path = file.path(path_abcd), + sheet = sheet_abcd, + col_types = cols_abcd$col_types_abcd +) +if (!all(cols_abcd$col_names_abcd %in% names(abcd))) { + stop("Columns in abcd do not match expected input names. Please check your input.") +} + +# optional: remove inactive companies---- + +# remove company-sector combinations where production in t5 = 0 when +# it was greater than 0 in t0. 
+rm_inactive_companies <- function(data, + start_year, + time_frame) { + data_no_prod_t5 <- data %>% + dplyr::filter( + year %in% c(.env$start_year, .env$start_year + .env$time_frame) + ) %>% + dplyr::summarise( + sum_production = sum(.data$production, na.rm = TRUE), + .by = c("name_company", "sector", "year") + ) %>% + tidyr::pivot_wider( + names_from = "year", + names_prefix = "prod_", + values_from = "sum_production" + ) %>% + dplyr::filter( + !!rlang::sym(paste0("prod_", start_year)) > 0, + !!rlang::sym(paste0("prod_", start_year + time_frame)) == 0 + ) + + comp_sec_no_prod_t5 <- data_no_prod_t5 %>% + dplyr::distinct( + .data$name_company, + .data$sector + ) + + data_no_prod_t0_to_t5 <- data %>% + dplyr::filter( + year %in% c(.env$start_year, .env$start_year + .env$time_frame) + ) %>% + dplyr::summarise( + sum_production = sum(.data$production, na.rm = TRUE), + .by = c("name_company", "sector") + ) %>% + dplyr::filter( + .data$sum_production == 0 + ) + + comp_sec_no_prod_t0_to_t5 <- data_no_prod_t0_to_t5 %>% + dplyr::distinct( + .data$name_company, + .data$sector + ) + + data <- data %>% + dplyr::anti_join( + comp_sec_no_prod_t5, + by = c("name_company", "sector") + ) %>% + dplyr::anti_join( + comp_sec_no_prod_t0_to_t5, + by = c("name_company", "sector") + ) + + return(data) +} +# isTRUE(): an unset (NULL) or NA flag skips this step instead of erroring +if (isTRUE(prepare_abcd_rm_inactive_companies)) { + abcd_keep <- abcd %>% + rm_inactive_companies( + start_year = project_parameters_start_year, + time_frame = project_parameters_time_frame + ) + + abcd_removed <- abcd %>% + dplyr::anti_join( + abcd_keep, + by = c("name_company", "sector") + ) + + # write removed rows to file for inspection (keys match rm_inactive_companies) + abcd_removed %>% + readr::write_csv( + file.path(config_dir$dir_abcd, "abcd_removed_inactive_companies.csv"), + na = "" + ) + + abcd <- abcd_keep + + rm(abcd_keep) +} + +# write final version of abcd to file for use in the PACTA analysis +abcd %>% + readr::write_csv( + file.path(config_dir$dir_abcd, "abcd_final.csv"), + na = "" + ) diff --git 
a/run_matching.R b/run_matching.R index eb7d62b3..baa52775 100644 --- a/run_matching.R +++ b/run_matching.R @@ -3,6 +3,7 @@ library(dplyr, warn.conflicts = FALSE) library(r2dii.data) library(r2dii.match) library(readr) +library(readxl) library(withr) # source helpers---- @@ -15,6 +16,7 @@ config_files <- config::get("file_names") dir_raw <- config_dir$dir_raw path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd) +sheet_abcd <- config_files$sheet_abcd dir_matched <- config_dir$dir_matched config_matching <- config::get("matching") @@ -113,11 +115,14 @@ if (!file.exists(path_abcd)) { stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!")) } -abcd <- readr::read_csv( - file.path(path_abcd), - col_types = col_types_abcd, - col_select = dplyr::all_of(col_select_abcd) +abcd <- readxl::read_xlsx( + path = file.path(path_abcd), + sheet = sheet_abcd, + col_types = cols_abcd$col_types_abcd ) +if (!all(cols_abcd$col_names_abcd %in% names(abcd))) { + stop("Columns in abcd do not match expected input names. Please check your input.") +} ## optionally load own classification system---- if (matching_use_own_sector_classification) { From 904d28f06c1f11ce09eb3a9b826287f9f35f5881 Mon Sep 17 00:00:00 2001 From: jacobvjk Date: Wed, 3 Apr 2024 09:44:45 +0200 Subject: [PATCH 2/2] simplify --- prepare_abcd.R | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/prepare_abcd.R b/prepare_abcd.R index a4713616..79aa0d20 100644 --- a/prepare_abcd.R +++ b/prepare_abcd.R @@ -74,12 +74,14 @@ if (!all(cols_abcd$col_names_abcd %in% names(abcd))) { # optional: remove inactive companies---- -# remove company-sector combinations where production in t5 = 0 when +# (1) remove company-sector combinations where production in t5 = 0 when # it was greater than 0 in t0. +# (2) remove company-sector combinations where production summed over t0 and +# t5 is 0. NOTE(review): years between t0 and t5 are not checked - confirm. 
rm_inactive_companies <- function(data, start_year, time_frame) { - data_no_prod_t5 <- data %>% + comp_sec_no_prod_t5 <- data %>% dplyr::filter( year %in% c(.env$start_year, .env$start_year + .env$time_frame) ) %>% @@ -95,15 +97,13 @@ rm_inactive_companies <- function(data, dplyr::filter( !!rlang::sym(paste0("prod_", start_year)) > 0, !!rlang::sym(paste0("prod_", start_year + time_frame)) == 0 - ) - - comp_sec_no_prod_t5 <- data_no_prod_t5 %>% + ) %>% dplyr::distinct( .data$name_company, .data$sector ) - data_no_prod_t0_to_t5 <- data %>% + comp_sec_no_prod_t0_to_t5 <- data %>% dplyr::filter( year %in% c(.env$start_year, .env$start_year + .env$time_frame) ) %>% @@ -113,9 +113,7 @@ rm_inactive_companies <- function(data, ) %>% dplyr::filter( .data$sum_production == 0 - ) - - comp_sec_no_prod_t0_to_t5 <- data_no_prod_t0_to_t5 %>% + ) %>% dplyr::distinct( .data$name_company, .data$sector