Skip to content

Commit

Permalink
Merge pull request #8 from RMI-PACTA/7-prepare-abcd
Browse files Browse the repository at this point in the history
add prepare_abcd script
  • Loading branch information
jacobvjk authored Apr 3, 2024
2 parents 87162d9 + 904d28f commit 7a601dd
Show file tree
Hide file tree
Showing 4 changed files with 195 additions and 22 deletions.
9 changes: 6 additions & 3 deletions example.config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@ default:
filename_raw: "raw_loanbook_123.csv"
filename_scenario_tms: "scenarios_2022_tms.csv"
filename_scenario_sda: "scenarios_2022_sda.csv"
filename_abcd: "abcd.csv"
filename_abcd: "abcd.xlsx"
sheet_abcd: "Company Indicators - PACTA Comp"
project_parameters:
scenario_source: "weo_2022"
scenario_select: "nze_2050"
region_select: "global"
# normally the start year should correspond to the year of publication of
# the scenario in use
start_year_select: 2022
time_frame_select: 5
start_year: 2022
time_frame: 5
# regions must be available for the selected scenario
benchmark_regions_select: "global,european union"
remove_inactive_companies: TRUE
Expand All @@ -44,3 +45,5 @@ default:
filename_own_sector_classification: "own_sector_classification.csv"
match_prioritize:
priority: NULL
prepare_abcd:
remove_inactive_companies: TRUE
30 changes: 15 additions & 15 deletions expected_columns.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,22 @@ col_types_scenario_sda <- readr::cols_only(
col_select_scenario_sda <- names(col_types_scenario_sda[["cols"]])

# expected columns abcd file
col_types_abcd <- readr::cols_only(
company_id = "i",
name_company = "c",
lei = "c",
is_ultimate_owner = "l",
sector = "c",
technology = "c",
plant_location = "c",
year = "i",
production = "n",
production_unit = "c",
emission_factor = "n",
emission_factor_unit = "c",
ald_timestamp = "c"
cols_abcd <- tibble::tribble(
~col_names_abcd, ~col_types_abcd,
"company_id", "numeric",
"name_company", "text",
"lei", "text",
"is_ultimate_owner", "logical",
"sector", "text",
"technology", "text",
"plant_location", "text",
"year", "numeric",
"production", "numeric",
"production_unit", "text",
"emission_factor", "numeric",
"emission_factor_unit", "text",
"ald_timestamp", "text"
)
col_select_abcd <- names(col_types_abcd[["cols"]])

# expected columns matched_prioritized_all_groups file
col_types_matched_prio_all_groups <- readr::cols_only(
Expand Down
165 changes: 165 additions & 0 deletions prepare_abcd.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# set up project and load packages----
library(dplyr, warn.conflicts = FALSE)
library(readr)
library(readxl)
library(tidyr)

# source helpers----
source("expected_columns.R")

# load config----
# read every config section this script relies on
config_dir <- config::get(value = "directories")
config_files <- config::get(value = "file_names")
config_prepare_abcd <- config::get(value = "prepare_abcd")
config_project_parameters <- config::get(value = "project_parameters")

# sheet to read and full path of the raw ABCD workbook
sheet_abcd <- config_files$sheet_abcd
path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd)

# switch for the optional removal of inactive companies
prepare_abcd_rm_inactive_companies <-
  config_prepare_abcd$remove_inactive_companies

# start year and time frame; start year + time frame is used as the end year
project_parameters_start_year <- config_project_parameters$start_year
project_parameters_time_frame <- config_project_parameters$time_frame

# validate config values----
# Every config value must be a scalar of the expected class. Fail early with
# a message that names the offending value.

# path and sheet of the ABCD workbook
if (length(path_abcd) != 1) {
  stop("Argument path_abcd must be of length 1. Please check your input.")
}
if (!inherits(path_abcd, "character")) {
  stop("Argument path_abcd must be of class character. Please check your input.")
}
if (length(sheet_abcd) != 1) {
  stop("Argument sheet_abcd must be of length 1. Please check your input.")
}
if (!inherits(sheet_abcd, "character")) {
  stop("Argument sheet_abcd must be of class character. Please check your input.")
}

# the removal switch is optional: it is only checked when it is set
if (!is.null(prepare_abcd_rm_inactive_companies)) {
  if (length(prepare_abcd_rm_inactive_companies) != 1) {
    stop("Argument prepare_abcd_rm_inactive_companies must be of length 1. Please check your input.")
  }
  if (!inherits(prepare_abcd_rm_inactive_companies, "logical")) {
    stop("Argument prepare_abcd_rm_inactive_companies must be of class logical. Please check your input.")
  }
}

# start year and time frame must be single integers
if (length(project_parameters_start_year) != 1) {
  stop("Argument project_parameters_start_year must be of length 1. Please check your input.")
}
if (!inherits(project_parameters_start_year, "integer")) {
  stop("Argument project_parameters_start_year must be of class integer. Please check your input.")
}
if (length(project_parameters_time_frame) != 1) {
  stop("Argument project_parameters_time_frame must be of length 1. Please check your input.")
}
if (!inherits(project_parameters_time_frame, "integer")) {
  stop("Argument project_parameters_time_frame must be of class integer. Please check your input.")
}


# load data----
# the ABCD workbook has to exist at the configured location
if (!file.exists(path_abcd)) {
  stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!"))
}

# read the configured sheet, applying the expected column types
abcd <- readxl::read_xlsx(
  path_abcd,
  sheet = sheet_abcd,
  col_types = cols_abcd[["col_types_abcd"]]
)

# every expected column name must be present in the sheet that was read
if (length(setdiff(cols_abcd[["col_names_abcd"]], names(abcd))) > 0) {
  stop("Columns in abcd do not match expected input names. Please check your input.")
}

# optional: remove inactive companies----

# Remove company-sector combinations that are considered inactive.
#
# A combination of `name_company` and `sector` is dropped from `data` when
# (1) its summed production is greater than 0 in the start year (t0) and
#     equal to 0 in the end year (t5 = start_year + time_frame), or
# (2) its production summed over the start year and the end year is 0.
# NOTE(review): rule (2) only inspects the two boundary years, not the years
# in between. Confirm that this is the intended reading of "no production for
# the entire time frame from t0 to t5".
#
# data:       data frame with at least the columns name_company, sector,
#             year and production
# start_year: first year of the time frame (t0)
# time_frame: number of years added to start_year to get the end year (t5)
#
# Returns `data` without the rows of the inactive combinations. Missing
# production values are ignored when summing. If one of the two boundary
# years is absent from `data`, rule (1) matches nothing instead of failing.
rm_inactive_companies <- function(data,
                                  start_year,
                                  time_frame) {
  end_year <- start_year + time_frame
  key_cols <- c("name_company", "sector")

  # only the two boundary years are relevant for both rules
  boundary_years <- data %>%
    dplyr::filter(
      .data$year %in% c(.env$start_year, .env$end_year)
    )

  production_by_year <- boundary_years %>%
    dplyr::summarise(
      sum_production = sum(.data$production, na.rm = TRUE),
      .by = dplyr::all_of(c(key_cols, "year"))
    )

  # rule (1): producing in t0 but no longer producing in t5. The semi join
  # requires the combination to be present in both years, so a missing year
  # simply yields no matches.
  active_t0 <- production_by_year %>%
    dplyr::filter(
      .data$year == .env$start_year,
      .data$sum_production > 0
    )

  inactive_t5 <- production_by_year %>%
    dplyr::filter(
      .data$year == .env$end_year,
      .data$sum_production == 0
    )

  comp_sec_no_prod_t5 <- active_t0 %>%
    dplyr::semi_join(inactive_t5, by = key_cols) %>%
    dplyr::distinct(
      .data$name_company,
      .data$sector
    )

  # rule (2): no production at all across the two boundary years
  comp_sec_no_prod_t0_to_t5 <- boundary_years %>%
    dplyr::summarise(
      sum_production = sum(.data$production, na.rm = TRUE),
      .by = dplyr::all_of(key_cols)
    ) %>%
    dplyr::filter(
      .data$sum_production == 0
    ) %>%
    dplyr::distinct(
      .data$name_company,
      .data$sector
    )

  data %>%
    dplyr::anti_join(comp_sec_no_prod_t5, by = key_cols) %>%
    dplyr::anti_join(comp_sec_no_prod_t0_to_t5, by = key_cols)
}

# The removal switch is optional in the config. isTRUE() treats an unset
# (NULL) or NA value as "do not remove" instead of failing in `if`.
if (isTRUE(prepare_abcd_rm_inactive_companies)) {
  abcd_keep <- abcd %>%
    rm_inactive_companies(
      start_year = project_parameters_start_year,
      time_frame = project_parameters_time_frame
    )

  # Rows that were dropped. They are matched on the same keys that
  # rm_inactive_companies() uses to remove company-sector combinations, so
  # the inspection file lists exactly the removed rows.
  abcd_removed <- abcd %>%
    dplyr::anti_join(
      abcd_keep,
      by = c("name_company", "sector")
    )

  # write removed inactive companies to file for inspection
  abcd_removed %>%
    readr::write_csv(
      file.path(config_dir$dir_abcd, "abcd_removed_inactive_companies.csv"),
      na = ""
    )

  abcd <- abcd_keep

  rm(abcd_keep)
}

# write final version of abcd to file for use in the PACTA analysis
abcd %>%
  readr::write_csv(
    file.path(config_dir$dir_abcd, "abcd_final.csv"),
    na = ""
  )
13 changes: 9 additions & 4 deletions run_matching.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ library(dplyr, warn.conflicts = FALSE)
library(r2dii.data)
library(r2dii.match)
library(readr)
library(readxl)
library(withr)

# source helpers----
Expand All @@ -15,6 +16,7 @@ config_files <- config::get("file_names")

dir_raw <- config_dir$dir_raw
path_abcd <- file.path(config_dir$dir_abcd, config_files$filename_abcd)
sheet_abcd <- config_files$sheet_abcd
dir_matched <- config_dir$dir_matched

config_matching <- config::get("matching")
Expand Down Expand Up @@ -113,11 +115,14 @@ if (!file.exists(path_abcd)) {
stop(glue::glue("No ABCD file found at path {path_abcd}. Please check your project setup!"))
}

abcd <- readr::read_csv(
file.path(path_abcd),
col_types = col_types_abcd,
col_select = dplyr::all_of(col_select_abcd)
abcd <- readxl::read_xlsx(
path = file.path(path_abcd),
sheet = sheet_abcd,
col_types = cols_abcd$col_types_abcd
)
if (!all(cols_abcd$col_names_abcd %in% names(abcd))) {
stop("Columns in abcd do not match expected input names. Please check your input.")
}

## optionally load own classification system----
if (matching_use_own_sector_classification) {
Expand Down

0 comments on commit 7a601dd

Please sign in to comment.