Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

new clean_data library of functions for preparing new data #247

Closed
wants to merge 109 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
109 commits
Select commit Hold shift + click to select a range
f87c1b0
initial commit for new clean_data script with tests
Oct 22, 2020
a896605
Merge branch 'master' into new-clean_data-script
Nov 19, 2020
33d56b1
rework the concept
Nov 19, 2020
5858424
get_average_sector_intensity_data
Nov 19, 2020
6719377
lots of changes
Nov 19, 2020
9b51d92
style changes
Nov 19, 2020
18ff134
add more RDS files
Nov 19, 2020
3d301d5
matching functions
Nov 19, 2020
b6d506c
save all files to a directory
Nov 20, 2020
f31f819
add a GitHub workflow to run the tests on this branch
Nov 20, 2020
2dddfa2
add here pkg to test
Nov 20, 2020
9323f17
missed quote
Nov 20, 2020
6ffcb72
test a failing test
Nov 20, 2020
8834ac5
Merge branch 'new-clean_data-script' of github.com:2DegreesInvesting/…
Nov 20, 2020
c13b1c5
echo exit code to signal result to GitHub workflow
Nov 20, 2020
d778604
force fail test
Nov 20, 2020
1e4c069
add stop options to test_file
Nov 20, 2020
445daa8
remove force fail and use Rscript
Nov 20, 2020
8e9a70e
remove echo
Nov 20, 2020
61d01e8
remove test of failing test
Nov 20, 2020
e8eb061
move clean_data tests to their own directory
Nov 20, 2020
835b570
name test file specific to function being tested
Nov 20, 2020
57c41eb
add a setup file for the tests
Nov 20, 2020
eff6b44
test get_company_emissions_data
Nov 20, 2020
ad308f0
test get_currency_data
Nov 20, 2020
087b96d
test get_bics_bridge_data
Nov 20, 2020
e41af73
test get_fin_sector_overrides_data
Nov 20, 2020
2ae9f7d
test get_non_distinct_isins_data
Nov 20, 2020
7065b4d
get get_sector_bridge
Nov 20, 2020
f083c78
test get_average_sector_intensity_data
Nov 20, 2020
3c209be
test get_consolidated_financial_data
Nov 20, 2020
9a4ef39
test get_debt_financial_data
Nov 20, 2020
0c10090
test get_revenue_data
Nov 20, 2020
d4eeda1
test get_security_financial_data
Nov 20, 2020
75225d7
whitespace
Nov 20, 2020
1f83bc3
more tests
Nov 21, 2020
6c8ed7b
updates
Nov 21, 2020
27fddd3
fix save function
Nov 21, 2020
8ca634d
use new functions
Nov 21, 2020
9333177
data validation
Nov 21, 2020
3421c34
namespace dplyr function
Nov 21, 2020
4bc4cd0
tests for validate functions
Nov 21, 2020
58d1f40
improved tests for matching functions
Nov 21, 2020
a40e62e
update sector_bridge
Nov 21, 2020
fd5f9be
improved tests for convert_industry_classification
Nov 21, 2020
40ec9f3
figure out what's wrong with the test
Nov 21, 2020
c0bd8c1
namespace more dplyr functions to pass tests
Nov 22, 2020
06bf6b2
use .data and add argument checking
Nov 22, 2020
949dc4d
separate the colspecs into individual functions, use them and test them
Nov 23, 2020
aad942c
towards a proper package system
Nov 23, 2020
df86e67
fix function name
Nov 23, 2020
8c41c6d
add more dependencies that are required by the pkg infrastructure?
Nov 23, 2020
b7e8d5c
get rid of complaints about . and .data
Nov 23, 2020
b94e302
fix spelling
Nov 23, 2020
65b1820
more consistent function names
Nov 23, 2020
6259c67
better test for bics
Nov 23, 2020
31bb182
add and test convert_bclass_to_sector
Nov 23, 2020
3c06fbc
add and test convert_bics_subgroup_to_bics_sector; start using .data
Nov 23, 2020
f4bc6db
use .data
Nov 23, 2020
7f9d61b
use more specific, explicit tests
Nov 23, 2020
a1191fd
return TRUE
Nov 24, 2020
a13e4a5
add and test validate_data_frame_with_more_than_0_rows
Nov 24, 2020
399781b
better testing of fast_match
Nov 24, 2020
8cfcf14
use function name directly
Nov 24, 2020
4c8f78b
improving tests, making them explicit, and removing setwd where possible
Nov 24, 2020
65acc27
better naming for the GitHub action
Nov 24, 2020
01d6665
make tests more explicit and reduce usage of setwd
Nov 24, 2020
5c54607
Merge branch 'master' into new-clean_data-script
Nov 24, 2020
ba57aae
Merge branch 'master' into new-clean_data-script
Nov 24, 2020
9accae8
use @maurolepore's skip_check_but_run_test() to skip tests in R CMD c…
Nov 24, 2020
fb39897
apply new data_object_names to internal data functions
Nov 25, 2020
d7ace7c
finish conforming to the standardized object names
Nov 25, 2020
583fb4a
ease validation of consolidated_financials and security_financials (a…
Nov 25, 2020
6a9f4af
add currency filenames and fix vector name
Nov 25, 2020
357e2e0
some cleanup
Nov 25, 2020
140c72f
add funds to filename objects
Nov 25, 2020
eddb3a5
almost there
Nov 25, 2020
8e70061
save files as rds
Nov 26, 2020
e12e038
namespace class to base::get to avoid conflict with config::get
Nov 26, 2020
2f4405e
add more common file names
Nov 26, 2020
7ac743e
more common filenames
Nov 26, 2020
74c531a
remove tests that are no longer valid
Nov 26, 2020
44e5608
conform to new default filenames
Nov 26, 2020
0266887
add tests for values
Nov 26, 2020
5cde319
additional values test
Nov 26, 2020
d10da7a
read in pre-processed files the new way
Nov 26, 2020
819f9c5
fix expectation
Nov 26, 2020
34d714f
add bics sector to portfolio
Nov 26, 2020
c0fdd32
avoid adding bics_sector if it's already there
Nov 26, 2020
98b5a3e
workaround dplyr R CMD check warnings
Nov 27, 2020
aa1c53f
new map_security_sectors()
Nov 27, 2020
06b66b6
rename to conform to default object names
Nov 28, 2020
0214bf7
validate_cleaned_security_financial and helpers
Nov 28, 2020
7230c12
add error collector to basic validation functions
Nov 28, 2020
75ce258
add optional error collector to more fundamental validators
Nov 29, 2020
c786423
add cli to imports
Nov 29, 2020
766d77c
don't load example data when not needed
Nov 29, 2020
b0a8253
add context to error collectors
Nov 29, 2020
2438505
validate_by_name utility
Nov 29, 2020
4775170
use <-, not =
Nov 29, 2020
5a53d9e
rename and test get_and_clean_revenue
Nov 30, 2020
f0c1692
test that validate_bics_bridge catches duplicate values
Nov 30, 2020
d45d9d9
test using an error collector
Nov 30, 2020
2410367
more error collector testing
Nov 30, 2020
0850a7f
test validate_is_named_character
Nov 30, 2020
00a52d0
more tests
Nov 30, 2020
e6a1f47
do not check type of columns that don't exist
Dec 4, 2020
f57253a
rely on the global variable for the default path
Dec 4, 2020
f3826db
remove test that violates the new interpretation of validate_column_t…
Dec 4, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/test-new-data-clean-functions.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Workflow: runs the testthat suite for the new data cleaning functions.
# Triggered by pushes to, and pull requests targeting, the
# new-clean_data-script branch only.
on:
push:
branches:
- new-clean_data-script
pull_request:
branches:
- new-clean_data-script

name: test new data cleaning functions

jobs:
# Single job: check out the repository, set up R, install packages, run tests.
test-new-data-clean-functions:
runs-on: macOS-latest
env:
# Exposes the workflow's built-in token to R under the name GITHUB_PAT.
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2

- uses: r-lib/actions/setup-r@v1

# Packages are installed from an explicit, hard-coded list.
# NOTE(review): "cli" is listed under Imports in DESCRIPTION but is not in
# this list - confirm whether the tests need it installed here.
- name: Install dependencies
run: install.packages(c("testthat", "here", "dplyr", "purrr", "janitor", "fst", "config", "conflicted", "fs", "readr", "renv", "rmarkdown", "usethis"))
shell: Rscript {0}

# stop_on_failure / stop_on_warning make the Rscript call error out, so the
# step (and therefore the job) fails when any test fails or warns.
- name: testthat tests of new data cleaning functions
run: Rscript -e 'testthat::test_local(stop_on_failure = TRUE, stop_on_warning = TRUE)'
296 changes: 148 additions & 148 deletions 0_portfolio_input_check_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -617,32 +617,32 @@ add_fund_portfolio <- function(portfolio, fund_portfolio, cols_of_funds) {
portfolio_total
}

# Record fund ISINs that have no matching "Funds" entry in the financial data.
#
# Finds every distinct `fund_isin` in `fund_data` that does not appear among
# the ISINs of `fin_data` rows with `asset_type == "Funds"`, merges those with
# the ISINs already stored in "data/fund_isins_without_bbg_data.csv", and
# writes the de-duplicated union back to that file. Prints a notice when
# `data_check()` reports that the set of newly missing ISINs holds data.
#
# @param fund_data Data frame with a `fund_isin` column.
# @param fin_data Data frame with `asset_type` and `isin` columns.
# @return Called for its side effects (CSV update and printed notice).
check_funds_wo_bbg <- function(fund_data, fin_data) {
  known_missing_path <- "data/fund_isins_without_bbg_data.csv"

  # isin in the fund_data but no bbg data available
  fin_data_funds <- fin_data %>%
    filter(asset_type == "Funds") %>%
    select(isin) %>%
    distinct()

  fund_isins <- fund_data %>%
    select(fund_isin) %>%
    distinct()

  fund_isins_missing_bbg <- fund_isins %>%
    filter(!fund_isin %in% fin_data_funds$isin)

  # NOTE(review): assumes the stored CSV uses the same `fund_isin` column name
  # that this function writes - confirm for files created by other means.
  known_missing_isins <- read_csv(known_missing_path, col_types = "c")

  known_missing_isins <- known_missing_isins %>%
    bind_rows(fund_isins_missing_bbg) %>%
    distinct()

  # Write the merged set. Previously only the ISINs found in this run were
  # written, which discarded the union computed above and overwrote the
  # history of known missing ISINs on every call.
  readr::write_csv(known_missing_isins, known_missing_path)

  if (data_check(fund_isins_missing_bbg)) {
    print("Warning: There are funds without bbg data. These are excluded from the analysis.")
  }
}
# check_funds_wo_bbg <- function(fund_data, fin_data) {
#
# # isin in the fund_data but no bbg data available
# fin_data_funds <- fin_data %>%
# filter(asset_type == "Funds") %>%
# select(isin) %>%
# distinct()
#
# fund_isins <- fund_data %>%
# select(fund_isin) %>%
# distinct()
#
# fund_isins_missing_bbg <- fund_isins %>% filter(!fund_isin %in% fin_data_funds$isin)
#
# known_missing_isins <- read_csv("data/fund_isins_without_bbg_data.csv", col_types = "c")
#
# known_missing_isins <- known_missing_isins %>%
# bind_rows(fund_isins_missing_bbg) %>%
# distinct()
#
# readr::write_csv(fund_isins_missing_bbg, "data/fund_isins_without_bbg_data.csv")
#
# if (data_check(fund_isins_missing_bbg)) {
# print("Warning: There are funds without bbg data. These are excluded from the analysis.")
# }
# }

###

Expand Down Expand Up @@ -869,113 +869,113 @@ get_and_clean_currency_data <- function() {
currencies
}

# Load the fund holdings data and normalise it.
#
# Looks in `analysis_inputs_path` for, in order of preference:
#   1. fund_data_<financial_timestamp>.rda
#   2. fund_data_2018Q4.rda        (prints a notice that old data is used)
#   3. SFC_26052020_funds.csv      (prints a notice that SFC data is used)
# If none exists, the result stays `NA` and a warning is raised when
# `data_check()` reports no data. When data was loaded, column names are
# cleaned, rows with a missing or empty `holding_isin` are dropped, and the
# result is passed through `normalise_fund_data()`.
#
# @return The cleaned fund data, or `NA` when no input file was found.
get_and_clean_fund_data <- function() {
  current_rda <- paste0(analysis_inputs_path, "/fund_data_", financial_timestamp, ".rda")
  fallback_rda <- paste0(analysis_inputs_path, "/fund_data_2018Q4.rda")
  sfc_csv <- paste0(analysis_inputs_path, "/SFC_26052020_funds.csv")

  funds <- NA

  if (file.exists(current_rda)) {
    funds <- readRDS(current_rda)
  } else if (file.exists(fallback_rda)) {
    funds <- readRDS(fallback_rda)
    print("Old Fund Data being used. Replace FundsData2018Q4 or check name of file.")
  } else if (file.exists(sfc_csv)) {
    funds <- read_csv(sfc_csv)
    print("2020Q2 SFC fund data being used")
  } else if (!data_check(funds)) {
    warning("No fund data available")
  }

  if (data_check(funds)) {
    funds <- funds %>%
      janitor::clean_names() %>%
      filter(!is.na(holding_isin) & holding_isin != "")

    funds <- normalise_fund_data(funds)
  }

  funds
}

# Load the security-level financial data and prepare it for the analysis.
#
# Reads "security_financial_data.rda" from `analysis_inputs_path`, drops the
# ISINs listed in "non_distinct_isins.csv", maps sectors via the sector
# bridge, applies the manual sector overrides, runs the asset-type / sector /
# bond / fund classification helpers, adds the BICS sector, and keeps the
# distinct rows of the relevant columns. Stops if the result has more rows
# than the de-duplicated raw input. When fund data is available, the CSV of
# funds without Bloomberg data is updated via `check_funds_wo_bbg()`.
#
# @param fund_data Fund data as accepted by `data_check()` and
#   `check_funds_wo_bbg()`.
# @return The cleaned financial data.
get_and_clean_fin_data <- function(fund_data) {

  # Financial Data
  fin_data_raw <- read_rda(paste0(analysis_inputs_path, "/security_financial_data.rda")) %>% as_tibble()

  # remove unclear duplicates from raw financial data. This should be moved to DataStore.
  rm_duplicates <- read_csv("non_distinct_isins.csv")
  rm_duplicates <- rm_duplicates %>%
    distinct(isin) %>%
    pull(isin)
  fin_data_raw <- fin_data_raw %>%
    filter(!(isin %in% rm_duplicates))

  # Reduce the comparison to a single TRUE/FALSE: `if` needs a length-one
  # condition, and the raw data may hold several distinct timestamps, an NA,
  # or no rows at all. `%in%` never yields NA, so an NA timestamp is reported
  # as a mismatch instead of raising an error.
  raw_timestamps <- unique(fin_data_raw$financial_timestamp)
  if (!all(raw_timestamps %in% financial_timestamp)) {
    print("Financial timestamp not equal")
  }

  overrides <- read_csv("data/fin_sector_overrides.csv",
    col_types = "ccdc"
  )

  sector_bridge <- read_csv("data/sector_bridge.csv", col_types = "cccccccc")

  fin_data <- fin_data_raw

  fin_data <- fin_data %>% filter(!is.na(isin))

  fin_data <- map_security_sectors(fin_data, sector_bridge)

  # Adds in the manual sector classification overrides
  fin_data <- override_sector_classification(fin_data, overrides)

  # Checks that only eq, cb, funds and others are in the fin_data
  fin_data <- check_asset_types(fin_data)

  # Checks for other mapped sectors not within the sector lists
  fin_data <- check_fin_mapped_sectors(fin_data)

  # TODO: find alternative here, calling in data from company financial data
  # Cleans and normalises the mapped_to_assets flag
  # fin_data <- check_mapped_assets_flag(fin_data)

  # Limits the Bonds category to corporate bonds only
  fin_data <- convert_corporate_bonds(fin_data)

  # Checks whether the bond is sovereign or not
  fin_data <- identify_sb(fin_data)

  # Checks to ensure all funds are classified as such
  fin_data <- classify_all_funds(fin_data)

  fin_data <- add_bics_sector(fin_data)

  # Select relevant columns
  fin_data <- fin_data %>%
    select(
      company_id, company_name, bloomberg_id, corporate_bond_ticker,
      country_of_domicile,
      isin,
      unit_share_price, exchange_rate_usd,
      asset_type, security_type,
      security_mapped_sector, security_icb_subsector, security_bics_subgroup, bics_sector, # bclass4,
      maturity_date, coupon_value, amount_issued, current_shares_outstanding_all_classes, unit_share_price,
      sector_override, sector_boe, subsector_boe, sector_dnb, sector_ipr, subsector_ipr,
      is_sb
    ) %>%
    distinct()

  ### TEST
  # Guard against the helpers above duplicating rows.
  if (nrow(fin_data) > nrow(fin_data_raw)) {
    stop("Additional rows added to fin data")
  }

  # updates csv file with missing bloomberg data re funds
  if (data_check(fund_data)) {
    check_funds_wo_bbg(fund_data, fin_data)
  }

  return(fin_data)
}
# get_and_clean_fund_data <- function() {
# fund_data <- NA
# # Fund Data
# if (file.exists(paste0(analysis_inputs_path, "/fund_data_", financial_timestamp, ".rda"))) {
# fund_data <- readRDS(paste0(analysis_inputs_path, "/fund_data_", financial_timestamp, ".rda"))
# } else if (file.exists(paste0(analysis_inputs_path, "/fund_data_2018Q4.rda"))) {
# fund_data <- readRDS(paste0(analysis_inputs_path, "/fund_data_2018Q4.rda"))
# print("Old Fund Data being used. Replace FundsData2018Q4 or check name of file.")
# } else if (file.exists(paste0(analysis_inputs_path, "/SFC_26052020_funds.csv"))) {
# fund_data <- read_csv(paste0(analysis_inputs_path, "/SFC_26052020_funds.csv"))
# print("2020Q2 SFC fund data being used")
# } else {
# if (!data_check(fund_data)) {
# warning("No fund data available")
# }
# }
#
# if (data_check(fund_data)) {
# fund_data <- fund_data %>% janitor::clean_names()
#
# fund_data <- fund_data %>% filter(!is.na(holding_isin) & holding_isin != "")
#
# fund_data <- normalise_fund_data(fund_data)
# }
# return(fund_data)
# }

# get_and_clean_fin_data <- function(fund_data) {
#
# # Financial Data
# fin_data_raw <- read_rda(paste0(analysis_inputs_path, "/security_financial_data.rda")) %>% as_tibble()
#
# # remove unclear duplicates from raw financial data. This should be moved to DataStore.
# rm_duplicates <- read_csv("non_distinct_isins.csv")
# rm_duplicates <- rm_duplicates %>%
# distinct(isin) %>%
# pull(isin)
# fin_data_raw <- fin_data_raw %>%
# filter(!(isin %in% rm_duplicates))
#
# if (!unique(fin_data_raw$financial_timestamp) == financial_timestamp) {
# print("Financial timestamp not equal")
# }
#
# overrides <- read_csv("data/fin_sector_overrides.csv",
# col_types = "ccdc"
# )
#
# sector_bridge <- read_csv("data/sector_bridge.csv", col_types = "cccccccc")
#
# fin_data <- fin_data_raw
#
# fin_data <- fin_data %>% filter(!is.na(isin))
#
# fin_data <- map_security_sectors(fin_data, sector_bridge)
#
# # Adds in the manual sector classification overrides
# fin_data <- override_sector_classification(fin_data, overrides)
#
# # Checks that only eq, cb, funds and others are in the fin_data
# fin_data <- check_asset_types(fin_data)
#
# # Checks for other mapped sectors not within the sector lists
# fin_data <- check_fin_mapped_sectors(fin_data)
#
# # TODO: find alternative here, calling in data from company financial data
# # Cleans and normalises the mapped_to_assets flag
# # fin_data <- check_mapped_assets_flag(fin_data)
#
# # Limits the Bonds category to corporate bonds only
# fin_data <- convert_corporate_bonds(fin_data)
#
# # Checks whether the bond is sovereign or not
# fin_data <- identify_sb(fin_data)
#
# # Checks to ensure all finds are classified as such
# fin_data <- classify_all_funds(fin_data)
#
# fin_data <- add_bics_sector(fin_data)
#
# # Select relevant columns
# fin_data <- fin_data %>%
# select(
# company_id, company_name, bloomberg_id, corporate_bond_ticker,
# country_of_domicile,
# isin,
# unit_share_price, exchange_rate_usd,
# asset_type, security_type,
# security_mapped_sector, security_icb_subsector, security_bics_subgroup, bics_sector, # bclass4,
# maturity_date, coupon_value, amount_issued, current_shares_outstanding_all_classes, unit_share_price,
# sector_override, sector_boe, subsector_boe, sector_dnb, sector_ipr, subsector_ipr,
# is_sb
# ) %>%
# distinct()
#
# ### TEST
# if (nrow(fin_data) > nrow(fin_data_raw)) {
# stop("Additional rows added to fin data")
# }
#
# # updates csv file with missing bloomberg data re funds
# if (data_check(fund_data)) {
# check_funds_wo_bbg(fund_data, fin_data)
# }
#
# return(fin_data)
# }

add_bics_sector <- function(fin_data) {
bics_bridge <- read_csv("data/bics_bridge.csv")
Expand All @@ -999,21 +999,21 @@ get_and_clean_revenue_data <- function() {
return(revenue_data)
}

# Load the consolidated (company-level) financial data and map its sectors.
#
# Reads "consolidated_financial_data.rda" from `analysis_inputs_path`, keeps
# the company identification, classification, asset-flag and size columns
# listed below, and passes the result together with the sector bridge
# ("data/sector_bridge.csv") to `map_comp_sectors()`.
#
# @return The value returned by `map_comp_sectors()`.
get_and_clean_company_fin_data <- function() {
  consolidated_path <- paste0(analysis_inputs_path, "/consolidated_financial_data.rda")

  company_financials <- read_rds(consolidated_path) %>%
    select(
      company_id,
      company_name,
      bloomberg_id,
      country_of_domicile,
      corporate_bond_ticker,
      bics_subgroup,
      icb_subgroup,
      financial_sector,
      has_asset_level_data,
      has_assets_in_matched_sector,
      sectors_with_assets,
      current_shares_outstanding_all_classes,
      market_cap,
      bond_debt_out,
      financial_timestamp
    )

  bridge <- read_csv("data/sector_bridge.csv", col_types = "ccc")

  map_comp_sectors(company_financials, bridge)
}
# get_and_clean_company_fin_data <- function() {
# comp_fin_data_raw <- read_rds(paste0(analysis_inputs_path, "/consolidated_financial_data.rda"))
#
# comp_fin_data_raw <- comp_fin_data_raw %>% select(
# company_id, company_name, bloomberg_id, country_of_domicile, corporate_bond_ticker, bics_subgroup,
# icb_subgroup, financial_sector, has_asset_level_data, has_assets_in_matched_sector, sectors_with_assets, current_shares_outstanding_all_classes,
# market_cap, bond_debt_out, financial_timestamp
# )
#
# sector_bridge <- read_csv("data/sector_bridge.csv", col_types = "ccc")
#
# comp_fin_data <- map_comp_sectors(comp_fin_data_raw, sector_bridge)
#
# return(comp_fin_data)
# }

get_and_clean_debt_fin_data <- function() {
debt_fin_data_raw <- read_rds(paste0(analysis_inputs_path, "/debt_financial_data.rda"))
Expand Down
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ License: MIT + file LICENSE
Depends:
R (>= 3.5)
Imports:
cli,
config,
conflicted,
dplyr,
Expand Down
Loading