From 530f912e4f83a8d8dd8d766a674d070aea588099 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 12:27:01 +0100 Subject: [PATCH] feat: `match_name` gains `join_id` col, allowing for an initial matching override based on some unique ID column (#460) --- NEWS.md | 2 + R/match_name.R | 169 ++++++++++++++++++++++++++++--- R/restructure_abcd.R | 9 +- man/match_name.Rd | 25 +++++ tests/testthat/test-match_name.R | 121 ++++++++++++++++++++++ tests/testthat/test-prioritize.R | 10 ++ 6 files changed, 320 insertions(+), 16 deletions(-) diff --git a/NEWS.md b/NEWS.md index 309f1189..01fc90f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # r2dii.match (development version) +* `match_name` gains argument `join_id` allowing an optional perfect join based on a mutual ID column between `loanbook` and `abcd` inputs, prior to attempting fuzzy matching (#135). + # r2dii.match 0.1.4 * `to_alias` can now handle strange encodings without error (#425, @kalashsinghal @Tilmon). diff --git a/R/match_name.R b/R/match_name.R index a2910b81..3a4f3c80 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -31,6 +31,10 @@ #' only `sector`, the value in the `name` column should be `NA` and #' vice-versa. This file can be used to manually match loanbook companies to #' abcd. +#' @param join_id A join specification passed to [dplyr::inner_join()]. If a +#' character string, it assumes identical join columns between `loanbook` and +#' `abcd`. If a named character vector, it uses the name as the join column of `loanbook` and +#' the value as the join column of `abcd`. #' @param ... Arguments passed on to [stringdist::stringsim()]. #' @param ald `r lifecycle::badge('superseded')` `ald` has been superseded by #' `abcd`. @@ -83,6 +87,25 @@ #' code_system = "XYZ" #' ) #' +#' # match on LEI +#' loanbook <- tibble( +#' sector_classification_system = "XYZ", +#' sector_classification_direct_loantaker = "D35.11", +#' id_ultimate_parent = "UP15", +#' name_ultimate_parent = "Won't fuzzy match", +#' id_direct_loantaker = "C294", +#' name_direct_loantaker = "Won't fuzzy match", +#' lei_direct_loantaker = "LEI123" +#' ) +#' +#' abcd <- tibble( +#' name_company = "alpine knits india pvt. limited", +#' sector = "power", +#' lei = "LEI123" +#' ) +#' +#' match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei")) +#' #' restore <- options(r2dii.match.sector_classifications = your_classifications) #' #' loanbook <- tibble( @@ -111,6 +134,7 @@ match_name <- function(loanbook, method = "jw", p = 0.1, overwrite = NULL, + join_id = NULL, ald = deprecated(), ...) { restore <- options(datatable.allow.cartesian = TRUE) @@ -125,16 +149,77 @@ match_name <- function(loanbook, abcd <- ald } - match_name_impl( - loanbook = loanbook, - abcd = abcd, - by_sector = by_sector, - min_score = min_score, - method = method, - p = p, - overwrite = overwrite, - ... - ) + if (!is.null(join_id)) { + check_join_id(join_id, loanbook, abcd) + + crucial_names <- c("name_company", "sector", join_id) + check_crucial_names(abcd, crucial_names) + + prep_abcd <- dplyr::transmute( + abcd, + name_abcd = .data[["name_company"]], + sector_abcd = tolower(.data[["sector"]]), + !!join_id := .data[[join_id]] + ) + + prep_abcd <- dplyr::distinct(prep_abcd) + + prep_lbk <- may_add_sector_and_borderline(loanbook) + prep_lbk <- distinct(prep_lbk) + + join_matched <- dplyr::inner_join( + prep_lbk, + prep_abcd, + by = join_id, + na_matches = "never" + ) + + join_by_list <- as_join_by(join_id) + loanbook_join_id <- join_by_list[[1]] + + join_matched <- dplyr::mutate( + join_matched, + score = 1, + source = "id joined", + level = loanbook_join_id, + name = .data[["name_abcd"]] + ) + + loanbook <- dplyr::filter( + loanbook, + !.data[[loanbook_join_id]] %in% join_matched[[loanbook_join_id]] + ) + } + + if (nrow(loanbook) != 0) { + fuzzy_matched <- match_name_impl( + loanbook = loanbook, + abcd = abcd, + by_sector = by_sector, + min_score = min_score, + method = method, + p = p, + overwrite = overwrite, + ... + ) + } else { + fuzzy_matched <- tibble() + } + + if (exists("join_matched")) { + out <- dplyr::bind_rows(join_matched, fuzzy_matched) + } else if (nrow(fuzzy_matched) == 0 && exists("join_matched")) { + out <- join_matched + } else { + out <- fuzzy_matched + } + + if (identical(nrow(out), 0L)) { + rlang::warn("Found no match.") + return(empty_loanbook_tibble(loanbook, dplyr::groups(loanbook))) + } + + out } match_name_impl <- function(loanbook, @@ -145,6 +230,7 @@ match_name_impl <- function(loanbook, p = 0.1, overwrite = NULL, ...) { + old_groups <- dplyr::groups(loanbook) loanbook <- ungroup(loanbook) @@ -164,8 +250,8 @@ match_name_impl <- function(loanbook, setDT(a) if (identical(nrow(a), 0L)) { - rlang::warn("Found no match.") - return(empty_loanbook_tibble(loanbook, old_groups)) + rlang::inform("Found no match via fuzzy matching.") + return(a) } a <- unique(a)[ @@ -179,8 +265,8 @@ match_name_impl <- function(loanbook, a <- a[score >= min_score, ] if (identical(nrow(a), 0L)) { - rlang::warn("Found no match.") - return(empty_loanbook_tibble(loanbook, old_groups)) + rlang::inform("Found no match via fuzzy matching.") + return(a) } l <- rename(prep_lbk, alias_lbk = "alias") @@ -349,3 +435,58 @@ names_added_by_match_name <- function() { "borderline" ) } + +check_join_id <- function(join_id, loanbook, abcd) { + + join_id_list <- as_join_by(join_id) + + if (!rlang::has_name(loanbook, join_id_list[[1]])) { + rlang::abort( + "join_id_not_in_loanbook", + message = glue( + "The join_id `{join_id_list[[1]]}` must be present in `loanbook` input." + ) + ) + } else if (!rlang::has_name(abcd, join_id_list[[2]])) { + rlang::abort( + "join_id_not_in_abcd", + message = glue( + "The join_id `{join_id_list[[2]]}` must be present in `abcd` input." + ) + ) + } + + invisible(join_id) +} + +as_join_by <- function(x) { + + if (rlang::is_list(x)) { + if (length(x) != 1L) { + rlang::abort("`join_id` must be a vector of length 1.") + } + x_name <- names(x) %||% x + y_name <- unname(x) + } else if (rlang::is_character(x)) { + x_name <- names(x) %||% x + y_name <- unname(x) + + # If x partially named, assume unnamed are the same in both tables + x_name[x_name == ""] <- y_name[x_name == ""] + } else { + rlang::abort("`by` must be a string or a character vector.") + } + + if (!rlang::is_character(x_name)) { + rlang::abort("`by$x` must evaluate to a character vector.") + } + if (!rlang::is_character(y_name)) { + rlang::abort("`by$y` must evaluate to a character vector.") + } + + c(x_name, y_name) +} + +`%||%` <- function(x, y) { + if (is.null(x)) y else x +} diff --git a/R/restructure_abcd.R b/R/restructure_abcd.R index 06c8bd6a..14a273b7 100644 --- a/R/restructure_abcd.R +++ b/R/restructure_abcd.R @@ -5,6 +5,8 @@ #' from values in the `name_company` column. #' #' @param data A data frame. Should be an asset-level dataset. +#' @param join_id (Optional) A string giving the name of an `ID` column to +#' preserve in the restructuring #' #' @seealso [r2dii.data::abcd_demo] `to_alias()`. #' @@ -15,12 +17,15 @@ #' restructure_abcd(r2dii.data::abcd_demo) #' @noRd restructure_abcd <- function(data) { - check_crucial_names(data, c("name_company", "sector")) + crucial_names <- c("name_company", "sector") + check_crucial_names(data, crucial_names) out <- dplyr::transmute( data, - name = .data$name_company, sector = tolower(.data$sector) + name = .data$name_company, + sector = tolower(.data$sector) ) + out <- distinct(out) add_alias(out) } diff --git a/man/match_name.Rd b/man/match_name.Rd index bf4bebaf..a54945ee 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -12,6 +12,7 @@ match_name( method = "jw", p = 0.1, overwrite = NULL, + join_id = NULL, ald = deprecated(), ... ) @@ -39,6 +40,11 @@ only \code{sector}, the value in the \code{name} column should be \code{NA} and vice-versa. This file can be used to manually match loanbook companies to abcd.} +\item{join_id}{A join specification passed to \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}}. If a +character string, it assumes identical join columns between \code{loanbook} and +\code{abcd}. If a named character vector, it uses the name as the join column of \code{loanbook} and +the value as the join column of \code{abcd}.} + \item{ald}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{ald} has been superseded by \code{abcd}.} @@ -124,6 +130,25 @@ your_classifications <- tibble( code_system = "XYZ" ) +# match on LEI +loanbook <- tibble( + sector_classification_system = "XYZ", + sector_classification_direct_loantaker = "D35.11", + id_ultimate_parent = "UP15", + name_ultimate_parent = "Won't fuzzy match", + id_direct_loantaker = "C294", + name_direct_loantaker = "Won't fuzzy match", + lei_direct_loantaker = "LEI123" +) + +abcd <- tibble( + name_company = "alpine knits india pvt. limited", + sector = "power", + lei = "LEI123" +) + +match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei")) + restore <- options(r2dii.match.sector_classifications = your_classifications) loanbook <- tibble( diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index e5b28bb6..136fb950 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -723,3 +723,124 @@ test_that("allows custom `sector_classifications` via options() (#354)", { expect_equal(nrow(out), 1L) options(old) }) + +test_that("with `join_id`, joins as expected (#135)", { + skip_if_r2dii_data_outdated() + loanbook <- fake_lbk( + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = "UP won't fuzzy match", + id_col = "LEI123" + ) + + abcd <- fake_abcd( + id_col = "LEI123" + ) + + # expect no match here, the company names are completely different + expect_warning(match_name(loanbook, abcd, join_id = NULL), "no match") + + # expect exactly one match here, based on input ID + out_with_join_id <- match_name(loanbook, abcd, join_id = "id_col") + expect_equal(nrow(out_with_join_id), 1L) + expect_contains(names(out_with_join_id), "id_col") + expect_equal(unique(out_with_join_id$id_col), "LEI123") + +}) + +test_that("with `join_id` accepts named character vector input indicating different cols (#135)", { + skip_if_r2dii_data_outdated() + loanbook <- fake_lbk( + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = "UP won't fuzzy match", + lei_direct_loantaker = "LEI123" + ) + + abcd <- fake_abcd( + lei = "LEI123" + ) + + out_with_join_id <- match_name( + loanbook, + abcd, + join_id = c(lei_direct_loantaker = "lei") + ) + + expect_equal(nrow(out_with_join_id), 1L) + expect_contains(names(out_with_join_id), "lei_direct_loantaker") + expect_equal(unique(out_with_join_id$lei_direct_loantaker), "LEI123") + +}) + +test_that("with `join_id`, outputs data with loanbook join column (#135)", { + skip_if_r2dii_data_outdated() + # with `join_id` as character + out <- match_name( + fake_lbk(foo = "1"), + fake_abcd(foo = "1"), + join_id = "foo" + ) + + expect_contains(names(out), "foo") + + # with `join_id` as a named vector + out <- match_name( + fake_lbk(foo_lbk = "1"), + fake_abcd(foo_abcd = "1"), + join_id = c(foo_lbk = "foo_abcd") + ) + + expect_contains(names(out), "foo_lbk") + +}) + +test_that("with `join_id` and multiple matches, prefers ID (#135)", { + skip_if_r2dii_data_outdated() + # loanbook might match at UP + loanbook <- fake_lbk( + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = "UP will fuzzy match", + lei_direct_loantaker = "LEI123" + ) + + abcd <- fake_abcd( + lei = "LEI123", + name_company = "UP will fuzzy match" + ) + + # with `join_id` as named vector + out <- match_name( + loanbook, + abcd, + join_id = c(lei_direct_loantaker = "lei") + ) + + expect_equal(unique(out$lei_direct_loantaker), "LEI123") + +}) + +test_that("with `join_id` and one ID match, one fuzzy match, outputs as expected (#135)", { + skip_if_r2dii_data_outdated() + loanbook <- fake_lbk( + id_loan = c("L1", "L2"), # L1 should ID match, L2 should fuzzy match + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = c("UP won't fuzzy match", "UP will fuzzy match"), + lei_direct_loantaker = c("LEI123", NA_character_) + ) + + abcd <- fake_abcd( + name_company = c("a power company", "UP will fuzzy match"), + lei = c("LEI123", NA_character_) + ) + + out <- match_name( + loanbook, + abcd, + join_id = c(lei_direct_loantaker = "lei") + ) + + out <- split(out, out$id_loan) + + expect_equal(out$L1$lei_direct_loantaker, "LEI123") + expect_equal(out$L2$name_abcd, "UP will fuzzy match") + +}) diff --git a/tests/testthat/test-prioritize.R b/tests/testthat/test-prioritize.R index bd90d2c6..33a6d6f1 100644 --- a/tests/testthat/test-prioritize.R +++ b/tests/testthat/test-prioritize.R @@ -235,3 +235,13 @@ test_that("with 0-row input returns 0-row input", { expect_no_error(prioritize(zero_row)) }) + +test_that("with `match_name` with `join_id`, outputs as expected (#135)", { + skip_if_r2dii_data_outdated() + lbk <- fake_lbk(id_loan = "L1", id_col = "1") + abcd <- fake_abcd(id_col = "1") + matched <- match_name(lbk, abcd, join_id = "id_col") + + out <- prioritize(matched) + expect_equal(nrow(out), 1L) +})