From e32fee6d257bff782d331893b004044d36030a92 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 08:33:41 +0100 Subject: [PATCH 01/28] add ID join option preserving old functionality --- R/match_name.R | 74 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index a2910b81..62ab22ec 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -31,6 +31,9 @@ #' only `sector`, the value in the `name` column should be `NA` and #' vice-versa. This file can be used to manually match loanbook companies to #' abcd. +#' @param join_id A character string naming an ID column used to left join +#' `abcd` to the `loanbook` (e.g. "lei"). ID column must be present, as named, +#' in both input datasets. #' @param ... Arguments passed on to [stringdist::stringsim()]. #' @param ald `r lifecycle::badge('superseded')` `ald` has been superseded by #' `abcd`. @@ -111,6 +114,7 @@ match_name <- function(loanbook, method = "jw", p = 0.1, overwrite = NULL, + join_id = NULL, ald = deprecated(), ...) { restore <- options(datatable.allow.cartesian = TRUE) @@ -125,9 +129,28 @@ match_name <- function(loanbook, abcd <- ald } - match_name_impl( + prep_abcd <- restructure_abcd(abcd) + + if (!is.null(join_id)) { + check_join_id(join_id, loanbook, prep_abcd) + + join_matched <- dplyr::inner_join( + loanbook, prep_abcd, by = join_id + ) + join_matched <- dplyr::mutate( + join_matched, + !!join_id := NULL, + score = 1 + ) + + loanbook <- dplyr::filter( + loanbook, !.data[[join_id]] %in% join_matched[[join_id]] + ) + } + + out <- match_name_impl( loanbook = loanbook, - abcd = abcd, + prep_abcd = prep_abcd, by_sector = by_sector, min_score = min_score, method = method, @@ -135,16 +158,30 @@ match_name <- function(loanbook, overwrite = overwrite, ... ) + + if (nrow(out) != 0 && exists("join_matched")) { + out <- dplyr::bind_rows(join_matched, out) + } else if (nrow(out) == 0 && exists("join_matched")) { + out <- join_matched + } + + if (identical(nrow(out), 0L)) { + rlang::warn("Found no match.") + return(empty_loanbook_tibble(loanbook, dplyr::groups(loanbook))) + } + + out } match_name_impl <- function(loanbook, - abcd, + prep_abcd, by_sector = TRUE, min_score = 0.8, method = "jw", p = 0.1, overwrite = NULL, ...) { + old_groups <- dplyr::groups(loanbook) loanbook <- ungroup(loanbook) @@ -153,7 +190,6 @@ match_name_impl <- function(loanbook, loanbook_rowid <- tibble::rowid_to_column(loanbook) prep_lbk <- restructure_loanbook(loanbook_rowid, overwrite = overwrite) - prep_abcd <- restructure_abcd(abcd) if (by_sector) { a <- expand_alias(prep_lbk, prep_abcd) @@ -164,8 +200,8 @@ match_name_impl <- function(loanbook, setDT(a) if (identical(nrow(a), 0L)) { - rlang::warn("Found no match.") - return(empty_loanbook_tibble(loanbook, old_groups)) + rlang::inform("Found no match via fuzzy matching.") + return(a) } a <- unique(a)[ @@ -179,8 +215,8 @@ match_name_impl <- function(loanbook, a <- a[score >= min_score, ] if (identical(nrow(a), 0L)) { - rlang::warn("Found no match.") - return(empty_loanbook_tibble(loanbook, old_groups)) + rlang::inform("Found no match via fuzzy matching.") + return(a) } l <- rename(prep_lbk, alias_lbk = "alias") @@ -349,3 +385,25 @@ names_added_by_match_name <- function() { "borderline" ) } + +check_join_id <- function(join_id, loanbook, abcd) { + stopifnot(is.character(join_id)) + + if (!rlang::has_name(loanbook, join_id)) { + rlang::abort( + "join_id_not_in_loanbook", + message = glue( + "The join_id `{join_id}` must be present in both `loanbook` and `abcd`. It's not present in `loanbook`." + ) + ) + } else if (!rlang::has_name(abcd, join_id)) { + rlang::abort( + "join_id_not_in_abcd", + message = glue( + "The join_id `{join_id}` must be present in both `loanbook` and `abcd`. It's not present in `abcd`." + ) + ) + } + + invisible(join_id) +} From d7e91c0c011c2a6749eef384fae795035a1fbe36 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 08:56:28 +0100 Subject: [PATCH 02/28] restructure_abcd gains join_id col --- R/match_name.R | 2 +- R/restructure_abcd.R | 28 ++++++++++++++++++++++------ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index 62ab22ec..57985196 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -129,7 +129,7 @@ match_name <- function(loanbook, abcd <- ald } - prep_abcd <- restructure_abcd(abcd) + prep_abcd <- restructure_abcd(abcd, join_id) if (!is.null(join_id)) { check_join_id(join_id, loanbook, prep_abcd) diff --git a/R/restructure_abcd.R b/R/restructure_abcd.R index 06c8bd6a..095181d8 100644 --- a/R/restructure_abcd.R +++ b/R/restructure_abcd.R @@ -5,6 +5,8 @@ #' from values in the `name_company` column. #' #' @param data A data frame. Should be an asset-level dataset. +#' @param join_id (Optional) A string giving the name of an `ID` column to +#' preserve in the restructuring #' #' @seealso [r2dii.data::abcd_demo] `to_alias()`. #' @@ -14,13 +16,27 @@ #' @examples #' restructure_abcd(r2dii.data::abcd_demo) #' @noRd -restructure_abcd <- function(data) { - check_crucial_names(data, c("name_company", "sector")) +restructure_abcd <- function(data, join_id = NULL) { + crucial_names <- c("name_company", "sector") + if (!is.null(join_id)) { + check_crucial_names(data, c(join_id, crucial_names)) + + out <- dplyr::transmute( + data, + name = .data$name_company, + sector = tolower(.data$sector), + !!join_id := .data[[join_id]] + ) + } else { + check_crucial_names(data, crucial_names) + + out <- dplyr::transmute( + data, + name = .data$name_company, + sector = tolower(.data$sector) + ) + } - out <- dplyr::transmute( - data, - name = .data$name_company, sector = tolower(.data$sector) - ) out <- distinct(out) add_alias(out) } From a23be0870462b3deec0ba1ed966b7e0005061281 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 09:04:05 +0100 Subject: [PATCH 03/28] new argument gains some tests --- tests/testthat/test-match_name.R | 33 ++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index e5b28bb6..6878ec23 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -723,3 +723,36 @@ test_that("allows custom `sector_classifications` via options() (#354)", { expect_equal(nrow(out), 1L) options(old) }) + +test_that("`join_id` optionally joins as expected (#135)", { + + loanbook <- tibble( + sector_classification_system = "NACE", + sector_classification_direct_loantaker = "100", # this generally shouldn't match to anything + id_ultimate_parent = c("UP15", "UP16"), + name_ultimate_parent = c("Foo", "Bar"), + id_direct_loantaker = c("C294", "C295"), + name_direct_loantaker = "Baz", + id_col = c("LEI123", NA_character_) + ) + + abcd <- tibble( + name_company = "alpine knits india pvt. limited", + sector = "power", + id_col = "LEI123" + ) + + # expect no match here, the company names are completely different + expect_warning(match_name(loanbook, abcd, join_id = NULL), "no match") + + # expect exactly one match here, based on input ID + out_with_join_id <- match_name(loanbook, abcd, join_id = "id_col") + expect_equal(nrow(out_with_join_id), 1L) + + # should work for id_col with any name, e.g. `lei` + loanbook_lei <- rename(loanbook, lei = id_col) + abcd_lei <- rename(abcd, lei = id_col) + out_lei <- match_name(loanbook_lei, abcd_lei, join_id = "lei") + expect_equal(nrow(out_lei), 1L) + +}) From 4af13abcf6020f549aee3dfe2049e1d9fb940af1 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 09:06:05 +0100 Subject: [PATCH 04/28] update NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 309f1189..01fc90f6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # r2dii.match (development version) +* `match_name` gains argument `join_id` allowing an optional perfect join based on a mutual ID column between `loanbook` and `abcd` inputs, prior to attempting fuzzy matching (#135). + # r2dii.match 0.1.4 * `to_alias` can now handle strange encodings without error (#425, @kalashsinghal @Tilmon). From 841794c80e91fc749c9f55bed2c6e483e6335484 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 09:06:21 +0100 Subject: [PATCH 05/28] document --- man/match_name.Rd | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/man/match_name.Rd b/man/match_name.Rd index bf4bebaf..d78733c6 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -12,6 +12,7 @@ match_name( method = "jw", p = 0.1, overwrite = NULL, + join_id = NULL, ald = deprecated(), ... ) @@ -39,6 +40,10 @@ only \code{sector}, the value in the \code{name} column should be \code{NA} and vice-versa. This file can be used to manually match loanbook companies to abcd.} +\item{join_id}{A character string naming an ID column used to left join +\code{abcd} to the \code{loanbook} (e.g. "lei"). ID column must be present, as named, +in both input datasets.} + \item{ald}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{ald} has been superseded by \code{abcd}.} From a44a868731e86041f98f48ae1e6c45a329ace637 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 17:03:20 +0100 Subject: [PATCH 06/28] expect output contains optional join_by col --- tests/testthat/test-match_name.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 6878ec23..befad7c1 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -724,6 +724,8 @@ test_that("allows custom `sector_classifications` via options() (#354)", { options(old) }) + + test_that("`join_id` optionally joins as expected (#135)", { loanbook <- tibble( @@ -756,3 +758,15 @@ test_that("`join_id` optionally joins as expected (#135)", { expect_equal(nrow(out_lei), 1L) }) + +test_that("with `join_id`, outputs data with join column (#135)", { + + out <- match_name( + fake_lbk(foo = "1"), + fake_abcd(foo = "1"), + join_id = "foo" + ) + + expect_contains(names(out), "foo") + +}) From 08f6b3ab9fe1ac265a43306c00595a3918aabdc7 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 7 Mar 2024 17:03:27 +0100 Subject: [PATCH 07/28] dont remove join_by column --- R/match_name.R | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index 57985196..b8b86b54 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -137,31 +137,33 @@ match_name <- function(loanbook, join_matched <- dplyr::inner_join( loanbook, prep_abcd, by = join_id ) - join_matched <- dplyr::mutate( - join_matched, - !!join_id := NULL, - score = 1 - ) + + join_matched <- dplyr::mutate(join_matched, score = 1) loanbook <- dplyr::filter( - loanbook, !.data[[join_id]] %in% join_matched[[join_id]] - ) + loanbook, + !.data[[join_id]] %in% join_matched[[join_id]] + ) } - out <- match_name_impl( - loanbook = loanbook, - prep_abcd = prep_abcd, - by_sector = by_sector, - min_score = min_score, - method = method, - p = p, - overwrite = overwrite, - ... - ) + if (nrow(loanbook) != 0) { + fuzzy_matched <- match_name_impl( + loanbook = loanbook, + prep_abcd = prep_abcd, + by_sector = by_sector, + min_score = min_score, + method = method, + p = p, + overwrite = overwrite, + ... + ) + } else { + fuzzy_matched <- tibble() + } - if (nrow(out) != 0 && exists("join_matched")) { - out <- dplyr::bind_rows(join_matched, out) - } else if (nrow(out) == 0 && exists("join_matched")) { + if (exists("join_matched")) { + out <- dplyr::bind_rows(join_matched, fuzzy_matched) + } else if (nrow(fuzzy_matched) == 0 && exists("join_matched")) { out <- join_matched } From 0a674331e8b158c55f091ff5da937c1cd63dc363 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Mon, 11 Mar 2024 09:38:10 +0100 Subject: [PATCH 08/28] gains as_join_by (from dplyr) --- R/match_name.R | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/R/match_name.R b/R/match_name.R index b8b86b54..6d1bb20b 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -409,3 +409,35 @@ check_join_id <- function(join_id, loanbook, abcd) { invisible(join_id) } + +as_join_by <- function(x) { + + if (rlang::is_list(x)) { + if (length(x) != 1L) { + rlang::abort("`join_id` must be a list of length 1.") + } + x_name <- names(x) %||% x + y_name <- unname(x) + } else if (rlang::is_character(x)) { + x_name <- names(x) %||% x + y_name <- unname(x) + + # If x partially named, assume unnamed are the same in both tables + x_name[x_name == ""] <- y_name[x_name == ""] + } else { + rlang::abort("`by` must be a list or a character vector.") + } + + if (!rlang::is_character(x_name)) { + rlang::abort("`by$x` must evaluate to a character vector.") + } + if (!rlang::is_character(y_name)) { + rlang::abort("`by$y` must evaluate to a character vector.") + } + + c(x_name, y_name) +} + +`%||%` <- function(x, y) { + if (is.null(x)) y else x +} From 4490e634e902da6b8fc0bcc5dfc4a90fcc9097fa Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Mon, 11 Mar 2024 10:00:56 +0100 Subject: [PATCH 09/28] accept named list input --- R/match_name.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index 6d1bb20b..8c37fb09 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -140,9 +140,12 @@ match_name <- function(loanbook, join_matched <- dplyr::mutate(join_matched, score = 1) + join_by_list <- as_join_by(join_id) + loanbook_join_id <- join_by_list[[1]] + loanbook <- dplyr::filter( loanbook, - !.data[[join_id]] %in% join_matched[[join_id]] + !.data[[loanbook_join_id]] %in% join_matched[[loanbook_join_id]] ) } @@ -165,6 +168,8 @@ match_name <- function(loanbook, out <- dplyr::bind_rows(join_matched, fuzzy_matched) } else if (nrow(fuzzy_matched) == 0 && exists("join_matched")) { out <- join_matched + } else { + out <- fuzzy_matched } if (identical(nrow(out), 0L)) { @@ -389,16 +394,17 @@ names_added_by_match_name <- function() { } check_join_id <- function(join_id, loanbook, abcd) { - stopifnot(is.character(join_id)) - if (!rlang::has_name(loanbook, join_id)) { + join_id_list <- as_join_by(join_id) + + if (!rlang::has_name(loanbook, join_id_list[[1]])) { rlang::abort( "join_id_not_in_loanbook", message = glue( "The join_id `{join_id}` must be present in both `loanbook` and `abcd`. It's not present in `loanbook`." ) ) - } else if (!rlang::has_name(abcd, join_id)) { + } else if (!rlang::has_name(abcd, join_id_list[[2]])) { rlang::abort( "join_id_not_in_abcd", message = glue( From e3d71edd5ff948b5a20a388fa1c512fbab3c3dd8 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Mon, 11 Mar 2024 10:00:59 +0100 Subject: [PATCH 10/28] refactor tests --- tests/testthat/test-match_name.R | 57 +++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index befad7c1..e9417d8b 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -724,23 +724,15 @@ test_that("allows custom `sector_classifications` via options() (#354)", { options(old) }) - - test_that("`join_id` optionally joins as expected (#135)", { - loanbook <- tibble( - sector_classification_system = "NACE", - sector_classification_direct_loantaker = "100", # this generally shouldn't match to anything - id_ultimate_parent = c("UP15", "UP16"), - name_ultimate_parent = c("Foo", "Bar"), - id_direct_loantaker = c("C294", "C295"), - name_direct_loantaker = "Baz", - id_col = c("LEI123", NA_character_) - ) + loanbook <- fake_lbk( + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = "UP won't fuzzy match", + id_col = "LEI123" + ) - abcd <- tibble( - name_company = "alpine knits india pvt. limited", - sector = "power", + abcd <- fake_abcd( id_col = "LEI123" ) @@ -750,17 +742,33 @@ test_that("`join_id` optionally joins as expected (#135)", { # expect exactly one match here, based on input ID out_with_join_id <- match_name(loanbook, abcd, join_id = "id_col") expect_equal(nrow(out_with_join_id), 1L) +}) + +test_that("with `join_id` accepts list input indicating different cols", { - # should work for id_col with any name, e.g. `lei` - loanbook_lei <- rename(loanbook, lei = id_col) - abcd_lei <- rename(abcd, lei = id_col) - out_lei <- match_name(loanbook_lei, abcd_lei, join_id = "lei") - expect_equal(nrow(out_lei), 1L) + loanbook <- fake_lbk( + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = "UP won't fuzzy match", + lei_direct_loantaker = "LEI123" + ) + + abcd <- fake_abcd( + lei = "LEI123" + ) + + out_with_join_id <- match_name( + loanbook, + abcd, + join_id = c(lei_direct_loantaker = "lei") + ) + + expect_equal(nrow(out_with_join_id), 1L) }) -test_that("with `join_id`, outputs data with join column (#135)", { +test_that("with `join_id`, outputs data with loanbook join column (#135)", { + # with `join_id` as character out <- match_name( fake_lbk(foo = "1"), fake_abcd(foo = "1"), @@ -769,4 +777,13 @@ test_that("with `join_id`, outputs data with join column (#135)", { expect_contains(names(out), "foo") + # with `join_id` as named list + out <- match_name( + fake_lbk(foo_lbk = "1"), + fake_abcd(foo_abcd = "1"), + join_id = c(foo_lbk = "foo_abcd") + ) + + expect_contains(names(out), "foo_lbk") + }) From 2d3e62fedc71d169a799f4d888423031a55748e7 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Mon, 11 Mar 2024 10:04:40 +0100 Subject: [PATCH 11/28] update docs --- R/match_name.R | 7 ++++--- man/match_name.Rd | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index 8c37fb09..f617c8ee 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -31,9 +31,10 @@ #' only `sector`, the value in the `name` column should be `NA` and #' vice-versa. This file can be used to manually match loanbook companies to #' abcd. -#' @param join_id A character string naming an ID column used to left join -#' `abcd` to the `loanbook` (e.g. "lei"). ID column must be present, as named, -#' in both input datasets. +#' @param join_id A join specification passe to [dplyr::inner_join()]. If a +#' character string, it assumes identical join columns between `loanbook` and +#' `abcd`. If a named list, uses the name as the join column of `loanbook` and +#' the value as the join column of `abcd`. #' @param ... Arguments passed on to [stringdist::stringsim()]. #' @param ald `r lifecycle::badge('superseded')` `ald` has been superseded by #' `abcd`. diff --git a/man/match_name.Rd b/man/match_name.Rd index d78733c6..25d2ad7e 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -40,9 +40,10 @@ only \code{sector}, the value in the \code{name} column should be \code{NA} and vice-versa. This file can be used to manually match loanbook companies to abcd.} -\item{join_id}{A character string naming an ID column used to left join -\code{abcd} to the \code{loanbook} (e.g. "lei"). ID column must be present, as named, -in both input datasets.} +\item{join_id}{A join specification passe to \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}}. If a +character string, it assumes identical join columns between \code{loanbook} and +\code{abcd}. If a named list, uses the name as the join column of \code{loanbook} and +the value as the join column of \code{abcd}.} \item{ald}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{ald} has been superseded by \code{abcd}.} From b1f2044723bd70c29f8db3acb2260730be3284f0 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Mon, 11 Mar 2024 10:06:23 +0100 Subject: [PATCH 12/28] add example --- R/match_name.R | 19 +++++++++++++++++++ man/match_name.Rd | 19 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/R/match_name.R b/R/match_name.R index f617c8ee..86b09edd 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -87,6 +87,25 @@ #' code_system = "XYZ" #' ) #' +#' # match on LEI +#' loanbook <- tibble( +#' sector_classification_system = "XYZ", +#' sector_classification_direct_loantaker = "D35.11", +#' id_ultimate_parent = "UP15", +#' name_ultimate_parent = "Won't fuzzy match", +#' id_direct_loantaker = "C294", +#' name_direct_loantaker = "Won't fuzzy match", +#' lei_direct_loantaker = "LEI123" +#' ) +#' +#' abcd <- tibble( +#' name_company = "alpine knits india pvt. limited", +#' sector = "power", +#' lei = "LEI123" +#' ) +#' +#' match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei")) +#' #' restore <- options(r2dii.match.sector_classifications = your_classifications) #' #' loanbook <- tibble( diff --git a/man/match_name.Rd b/man/match_name.Rd index 25d2ad7e..a5aaf06a 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -130,6 +130,25 @@ your_classifications <- tibble( code_system = "XYZ" ) +# match on LEI +loanbook <- tibble( + sector_classification_system = "XYZ", + sector_classification_direct_loantaker = "D35.11", + id_ultimate_parent = "UP15", + name_ultimate_parent = "Won't fuzzy match", + id_direct_loantaker = "C294", + name_direct_loantaker = "Won't fuzzy match", + lei_direct_loantaker = "LEI123" +) + +abcd <- tibble( + name_company = "alpine knits india pvt. limited", + sector = "power", + lei = "LEI123" +) + +match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei")) + restore <- options(r2dii.match.sector_classifications = your_classifications) loanbook <- tibble( From c60ad21b4e815bc11cf58d7b16c0510c3ba32605 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 09:02:13 +0100 Subject: [PATCH 13/28] update test name --- tests/testthat/test-match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index e9417d8b..607299ef 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -724,7 +724,7 @@ test_that("allows custom `sector_classifications` via options() (#354)", { options(old) }) -test_that("`join_id` optionally joins as expected (#135)", { +test_that("with `join_id`, joins as expected (#135)", { loanbook <- fake_lbk( name_direct_loantaker = "DL won't fuzzy match", From acb0ef3929361edc52175db2196790b67d7de15d Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 09:15:31 +0100 Subject: [PATCH 14/28] add more complex tests --- tests/testthat/test-match_name.R | 60 +++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 607299ef..536e3925 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -742,9 +742,12 @@ test_that("with `join_id`, joins as expected (#135)", { # expect exactly one match here, based on input ID out_with_join_id <- match_name(loanbook, abcd, join_id = "id_col") expect_equal(nrow(out_with_join_id), 1L) + expect_contains(names(out_with_join_id), "id_col") + expect_equal(unique(out_with_join_id$id_col), "LEI123") + }) -test_that("with `join_id` accepts list input indicating different cols", { +test_that("with `join_id` accepts list input indicating different cols (#135)", { loanbook <- fake_lbk( name_direct_loantaker = "DL won't fuzzy match", @@ -763,6 +766,8 @@ test_that("with `join_id` accepts list input indicating different cols", { ) expect_equal(nrow(out_with_join_id), 1L) + expect_contains(names(out_with_join_id), "lei_direct_loantaker") + expect_equal(unique(out_with_join_id$lei_direct_loantaker), "LEI123") }) @@ -787,3 +792,56 @@ test_that("with `join_id`, outputs data with loanbook join column (#135)", { expect_contains(names(out), "foo_lbk") }) + +test_that("with `join_id` and multiple matches, prefers ID (#135)", { + + # loanbook might match at UP + loanbook <- fake_lbk( + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = "UP will fuzzy match", + lei_direct_loantaker = "LEI123" + ) + + abcd <- fake_abcd( + lei = "LEI123", + name_company = "UP will fuzzy match" + ) + + # with `join_id` as named list + out <- match_name( + loanbook, + abcd, + join_id = c(lei_direct_loantaker = "lei") + ) + + expect_equal(unique(out_with_join_id$lei_direct_loantaker), "LEI123") + +}) + +test_that("with `join_id` and one ID match, one fuzzy match, outputs as expected (#135)", { + + + loanbook <- fake_lbk( + id_loan = c("L1", "L2"), # L1 should ID match, L2 should fuzzy match + name_direct_loantaker = "DL won't fuzzy match", + name_ultimate_parent = c("UP won't fuzzy match", "UP will fuzzy match"), + lei_direct_loantaker = c("LEI123", NA_character_) + ) + + abcd <- fake_abcd( + name_company = c("a power company", "UP will fuzzy match"), + lei = c("LEI123", NA_character_) + ) + + out <- match_name( + loanbook, + abcd, + join_id = c(lei_direct_loantaker = "lei") + ) + + out <- split(out, out$id_loan) + + expect_equal(out$L1$lei_direct_loantaker, "LEI123") + expect_equal(out$L2$name, "UP will fuzzy match") + +}) From d930aa6051d9000a15df52734cbb0ceea440b21f Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 09:16:07 +0100 Subject: [PATCH 15/28] update test --- tests/testthat/test-match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 536e3925..c15922bd 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -814,7 +814,7 @@ test_that("with `join_id` and multiple matches, prefers ID (#135)", { join_id = c(lei_direct_loantaker = "lei") ) - expect_equal(unique(out_with_join_id$lei_direct_loantaker), "LEI123") + expect_equal(unique(out$lei_direct_loantaker), "LEI123") }) From 89716c63ee65e141234d2f83ac13c5e89b490a66 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 09:46:11 +0100 Subject: [PATCH 16/28] restructure abcd properly --- R/match_name.R | 31 ++++++++++++++++++++++--------- R/restructure_abcd.R | 25 +++++++------------------ tests/testthat/test-match_name.R | 2 +- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index 86b09edd..ae1648cd 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -149,16 +149,28 @@ match_name <- function(loanbook, abcd <- ald } - prep_abcd <- restructure_abcd(abcd, join_id) - if (!is.null(join_id)) { - check_join_id(join_id, loanbook, prep_abcd) + check_join_id(join_id, loanbook, abcd) - join_matched <- dplyr::inner_join( - loanbook, prep_abcd, by = join_id - ) + crucial_names <- c("name_company", "sector", join_id) + check_crucial_names(abcd, crucial_names) + + prep_abcd <- dplyr::transmute( + abcd, + name_abcd = .data[["name_company"]], + sector_abcd = tolower(.data[["sector"]]), + !!join_id := .data[[join_id]] + ) - join_matched <- dplyr::mutate(join_matched, score = 1) + prep_abcd <- dplyr::distinct(prep_abcd) + + join_matched <- dplyr::inner_join(loanbook, prep_abcd, by = join_id) + + join_matched <- dplyr::mutate( + join_matched, + score = 1, + source = "id joined" + ) join_by_list <- as_join_by(join_id) loanbook_join_id <- join_by_list[[1]] @@ -172,7 +184,7 @@ match_name <- function(loanbook, if (nrow(loanbook) != 0) { fuzzy_matched <- match_name_impl( loanbook = loanbook, - prep_abcd = prep_abcd, + abcd = abcd, by_sector = by_sector, min_score = min_score, method = method, @@ -201,7 +213,7 @@ match_name <- function(loanbook, } match_name_impl <- function(loanbook, - prep_abcd, + abcd, by_sector = TRUE, min_score = 0.8, method = "jw", @@ -217,6 +229,7 @@ match_name_impl <- function(loanbook, loanbook_rowid <- tibble::rowid_to_column(loanbook) prep_lbk <- restructure_loanbook(loanbook_rowid, overwrite = overwrite) + prep_abcd <- restructure_abcd(abcd) if (by_sector) { a <- expand_alias(prep_lbk, prep_abcd) diff --git a/R/restructure_abcd.R b/R/restructure_abcd.R index 095181d8..14a273b7 100644 --- a/R/restructure_abcd.R +++ b/R/restructure_abcd.R @@ -16,26 +16,15 @@ #' @examples #' restructure_abcd(r2dii.data::abcd_demo) #' @noRd -restructure_abcd <- function(data, join_id = NULL) { +restructure_abcd <- function(data) { crucial_names <- c("name_company", "sector") - if (!is.null(join_id)) { - check_crucial_names(data, c(join_id, crucial_names)) - - out <- dplyr::transmute( - data, - name = .data$name_company, - sector = tolower(.data$sector), - !!join_id := .data[[join_id]] - ) - } else { - check_crucial_names(data, crucial_names) + check_crucial_names(data, crucial_names) - out <- dplyr::transmute( - data, - name = .data$name_company, - sector = tolower(.data$sector) - ) - } + out <- dplyr::transmute( + data, + name = .data$name_company, + sector = tolower(.data$sector) + ) out <- distinct(out) add_alias(out) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index c15922bd..032485f9 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -842,6 +842,6 @@ test_that("with `join_id` and one ID match, one fuzzy match, outputs as expected out <- split(out, out$id_loan) expect_equal(out$L1$lei_direct_loantaker, "LEI123") - expect_equal(out$L2$name, "UP will fuzzy match") + expect_equal(out$L2$name_abcd, "UP will fuzzy match") }) From 4a815ba7bdbc6ca43f71104b9185eef3a7408fcd Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 10:59:18 +0100 Subject: [PATCH 17/28] test that new arg works with prioritize --- tests/testthat/test-prioritize.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/testthat/test-prioritize.R b/tests/testthat/test-prioritize.R index bd90d2c6..5186777a 100644 --- a/tests/testthat/test-prioritize.R +++ b/tests/testthat/test-prioritize.R @@ -235,3 +235,12 @@ test_that("with 0-row input returns 0-row input", { expect_no_error(prioritize(zero_row)) }) + +test_that("with `join_id`, outputs as expected", { + lbk <- fake_lbk(id_loan = "L1", id_col = "1") + abcd <- fake_abcd(id_col = "1") + matched <- match_name(lbk, abcd, join_id = "id_col") + + out <- prioritize(matched) + expect_equal(nrow(out), 1L) +}) From f16334d1d538cf3824d0330e0f76a84741f14ed6 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 10:59:24 +0100 Subject: [PATCH 18/28] solve failing test --- R/match_name.R | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index ae1648cd..aa5068d6 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -164,17 +164,27 @@ match_name <- function(loanbook, prep_abcd <- dplyr::distinct(prep_abcd) - join_matched <- dplyr::inner_join(loanbook, prep_abcd, by = join_id) - - join_matched <- dplyr::mutate( - join_matched, - score = 1, - source = "id joined" + prep_lbk <- may_add_sector_and_borderline(loanbook) + prep_lbk <- distinct(prep_lbk) + + join_matched <- dplyr::inner_join( + prep_lbk, + prep_abcd, + by = join_id, + na_matches = "never" ) join_by_list <- as_join_by(join_id) loanbook_join_id <- join_by_list[[1]] + join_matched <- dplyr::mutate( + join_matched, + score = 1, + source = "id joined", + level = loanbook_join_id, + name = .data[["name_abcd"]] + ) + loanbook <- dplyr::filter( loanbook, !.data[[loanbook_join_id]] %in% join_matched[[loanbook_join_id]] From e8d98d2afd088aaeca833e4c3ecfa8d2c663fec5 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Tue, 12 Mar 2024 11:09:34 +0100 Subject: [PATCH 19/28] skip tests if r2dii.data outdated --- tests/testthat/test-match_name.R | 11 +++++------ tests/testthat/test-prioritize.R | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 032485f9..59b11441 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -725,7 +725,7 @@ test_that("allows custom `sector_classifications` via options() (#354)", { }) test_that("with `join_id`, joins as expected (#135)", { - + skip_if_r2dii_data_outdated() loanbook <- fake_lbk( name_direct_loantaker = "DL won't fuzzy match", name_ultimate_parent = "UP won't fuzzy match", @@ -748,7 +748,7 @@ test_that("with `join_id`, joins as expected (#135)", { }) test_that("with `join_id` accepts list input indicating different cols (#135)", { - + skip_if_r2dii_data_outdated() loanbook <- fake_lbk( name_direct_loantaker = "DL won't fuzzy match", name_ultimate_parent = "UP won't fuzzy match", @@ -772,7 +772,7 @@ test_that("with `join_id` accepts list input indicating different cols (#135)", }) test_that("with `join_id`, outputs data with loanbook join column (#135)", { - + skip_if_r2dii_data_outdated() # with `join_id` as character out <- match_name( fake_lbk(foo = "1"), @@ -794,7 +794,7 @@ test_that("with `join_id`, outputs data with loanbook join column (#135)", { }) test_that("with `join_id` and multiple matches, prefers ID (#135)", { - + skip_if_r2dii_data_outdated() # loanbook might match at UP loanbook <- fake_lbk( name_direct_loantaker = "DL won't fuzzy match", @@ -819,8 +819,7 @@ test_that("with `join_id` and multiple matches, prefers ID (#135)", { }) test_that("with `join_id` and one ID match, one fuzzy match, outputs as expected (#135)", { - - + skip_if_r2dii_data_outdated() loanbook <- fake_lbk( id_loan = c("L1", "L2"), # L1 should ID match, L2 should fuzzy match name_direct_loantaker = "DL won't fuzzy match", diff --git a/tests/testthat/test-prioritize.R b/tests/testthat/test-prioritize.R index 5186777a..33a6d6f1 100644 --- a/tests/testthat/test-prioritize.R +++ b/tests/testthat/test-prioritize.R @@ -236,7 +236,8 @@ test_that("with 0-row input returns 0-row input", { expect_no_error(prioritize(zero_row)) }) -test_that("with `join_id`, outputs as expected", { +test_that("with `match_name` with `join_id`, outputs as expected (#135)", { + skip_if_r2dii_data_outdated() lbk <- fake_lbk(id_loan = "L1", id_col = "1") abcd <- fake_abcd(id_col = "1") matched <- match_name(lbk, abcd, join_id = "id_col") From 9814c6c03b064315b1010ef7239c5cb112a03908 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 11:17:20 +0100 Subject: [PATCH 20/28] fix incorrect error messages --- R/match_name.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index aa5068d6..8e47d6f9 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -444,14 +444,14 @@ check_join_id <- function(join_id, loanbook, abcd) { rlang::abort( "join_id_not_in_loanbook", message = glue( - "The join_id `{join_id}` must be present in both `loanbook` and `abcd`. It's not present in `loanbook`." + "The join_id `{join_id_list[[1]]}` must be present in `loanbook` input." ) ) } else if (!rlang::has_name(abcd, join_id_list[[2]])) { rlang::abort( "join_id_not_in_abcd", message = glue( - "The join_id `{join_id}` must be present in both `loanbook` and `abcd`. It's not present in `abcd`." + "The join_id `{join_id_list[[2]]}` must be present in `abcd` input." ) ) } From 08f989e84dc85d3ed5d57577d0fdc64ae4717c00 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 11:17:29 +0100 Subject: [PATCH 21/28] fix comment in test --- tests/testthat/test-match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 59b11441..15c1bf33 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -782,7 +782,7 @@ test_that("with `join_id`, outputs data with loanbook join column (#135)", { expect_contains(names(out), "foo") - # with `join_id` as named list + # with `join_id` as a named vector out <- match_name( fake_lbk(foo_lbk = "1"), fake_abcd(foo_abcd = "1"), From 704fd061abd0523d4dbd51be7c55cb0e7bb6a932 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 11:18:11 +0100 Subject: [PATCH 22/28] fix wording of test --- tests/testthat/test-match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 15c1bf33..79ea4812 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -807,7 +807,7 @@ test_that("with `join_id` and multiple matches, prefers ID (#135)", { name_company = "UP will fuzzy match" ) - # with `join_id` as named list + # with `join_id` as named vector out <- match_name( loanbook, abcd, From 79382c11b7e3081a1a1fefac53e9cac11a22df0a Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 11:20:10 +0100 Subject: [PATCH 23/28] Update R/match_name.R Co-authored-by: CJ Yetman --- R/match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/match_name.R b/R/match_name.R index 8e47d6f9..879939a2 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -31,7 +31,7 @@ #' only `sector`, the value in the `name` column should be `NA` and #' vice-versa. This file can be used to manually match loanbook companies to #' abcd. -#' @param join_id A join specification passe to [dplyr::inner_join()]. If a +#' @param join_id A join specification passed to [dplyr::inner_join()]. If a #' character string, it assumes identical join columns between `loanbook` and #' `abcd`. If a named list, uses the name as the join column of `loanbook` and #' the value as the join column of `abcd`. From 7b13092ade827f19b3d4b0562024c59252f6c3bf Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 11:20:26 +0100 Subject: [PATCH 24/28] document --- man/match_name.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/match_name.Rd b/man/match_name.Rd index a5aaf06a..76181339 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -40,7 +40,7 @@ only \code{sector}, the value in the \code{name} column should be \code{NA} and vice-versa. This file can be used to manually match loanbook companies to abcd.} -\item{join_id}{A join specification passe to \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}}. If a +\item{join_id}{A join specification passed to \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}}. If a character string, it assumes identical join columns between \code{loanbook} and \code{abcd}. If a named list, uses the name as the join column of \code{loanbook} and the value as the join column of \code{abcd}.} From 0fe36c62abeb9735e7ec3b0af2f00c9e96c44990 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 12:14:00 +0100 Subject: [PATCH 25/28] Update R/match_name.R Co-authored-by: Jacob Kastl <60064070+jacobvjk@users.noreply.github.com> --- R/match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/match_name.R b/R/match_name.R index 879939a2..6aac4078 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -33,7 +33,7 @@ #' abcd. #' @param join_id A join specification passed to [dplyr::inner_join()]. If a #' character string, it assumes identical join columns between `loanbook` and -#' `abcd`. If a named list, uses the name as the join column of `loanbook` and +#' `abcd`. If a named character vector, it uses the name as the join column of `loanbook` and #' the value as the join column of `abcd`. #' @param ... Arguments passed on to [stringdist::stringsim()]. #' @param ald `r lifecycle::badge('superseded')` `ald` has been superseded by From 8106fe77055be46378bd45794fbfd98c19b39645 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 12:14:08 +0100 Subject: [PATCH 26/28] Update tests/testthat/test-match_name.R Co-authored-by: Jacob Kastl <60064070+jacobvjk@users.noreply.github.com> --- tests/testthat/test-match_name.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-match_name.R b/tests/testthat/test-match_name.R index 79ea4812..136fb950 100644 --- a/tests/testthat/test-match_name.R +++ b/tests/testthat/test-match_name.R @@ -747,7 +747,7 @@ test_that("with `join_id`, joins as expected (#135)", { }) -test_that("with `join_id` accepts list input indicating different cols (#135)", { +test_that("with `join_id` accepts named character vector input indicating different cols (#135)", { skip_if_r2dii_data_outdated() loanbook <- fake_lbk( name_direct_loantaker = "DL won't fuzzy match", From 59e61966ba59c4a1920ccc512641271f686d3c4c Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 12:15:11 +0100 Subject: [PATCH 27/28] remove more mentions of list --- R/match_name.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/match_name.R b/R/match_name.R index 6aac4078..3a4f3c80 100644 --- a/R/match_name.R +++ b/R/match_name.R @@ -463,7 +463,7 @@ as_join_by <- function(x) { if (rlang::is_list(x)) { if (length(x) != 1L) { - rlang::abort("`join_id` must be a list of length 1.") + rlang::abort("`join_id` must be a vector of length 1.") } x_name <- names(x) %||% x y_name <- unname(x) @@ -474,7 +474,7 @@ as_join_by <- function(x) { # If x partially named, assume unnamed are the same in both tables x_name[x_name == ""] <- y_name[x_name == ""] } else { - rlang::abort("`by` must be a list or a character vector.") + rlang::abort("`by` must be a string or a character vector.") } if (!rlang::is_character(x_name)) { From afb28c67ba2b332229bfaa35d96fc5fd2d28bcf7 Mon Sep 17 00:00:00 2001 From: Jackson Hoffart Date: Thu, 14 Mar 2024 12:15:13 +0100 Subject: [PATCH 28/28] document --- man/match_name.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/match_name.Rd b/man/match_name.Rd index 76181339..a54945ee 100644 --- a/man/match_name.Rd +++ b/man/match_name.Rd @@ -42,7 +42,7 @@ abcd.} \item{join_id}{A join specification passed to \code{\link[dplyr:mutate-joins]{dplyr::inner_join()}}. If a character string, it assumes identical join columns between \code{loanbook} and -\code{abcd}. If a named list, uses the name as the join column of \code{loanbook} and +\code{abcd}. If a named character vector, it uses the name as the join column of \code{loanbook} and the value as the join column of \code{abcd}.} \item{ald}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} \code{ald} has been superseded by