-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5a5697f
commit 4945725
Showing
1 changed file
with
66 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,60 +1,75 @@ | ||
R_SPLIT = function(domain_dataset,max_length_out = 200){ | ||
out_n = outt = NULL | ||
|
||
#filtering columns > 200 | ||
char_200 = domain_dataset %>% select_if(~ max(nchar(.)) >= max_length_out) | ||
|
||
#string split function | ||
split_var <- function(string,max_length_out = 200) { | ||
|
||
# Pattern spot | ||
pattern = names(which.max(table(str_extract_all(string, "[:punct:]|[:blank:]")))) %>% | ||
ifelse(is.null(.), "",.) | ||
|
||
# Split the input string into a vector | ||
split_vector <- unlist(stringr::str_split(string, pattern)) | ||
|
||
# Function to concatenate strings and split when length exceeds 200 | ||
split_when_needed <- function(result, item, sep, max_length_out) { | ||
current <- utils::tail(result, 1) | ||
if (nchar(paste0(current, sep, item)) <= (max_length_out - 1)) { | ||
result[length(result)] <- paste0(current, sep, item) | ||
} else { | ||
if (!identical(sep, " ")) result[length(result)] <- paste0(current, sep) | ||
result <- c(result, item) | ||
} | ||
result | ||
} | ||
|
||
# Use reduce to apply the function across the vector | ||
split_vector <- split_vector[-1] |> | ||
purrr::reduce(split_when_needed, .init = list(split_vector[1]), pattern, max_length_out) |> | ||
unlist() | ||
|
||
# Fix case where sentence do not exceed max_length_out | ||
last_two <- paste0(utils::tail(split_vector, n = 2), collapse = if (identical(pattern, " ")) " " else "") | ||
if (nchar(last_two) <= max_length_out) { | ||
split_vector <- c( | ||
utils::head(split_vector, n = -1), | ||
last_two | ||
) | ||
} | ||
return(as.list(split_vector)) | ||
#' Split a single string | ||
#' | ||
#' @param result A list to store the result | ||
#' @param item The item to be added to the result | ||
#' @param sep The separator used to split the string | ||
#' @param max_length_out The maximum length of the output string | ||
#' @return A list with the split strings | ||
single_str_spilt <- function(result, item, sep, max_length_out) { | ||
current <- tail(result, 1) | ||
if (nchar(paste0(current, sep, item)) <= (max_length_out - 1)) { | ||
result[length(result)] <- paste0(current, sep, item) | ||
} else { | ||
if (!identical(sep, " ")) result[length(result)] <- paste0(current, sep) | ||
result <- c(result, item) | ||
} | ||
return(result) | ||
} | ||
|
||
#' Split a variable | ||
#' | ||
#' @param string The string to be split | ||
#' @param max_length_out The maximum length of the output string (default is 200) | ||
#' @return A list with the split strings | ||
split_var <- function(string, max_length_out = 200) { | ||
# Pattern spot | ||
pattern <- names(which.max(table(stringr::str_extract_all(string, "[:punct:]|[:blank:]")))) %>% | ||
ifelse(is.null(.), "", .) | ||
|
||
# Split the input string into a vector | ||
split_vector <- unlist(stringr::str_split(string, pattern)) | ||
|
||
# Use reduce to apply the function across the vector | ||
split_vector <- split_vector[-1] %>% | ||
purrr::reduce(single_str_spilt, .init = list(split_vector[1]), pattern, max_length_out) %>% | ||
unlist() | ||
|
||
# Fix case where sentence do not exceed max_length_out | ||
last_two <- paste0(tail(split_vector, n = 2), collapse = if (identical(pattern, " ")) " " else "") | ||
if (nchar(last_two) <= max_length_out) { | ||
split_vector <- c( | ||
head(split_vector, n = -1), | ||
last_two | ||
) | ||
} | ||
return(as.list(split_vector)) | ||
} | ||
|
||
#' Split a dataset string | ||
#' | ||
#' @param domain_dataset The dataset to be split | ||
#' @param max_length_out The maximum length of output string (default is 200) | ||
#' @return A dataset with the split strings | ||
sdtm_str_split <- function(domain_dataset, max_length_out = 200) { | ||
out_n <- outt <- NULL | ||
|
||
#FUNCTION CALL | ||
outt <- map(char_200, ~ { | ||
split_list <- map(.x, ~ { | ||
# Filtering columns > 200 | ||
char_200 <- dplyr::select_if(domain_dataset, ~ max(nchar(.)) >= max_length_out) | ||
|
||
# FUNCTION CALL | ||
outt <- purrr::map(char_200, ~ { | ||
split_list <- purrr::map(.x, ~ { | ||
cv <- as.data.frame(split_var(.x, max_length_out)) | ||
names(cv) <- seq_along(cv) | ||
cv | ||
}) | ||
split_df <- bind_rows(split_list) | ||
split_df | ||
}) %>% imap(.,~set_names(.x,.y)) %>% bind_cols() | ||
split_df <- dplyr::bind_rows(split_list) | ||
return(split_df) | ||
}) %>% | ||
purrr::imap(., ~ set_names(.x, .y)) %>% | ||
dplyr::bind_cols() | ||
|
||
names(outt) = sub("....$","",names(outt)) %>% make.unique(., sep = "_") | ||
dataset_OUT = bind_cols(domain_dataset %>% select(-names(char_200)),outt) | ||
names(outt) <- sub("....$", "", names(outt)) %>% make.unique(., sep = "_") | ||
dataset_OUT <- dplyr::bind_cols(dplyr::select(domain_dataset, -names(char_200)), outt) | ||
return(dataset_OUT) | ||
} | ||
|