Skip to content

Commit

Permalink
migrate parse_url, parse_query_string and build_url to cpp fo…
Browse files Browse the repository at this point in the history
…r performance improvement
  • Loading branch information
DyfanJones committed Jan 4, 2025
1 parent d8ce162 commit bb5e1ec
Show file tree
Hide file tree
Showing 11 changed files with 555 additions and 189 deletions.
1 change: 1 addition & 0 deletions paws.common/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* migrate backend `httr` to `httr2`
* new `PawsStreamHandler`, allows paws to handle aws stream event (#842). Thankyou to @hadley for developing the initial solution in `httr2`.
* deprecated custom handler for `s3_unmarshal_select_object_content`
* migrate `parse_url`, `parse_query_string` and `build_url` to `cpp` for performance improvement.

# paws.common 0.7.7
* fix unix time expiration check
Expand Down
16 changes: 16 additions & 0 deletions paws.common/R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ paws_url_encoder <- function(urls, safe = "") {
.Call('_paws_common_paws_url_encoder', PACKAGE = 'paws.common', urls, safe)
}

paws_url_unencoder <- function(urls) {
.Call('_paws_common_paws_url_unencoder', PACKAGE = 'paws.common', urls)
}

scan_ini_file <- function(filename) {
.Call('_paws_common_scan_ini_file', PACKAGE = 'paws.common', filename)
}
Expand Down Expand Up @@ -39,6 +43,18 @@ get_region_pattern <- function(region_pattern, region) {
.Call('_paws_common_get_region_pattern', PACKAGE = 'paws.common', region_pattern, region)
}

parse_query_string <- function(query) {
.Call('_paws_common_parse_query_string', PACKAGE = 'paws.common', query)
}

parse_url <- function(url) {
.Call('_paws_common_parse_url', PACKAGE = 'paws.common', url)
}

build_url <- function(url_components) {
.Call('_paws_common_build_url', PACKAGE = 'paws.common', url_components)
}

#' @useDynLib paws.common _paws_common_char_sort
#' @importFrom Rcpp evalCpp
char_sort <- function(str) {
Expand Down
19 changes: 8 additions & 11 deletions paws.common/R/custom_s3.R
Original file line number Diff line number Diff line change
Expand Up @@ -438,21 +438,18 @@ s3_get_bucket_region <- function(request, error, bucket) {
set_request_url <- function(original_endpoint,
new_endpoint,
use_new_scheme = TRUE) {
new_endpoint_components <- paws_url_parse(new_endpoint)
original_endpoint_components <- paws_url_parse(original_endpoint)
scheme <- original_endpoint_components$scheme
new_endpoint_components <- parse_url(new_endpoint)
final_endpoint_components <- parse_url(original_endpoint)
scheme <- final_endpoint_components$scheme
if (use_new_scheme) {
scheme <- new_endpoint_components$scheme
}
final_endpoint_components <- list(
scheme = scheme,
hostname = new_endpoint_components$hostname %||% "",
path = original_endpoint_components$path %||% "",
query = original_endpoint_components$query %||% "",
fragment = "",
raw_path = "",
raw_query = ""
path <- (
if (final_endpoint_components[["path"]] == "/") "" else final_endpoint_components[["path"]]
)
final_endpoint_components[["host"]] <- new_endpoint_components$host
final_endpoint_components[["scheme"]] <- scheme
final_endpoint_components[["path"]] <- path
final_endpoint <- build_url(final_endpoint_components)
return(final_endpoint)
}
Expand Down
135 changes: 11 additions & 124 deletions paws.common/R/url.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Url <- struct(
scheme = "",
opaque = "",
user = "",
password = "",
host = "",
path = "",
raw_path = "",
Expand All @@ -16,84 +17,6 @@ Url <- struct(
fragment = ""
)

# Parse a URL into a Url object.
# TODO: Finish.
parse_url <- function(url) {
p <- paws_url_parse(url)
if (is.null(p$hostname)) p$hostname <- ""
if (!is.null(p$port)) p$hostname <- paste0(p$hostname, ":", p$port)
raw_path <- p$path
if (is.null(raw_path)) {
raw_path <- "/"
} else if (substr(raw_path, 1, 1) != "/") raw_path <- paste0("/", raw_path)
path <- unescape(raw_path)
escaped_path <- escape(raw_path, "encodePath")
if (escaped_path == raw_path) raw_path <- ""
u <- Url(
scheme = p$scheme %||% "",
host = p$hostname,
path = path,
raw_path = raw_path,
raw_query = build_query_string(p$query),
)
return(u)
}

# Developed from httr2:
# https://github.com/r-lib/httr2/blob/main/R/url.R#L26-L67
paws_url_parse <- function(url) {
pieces <- str_match(url, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?")
scheme <- pieces[[2]]
authority <- pieces[[4]]
path <- pieces[[5]]
query <- pieces[[7]]
if (!is.null(query)) {
query <- parse_query_string(query)
}
fragment <- pieces[[9]]
pieces <- str_match(authority %||% "", "^(([^@]+)@)?([^:]+)?(:([^#]+))?")
userinfo <- pieces[[2]]
if (!is.null(userinfo)) {
if (grepl(":", userinfo)) {
userinfo <- parse_in_half(userinfo, ":")
} else {
userinfo <- list(userinfo, NULL)
}
}
hostname <- pieces[[3]]
port <- pieces[[5]]
return(
list(
scheme = scheme,
hostname = hostname,
port = port,
path = path,
query = query,
fragment = fragment,
username = userinfo[[1]],
password = userinfo[[2]]
)
)
}

# Build a URL from a Url object.
# <scheme>://<net_loc>/<path>;<params>?<query>#<fragment>
build_url <- function(url) {
if (nzchar(url$scheme) && nzchar(url$host)) {
l <- paste0(url$scheme, "://", url$host)
} else {
return("")
}
prefix <- function(prefix, x) {
if (nzchar(x)) paste0(prefix, x)
}
l <- paste0(
l, if (nzchar(url$raw_path)) url$raw_path else url$path,
prefix("?", url$raw_query), prefix("#", url$fragment)
)
return(l)
}

# helper function to filter out empty elements within build_query_string
query_empty <- function(params) {
(is.null(params) || length(params) == 0)
Expand Down Expand Up @@ -126,19 +49,6 @@ build_query_string <- function(params) {
return(paste(params[char_sort(param_names)], collapse = "&"))
}

# Decode a query string into a list.
# e.g. `parse_query_string("bar=baz&foo=qux")` -> `list(bar = "baz", foo = "qux")`
parse_query_string <- function(query) {
query <- gsub("^\\?", "", query)
params <- parse_in_half(strsplit(query, "&")[[1]], "=")
if (length(params) == 0) {
return(list())
}
out <- as.list(curl::curl_unescape(params[, 2]))
names(out) <- curl::curl_unescape(params[, 1])
return(out)
}

# Add the key/value pairs in `params` to a query string in `query_string`,
# and return a new query string. Keys in the query string that are also in
# params will be overwritten with the new value from params.
Expand All @@ -151,39 +61,16 @@ update_query_string <- function(query_string, params) {

# Escape strings so they can be safely included in a URL.
escape <- function(string, mode) {
# Ensure anything going to paws_url_encoder is a string
string <- as.character(string)
# base characters that won't be encoded
if (mode == "encodeHost" || mode == "encodeZone") {
# host and zone characters that won't be encoded
host_zone_pattern <- "][!$&'()*+,;=:<>\""
return(
paws_url_encoder(string, host_zone_pattern)
)
}
# path and path segment characters that won't be encoded
path_pattern <- "$&+,/;:=?@"

if (mode == "encodePath") {
# remove character ? from pattern so that it can be encoded
rm_pattern <- "[?]"
pattern <- gsub(rm_pattern, "", path_pattern)
return(paws_url_encoder(string, pattern))
}
if (mode == "encodePathSegment") {
# remove character /;,? from pattern so that it can be encoded
rm_pattern <- "[/;,?]"
pattern <- gsub(rm_pattern, "", path_pattern)
return(paws_url_encoder(string, pattern))
}
if (mode == "encodeQueryComponent") {
# escape string using base_url_encode
return(paws_url_encoder(string))
}
if (mode == "encodeFragment") {
return(paws_url_encoder(string, path_pattern))
}
return(paws_url_encoder(string))
safe_pattern <- switch(mode,
"encodeHost" = "][!$&'()*+,;=:<>\"",
"encodeZone" = "][!$&'()*+,;=:<>\"",
"encodeFragment" = "$&+,/;:=?@",
"encodePath" = "$&+,/;:=@",
"encodePathSegment" = "$&+:=@",
"encodeQueryComponent" = "",
""
)
return(paws_url_encoder(as.character(string), safe_pattern))
}

# Un-escape a string.
Expand Down
48 changes: 48 additions & 0 deletions paws.common/src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,17 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// paws_url_unencoder
CharacterVector paws_url_unencoder(CharacterVector urls);
RcppExport SEXP _paws_common_paws_url_unencoder(SEXP urlsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< CharacterVector >::type urls(urlsSEXP);
rcpp_result_gen = Rcpp::wrap(paws_url_unencoder(urls));
return rcpp_result_gen;
END_RCPP
}
// scan_ini_file
std::vector<std::string> scan_ini_file(const std::string& filename);
RcppExport SEXP _paws_common_scan_ini_file(SEXP filenameSEXP) {
Expand Down Expand Up @@ -90,6 +101,39 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// parse_query_string
List parse_query_string(std::string query);
RcppExport SEXP _paws_common_parse_query_string(SEXP querySEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type query(querySEXP);
rcpp_result_gen = Rcpp::wrap(parse_query_string(query));
return rcpp_result_gen;
END_RCPP
}
// parse_url
Rcpp::List parse_url(const std::string& url);
RcppExport SEXP _paws_common_parse_url(SEXP urlSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const std::string& >::type url(urlSEXP);
rcpp_result_gen = Rcpp::wrap(parse_url(url));
return rcpp_result_gen;
END_RCPP
}
// build_url
std::string build_url(const Rcpp::List& url_components);
RcppExport SEXP _paws_common_build_url(SEXP url_componentsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const Rcpp::List& >::type url_components(url_componentsSEXP);
rcpp_result_gen = Rcpp::wrap(build_url(url_components));
return rcpp_result_gen;
END_RCPP
}
// char_sort
CharacterVector char_sort(CharacterVector str);
RcppExport SEXP _paws_common_char_sort(SEXP strSEXP) {
Expand All @@ -115,12 +159,16 @@ END_RCPP

static const R_CallMethodDef CallEntries[] = {
{"_paws_common_paws_url_encoder", (DL_FUNC) &_paws_common_paws_url_encoder, 2},
{"_paws_common_paws_url_unencoder", (DL_FUNC) &_paws_common_paws_url_unencoder, 1},
{"_paws_common_scan_ini_file", (DL_FUNC) &_paws_common_scan_ini_file, 1},
{"_paws_common_process_profile_name", (DL_FUNC) &_paws_common_process_profile_name, 1},
{"_paws_common_json_convert_string", (DL_FUNC) &_paws_common_json_convert_string, 1},
{"_paws_common_check_global", (DL_FUNC) &_paws_common_check_global, 1},
{"_paws_common_endpoint_unescape", (DL_FUNC) &_paws_common_endpoint_unescape, 2},
{"_paws_common_get_region_pattern", (DL_FUNC) &_paws_common_get_region_pattern, 2},
{"_paws_common_parse_query_string", (DL_FUNC) &_paws_common_parse_query_string, 1},
{"_paws_common_parse_url", (DL_FUNC) &_paws_common_parse_url, 1},
{"_paws_common_build_url", (DL_FUNC) &_paws_common_build_url, 1},
{"_paws_common_char_sort", (DL_FUNC) &_paws_common_char_sort, 1},
{"_paws_common_uuid_v4", (DL_FUNC) &_paws_common_uuid_v4, 1},
{NULL, NULL, 0}
Expand Down
Loading

0 comments on commit bb5e1ec

Please sign in to comment.