Skip to content

Commit

Permalink
261 multiple improvements to get_code function and it's documentati…
Browse files Browse the repository at this point in the history
…on (#263)

~~Close #216~~

EDIT
Close #261

---------

Signed-off-by: Marcin <[email protected]>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: 27856297+dependabot-preview[bot]@users.noreply.github.com <27856297+dependabot-preview[bot]@users.noreply.github.com>
Co-authored-by: go_gonzo <[email protected]>
Co-authored-by: Aleksander Chlebowski <[email protected]>
Co-authored-by: André Veríssimo <[email protected]>
Co-authored-by: Aleksander Chlebowski <[email protected]>
Co-authored-by: Dawid Kałędkowski <[email protected]>
  • Loading branch information
8 people authored Jan 24, 2024
1 parent 7c83774 commit ca761c2
Show file tree
Hide file tree
Showing 6 changed files with 409 additions and 245 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ Encoding: UTF-8
Language: en-US
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.0
RoxygenNote: 7.3.1
Collate:
'cdisc_data.R'
'data.R'
Expand Down
81 changes: 73 additions & 8 deletions R/teal_data-get_code.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,86 @@
#' Retrieve code from `teal_data` object.
#'
#' Retrieve code stored in `@code`, which (in principle) can be used to recreate all objects found in `@env`.
#' Use `datanames` to limit the code to one or more of the data sets enumerated in `@datanames`.
#' Use `datanames` to limit the code to one or more of the datasets enumerated in `@datanames`.
#' If the code has not passed verification (with [`verify()`]), a warning will be prepended.
#'
#' @section Notes for Developers:
#' To learn more about how a subset of code needed to reproduce a specific data set is extracted from all code,
#' see [`get_code_dependency()`].
#' @section Extracting dataset-specific code:
#' When `datanames` is specified, the code returned will be limited to the lines needed to _create_
#' the requested datasets. The code stored in the `@code` slot is analyzed statically to determine
#' which lines the datasets of interest depend upon. The analysis works well when objects are created
#' with standard infix assignment operators (see `?assignOps`) but it can fail in some situations.
#'
#' Consider the following examples:
#'
#' _Case 1: Usual assignments._
#' ```r
#' data <- teal_data() |>
#' within({
#' foo <- function(x) {
#' x + 1
#' }
#' x <- 0
#' y <- foo(x)
#' })
#' get_code(data, datanames = "y")
#' ```
#' `x` has no dependencies, so `get_code(data, datanames = "x")` will return only the second call.\cr
#' `y` depends on `x` and `foo`, so `get_code(data, datanames = "y")` will contain all three calls.
#'
#' _Case 2: Some objects are created by a function's side effects._
#' ```r
#' data <- teal_data() |>
#' within({
#' foo <- function() {
#' x <<- x + 1
#' }
#' x <- 0
#' foo()
#' y <- x
#' })
#' get_code(data, datanames = "y")
#' ```
#' Here, `y` depends on `x` but `x` is modified by `foo` as a side effect (not by reassignment)
#' and so `get_code(data, datanames = "y")` will not return the `foo()` call.\cr
#' To overcome this limitation, code dependencies can be specified manually.
#' Lines where side effects occur can be flagged by adding "`# @linksto <object name>`" at the end.\cr
#' Note that `within` evaluates code passed to `expr` as is and comments are ignored.
#' In order to include comments in code one must use the `eval_code` function instead.
#'
#' ```r
#' data <- teal_data() |>
#' eval_code("
#' foo <- function() {
#' x <<- x + 1
#' }
#' x <- 0
#' foo() # @linksto x
#' y <- x
#' ")
#' get_code(data, datanames = "y")
#' ```
#' Now the `foo()` call will be properly included in the code required to recreate `y`.
#'
#' Note that two functions that create objects as side effects, `assign` and `data`, are handled automatically.
#'
#' Here are known cases where manual tagging is necessary:
#' - non-standard assignment operators, _e.g._ `%<>%`
#' - objects used as conditions in `if` statements: `if (<condition>)`
#' - objects used to iterate over in `for` loops: `for(i in <sequence>)`
#' - creating and evaluating language objects, _e.g._ `eval(<call>)`
#'
#'
#' @param object (`teal_data`)
#' @param datanames `r lifecycle::badge("experimental")` (`character`) vector of data set names to return the code for.
#' @param datanames `r lifecycle::badge("experimental")` (`character`) vector of dataset names to return the code for.
#' For more details see the "Extracting dataset-specific code" section.
#' @param deparse (`logical`) flag specifying whether to return code as `character` (`deparse = TRUE`) or as
#' `expression` (`deparse = FALSE`).
#'
#' @return
#' Either string or an expression representing code used to create the requested data sets.
#' Either a character string or an expression. If `datanames` is used to request a specific dataset,
#' only code that _creates_ that dataset (not code that uses it) is returned. Otherwise, all contents of `@code`.
#'
#' @examples
#'
#' tdata1 <- teal_data()
#' tdata1 <- within(tdata1, {
#' a <- 1
Expand All @@ -33,8 +96,10 @@
#' tdata2 <- teal_data(x1 = iris, code = "x1 <- iris")
#' get_code(tdata2)
#' get_code(verify(tdata2))
#'
#' @rdname get_code
#' @aliases get_code,teal_data-method
#' @aliases get_code
#'
#' @export
setMethod("get_code", signature = "teal_data", definition = function(object, deparse = TRUE, datanames = NULL) {
checkmate::assert_character(datanames, min.len = 1L, null.ok = TRUE)
Expand Down
100 changes: 78 additions & 22 deletions R/utils-get_code_dependency.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,19 @@ find_call <- function(call_pd, text) {
#' @keywords internal
#' @noRd
extract_calls <- function(pd) {
calls <- lapply(pd[pd$parent == 0, "id"], get_children, pd = pd)
calls <- lapply(
pd[pd$parent == 0, "id"],
function(parent) {
rbind(
pd[pd$id == parent, c("token", "text", "id", "parent")],
get_children(pd = pd, parent = parent)
)
}
)
calls <- Filter(function(call) !(nrow(call) == 1 && call$token == "';'"), calls)
calls <- Filter(Negate(is.null), calls)
fix_comments(calls)
calls <- fix_shifted_comments(calls)
fix_arrows(calls)
}

#' @keywords internal
Expand All @@ -118,19 +128,43 @@ get_children <- function(pd, parent) {
}
}

#' Fixes edge case of comments being shifted to the next call.
#' @keywords internal
#' @noRd
fix_comments <- function(calls) {
# If the first token is a COMMENT, then it belongs to the previous call.
fix_shifted_comments <- function(calls) {
# If the first or the second token is a @linksto COMMENT,
# then it belongs to the previous call.
if (length(calls) >= 2) {
for (i in 2:length(calls)) {
if (grepl("@linksto", calls[[i]][1, "text"])) {
calls[[i - 1]] <- rbind(calls[[i - 1]], calls[[i]][1, ])
calls[[i]] <- calls[[i]][-1, ]
comment_idx <- grep("@linksto", calls[[i]][, "text"])
if (isTRUE(comment_idx[1] <= 2)) {
calls[[i - 1]] <- rbind(
calls[[i - 1]],
calls[[i]][seq_len(comment_idx[1]), ]
)
calls[[i]] <- calls[[i]][-seq_len(comment_idx[1]), ]
}
}
}
calls
Filter(nrow, calls)
}

#' Fixes edge case of `<-` assignment operator being called as function,
#' which is \code{`<-`(y,x)} instead of traditional `y <- x`.
#' @keywords internal
#' @noRd
fix_arrows <- function(calls) {
lapply(
calls,
function(call) {
call[call$token == "SYMBOL_FUNCTION_CALL" & call$text == "`<-`", c("token", "text")] <- c("LEFT_ASSIGN", "<-")
call[call$token == "SYMBOL_FUNCTION_CALL" & call$text == "`->`", c("token", "text")] <- c("RIGHT_ASSIGN", "->")
call[call$token == "SYMBOL_FUNCTION_CALL" & call$text == "`<<-`", c("token", "text")] <- c("LEFT_ASSIGN", "<-")
call[call$token == "SYMBOL_FUNCTION_CALL" & call$text == "`->>`", c("token", "text")] <- c("RIGHT_ASSIGN", "->")
call[call$token == "SYMBOL_FUNCTION_CALL" & call$text == "`=`", c("token", "text")] <- c("LEFT_ASSIGN", "<-")
call
}
)
}

# code_graph ----
Expand Down Expand Up @@ -199,29 +233,50 @@ extract_occurrence <- function(calls_pd) {
data_call <- find_call(call_pd, "data")
if (data_call) {
sym <- call_pd[data_call + 1, "text"]
return(c(gsub("^['\"]|['\"]$", "", sym), "<-", "data"))
return(c(gsub("^['\"]|['\"]$", "", sym), "<-"))
}
# Handle assign().
# Handle assign(x = ).
assign_call <- find_call(call_pd, "assign")
if (assign_call) {
# Check if parameters were named.
# "','" is for unnamed parameters, where "SYMBOL_SUB" is for named.
# "EQ_SUB" is for `=` appearing after the name of the named parameter.
if (any(call_pd$token == "SYMBOL_SUB")) {
params <- call_pd[call_pd$token == "SYMBOL_SUB", "text"]
pos <- match("x", params, nomatch = length(params) + 1L)
params <- call_pd[call_pd$token %in% c("SYMBOL_SUB", "','", "EQ_SUB"), "text"]
# Remove sequence of "=", ",".
if (length(params > 1)) {
remove <- integer(0)
for (i in 2:length(params)) {
if (params[i - 1] == "=" & params[i] == ",") {
remove <- c(remove, i - 1, i)
}
}
if (length(remove)) params <- params[-remove]
}
pos <- match("x", setdiff(params, ","), nomatch = match(",", params, nomatch = 0))
if (!pos) {
return(character(0L))
}
# pos is indicator of the place of 'x'
# 1. All parameters are named, but none is 'x' - return(character(0L))
# 2. Some parameters are named, 'x' is in named parameters: match("x", setdiff(params, ","))
# - check "x" in params being just a vector of named parameters.
# 3. Some parameters are named, 'x' is not in named parameters
# - check first appearance of "," (unnamed parameter) in vector parameters.
} else {
# Object is the first entry after 'assign'.
pos <- 1
}
sym <- call_pd[assign_call + pos, "text"]
return(c(gsub("^['\"]|['\"]$", "", sym), "<-", "assign"))
return(c(gsub("^['\"]|['\"]$", "", sym), "<-"))
}

# What occurs in a function body is not tracked.
x <- call_pd[!is_in_function(call_pd), ]
sym_cond <- which(x$token %in% c("SYMBOL", "SYMBOL_FUNCTION_CALL"))

if (length(sym_cond) == 0) {
return(character(0))
return(character(0L))
}
# Watch out for SYMBOLS after $ and @. For x$a x@a: x is object, a is not.
# For x$a, a's ID is $'s ID-2 so we need to remove all IDs that have ID = $ID - 2.
Expand All @@ -232,20 +287,21 @@ extract_occurrence <- function(calls_pd) {
sym_cond <- setdiff(sym_cond, which(x$id %in% after_dollar))
}

# If there was an assignment operation detect direction of it.
ass_cond <- grep("ASSIGN", x$token)
if (length(ass_cond)) { # NOTE 1
sym_cond <- sym_cond[sym_cond > ass_cond] # NOTE 2
if (!length(ass_cond)) {
return(c("<-", unique(x[sym_cond, "text"])))
}
if ((length(ass_cond) && x$text[ass_cond] == "->") || !length(ass_cond)) { # NOTE 3

sym_cond <- sym_cond[sym_cond > ass_cond] # NOTE 1
# If there was an assignment operation detect direction of it.
if (unique(x$text[ass_cond]) == "->") { # NOTE 2
sym_cond <- rev(sym_cond)
}

append(unique(x[sym_cond, "text"]), "<-", after = 1)

### NOTE 3: What if there are 2+ assignments, e.g. a <- b -> c or e.g. a <- b <- c.
### NOTE 2: For cases like 'eval(expression(b <- b + 2))' removes 'eval(expression('.
### NOTE 1: Cases like 'data(iris)' that do not have an assignment operator.
### NOTE 1: Then they are parsed as c("iris", "<-", "data")
### NOTE 2: What if there are 2 assignments: e.g. a <- b -> c.
### NOTE 1: For cases like 'eval(expression(b <- b + 2))' removes 'eval(expression('.
}
)
}
Expand Down
50 changes: 0 additions & 50 deletions man/get_code-teal_data-method.Rd

This file was deleted.

Loading

0 comments on commit ca761c2

Please sign in to comment.