-
Notifications
You must be signed in to change notification settings - Fork 2.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add dplyr_sticky_cols() or similar #5290
Comments
I think the class should be able to specify front- or back-stickiness. For instance the grouping columns of grouped-df are front-sticky, whereas the geometry columns of sf are back-sticky. |
Would stickiness imply focus (#6252)? |
I came to make an issue similar to this based on Extending dplyr with new data frame subclasses. To have a sticky column the third bullet point states:
Based on this one would anticipate that simply implementing Based on my exploration this isn't true at the moment. It looks like The below reprex illustrates the current shortcomings. The idea is there is a data frame class called `point_collection`.
Creating the class
# create some points
pnt_data <- lapply(seq_len(5), function(i) runif(2, 0, 90))
# wrap the list of coordinate pairs in a vctrs vector of class "points"
pnts <- vctrs::new_vctr(pnt_data, class = "points")
# Compute the bounding box of a "points" vector.
#
# `.x` must inherit from class "points"; anything else stops with an error.
# The first element of every point is treated as x and the second as y.
# Returns a named numeric vector: xmin, xmax, ymin, ymax.
calculate_extent <- function(.x) {
  stopifnot(inherits(.x, "points"))
  xs <- vapply(.x, function(pt) pt[1], numeric(1))
  ys <- vapply(.x, function(pt) pt[2], numeric(1))
  c(
    xmin = min(xs),
    xmax = max(xs),
    ymin = min(ys),
    ymax = max(ys)
  )
}
# Locate the columns (elements) of `x` that inherit from class "points".
# Returns their integer positions, named when `x` is named.
which_is_point_col <- function(x) {
  is_points <- vapply(x, function(col) inherits(col, "points"), logical(1))
  which(is_points)
}
# Constructor for the `point_collection` tibble subclass.
#
# Looks up the "points" column(s) of `x`, then builds a tibble that carries
# two extra attributes:
#   * point_col - name of the first "points" column
#   * extent    - bounding box returned by calculate_extent()
# The attribute values are written inline in the new_tibble() call, so they
# are evaluated by new_tibble() itself rather than beforehand.
new_point_collection <- function(x) {
  pts_idx <- which_is_point_col(x)

  tibble::new_tibble(
    x,
    class = "point_collection",
    point_col = names(pts_idx)[1L],
    extent = calculate_extent(x[[pts_idx]])
  )
}
# format() method for point_collection.
#
# Prepends a title line and a one-line rendering of the "extent" attribute
# (each value rounded to 2 digits, shown as "name: value") to whatever the
# next format() method returns.
format.point_collection <- function(x, ...) {
  bbox <- attr(x, "extent")
  bbox_label <- paste(
    names(bbox),
    round(bbox, 2),
    sep = ": ",
    collapse = " "
  )
  c(title = "Point Collection", bbox = bbox_label, NextMethod())
}
# Implementing `dplyr_reconstruct` and `[`
# create a new tibble super class
x <- new_point_collection(list(pnts = pnts))
# print w/ format
x
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
# dplyr_reconstruct() method for point_collection.
#
# Lets the next method rebuild the data frame first, then passes the result
# through new_point_collection() so the class and its attributes are set
# again from the rebuilt data.
dplyr_reconstruct.point_collection <- function(data, template) {
  rebuilt <- NextMethod()
  new_point_collection(rebuilt)
}
# Subsetting method for point_collection that keeps the points column
# "sticky" whenever columns are selected through `j`.
#
#   x[]                -> `x` unchanged
#   x[i] / x[i, ]      -> whatever the next `[` method returns
#   x[, j] / x[i, j]   -> result of the next method, with the points column
#                         (row-subset by `i` when `i` is supplied) written
#                         back under its recorded name
#
# The name of the points column is read from the "point_col" attribute.
`[.point_collection` <- function(x, i, j, ...) {
  has_i <- !missing(i)
  has_j <- !missing(j)

  if (!has_i && !has_j) {
    return(x)
  }

  # Grab the points column before delegating, since the next method may
  # drop it from the result.
  pnt_col_name <- attr(x, "point_col")
  pnt_col <- x[[pnt_col_name]]
  res <- NextMethod()

  # Without `j`, `i` must be present here (x[i] or x[i, ]); the next
  # method's result is returned untouched.
  if (!has_j) {
    return(res)
  }

  # `j` was supplied: re-attach the points column, matching the selected
  # rows when `i` was also given.
  if (has_i) {
    pnt_col <- pnt_col[i]
  }
  res[[pnt_col_name]] <- pnt_col
  res
}
# Demonstrate selection
x[1]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
x[1,]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 1 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
x[,1]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
x[1,1]
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 1 × 1
#> pnts
#> <points>
#> 1 17.64270, 73.78937
Much of the promised dplyr functionality works out of the gate! But
library(dplyr, warn.conflicts = FALSE)
# I expect the `pnts` to remain
select(x)
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 0
transmute(x, y = 1)
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> y
#> <dbl>
#> 1 1
#> 2 1
#> 3 1
#> 4 1
#> 5 1
# works as anticipated
select(x, pooooints = pnts)
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 6.51 ymax: 73.79
#> # A tibble: 5 × 1
#> pooooints
#> <points>
#> 1 17.64270, 73.78937
#> 2 82.43608, 39.68580
#> 3 39.72003, 25.56988
#> 4 40.48171, 39.16232
#> 5 59.128772, 6.510731
relocate(pnt_ids, id)
#> Error in relocate(pnt_ids, id): object 'pnt_ids' not found
slice(x, 2)
#> Point Collection
#> xmin: 82.44 xmax: 82.44 ymin: 39.69 ymax: 39.69
#> # A tibble: 1 × 1
#> pnts
#> <points>
#> 1 82.43608, 39.68580
pnt_ids <- mutate(x, id = row_number()) |>
arrange(-id)
inner_join(
pnt_ids,
tibble::tibble(id = 1:3, abc = letters[1:3])
)
#> Joining with `by = join_by(id)`
#> Point Collection
#> xmin: 17.64 xmax: 82.44 ymin: 25.57 ymax: 73.79
#> # A tibble: 3 × 3
#> pnts id abc
#> <points> <int> <chr>
#> 1 39.72003, 25.56988 3 c
#> 2 82.43608, 39.68580 2 b
#> 3 17.64270, 73.78937 1 a
# does not work (unclear why)
distinct(pnt_ids, id)
#> Error in `vectbl_as_col_location2()`:
#> ! Can't extract column with `point_col`.
#> ✖ Subscript `point_col` must be size 1, not 0.
#> Backtrace:
#> ▆
#> 1. ├─dplyr::distinct(pnt_ids, id)
#> 2. └─dplyr:::distinct.data.frame(pnt_ids, id)
#> 3. ├─dplyr::dplyr_row_slice(out, loc)
#> 4. └─dplyr:::dplyr_row_slice.data.frame(out, loc)
#> 5. └─dplyr::dplyr_reconstruct(vec_slice(data, i), data)
#> 6. ├─dplyr:::dplyr_reconstruct_dispatch(data, template)
#> 7. └─global dplyr_reconstruct.point_collection(data, template)
#> 8. └─global new_point_collection(res)
#> 9. ├─tibble::new_tibble(...)
#> 10. │ └─rlang::pairlist2(...)
#> 11. ├─global calculate_extent(x[[point_col]])
#> 12. │ └─base::stopifnot(inherits(.x, "points"))
#> 13. ├─x[[point_col]]
#> 14. └─tibble:::`[[.tbl_df`(x, point_col)
#> 15. └─tibble:::tbl_subset2(x, j = i, j_arg = substitute(i))
#> 16. └─tibble:::vectbl_as_col_location2(j, length(x), j_arg = j_arg)
#> 17. ├─tibble:::subclass_col_index_errors(...)
#> 18. │ └─base::withCallingHandlers(...)
#> 19. └─vctrs::vec_as_location2(j, n, names)
#> 20. └─vctrs:::result_get(...)
#> 21. └─rlang::cnd_signal(x$err) Created on 2023-02-24 with reprex v2.0.2 |
@JosiahParry I think you are misinterpreting the docs a little. We expect that In other words, What that means for you is that
If you want a user-facing API that allows sticky columns, then we suggest creating a So it becomes:
You can use library(dplyr)
gdf <- group_by(tibble(g = 1:2, x = c("a", "b")), g)
# grouped data frame
gdf
#> # A tibble: 2 × 2
#> # Groups: g [2]
#> g x
#> <int> <chr>
#> 1 1 a
#> 2 2 b
# retained grouping column, so still grouped
gdf["g"]
#> # A tibble: 2 × 1
#> # Groups: g [2]
#> g
#> <int>
#> 1 1
#> 2 2
# lost grouping column, so it is a tibble.
# notice we ONLY return the `x` column (a programming invariant we must follow)
gdf["x"]
#> # A tibble: 2 × 1
#> x
#> <chr>
#> 1 a
#> 2 b
# user friendly version that retains `g` no matter what
select(gdf, x)
#> Adding missing grouping variables: `g`
#> # A tibble: 2 × 2
#> # Groups: g [2]
#> g x
#> <int> <chr>
#> 1 1 a
#> 2 2 b Created on 2023-02-27 with reprex v2.0.2.9000 |
So that when you implement a tibble subclass with sticky columns, you don't need to add a new `select()` method. Likely to involve renaming `ensure_group_vars()`.
The text was updated successfully, but these errors were encountered: