Skip to content

Commit

Permalink
feat(r): Support matrix objects as fixed-size-list arrays (#692)
Browse files Browse the repository at this point in the history
Still needs some testing on the stream case, and is unfortunately not
very zero copy; however, gets the job done (and I think fixes some cases
where we would have otherwise silently handled a matrix as the storage
type).

Inspired by #691!

``` r
library(nanoarrow)

df <- data.frame(x = 1:10)
df$matrix_col <- matrix(letters[1:20], ncol = 2, byrow = TRUE)

array <- as_nanoarrow_array(df)

# Default comes back as list_of(character())
convert_array(array) |> tibble::as_tibble()
#> # A tibble: 10 × 2
#>        x  matrix_col
#>    <int> <list<chr>>
#>  1     1         [2]
#>  2     2         [2]
#>  3     3         [2]
#>  4     4         [2]
#>  5     5         [2]
#>  6     6         [2]
#>  7     7         [2]
#>  8     8         [2]
#>  9     9         [2]
#> 10    10         [2]

# But can specify matrix
convert_array(
  array,
  tibble::tibble(x = integer(), matrix_col = matrix(character(), ncol = 2))
)
#> # A tibble: 10 × 2
#>        x matrix_col[,1] [,2] 
#>    <int> <chr>          <chr>
#>  1     1 a              b    
#>  2     2 c              d    
#>  3     3 e              f    
#>  4     4 g              h    
#>  5     5 i              j    
#>  6     6 k              l    
#>  7     7 m              n    
#>  8     8 o              p    
#>  9     9 q              r    
#> 10    10 s              t
```

<sup>Created on 2024-12-12 with [reprex
v2.1.1](https://reprex.tidyverse.org)</sup>
  • Loading branch information
paleolimbot authored Dec 17, 2024
1 parent 7b8a7c8 commit 1eaa8d5
Show file tree
Hide file tree
Showing 15 changed files with 329 additions and 28 deletions.
2 changes: 2 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ S3method(as_nanoarrow_array,difftime)
S3method(as_nanoarrow_array,factor)
S3method(as_nanoarrow_array,integer64)
S3method(as_nanoarrow_array,list)
S3method(as_nanoarrow_array,matrix)
S3method(as_nanoarrow_array,nanoarrow_array)
S3method(as_nanoarrow_array,nanoarrow_buffer)
S3method(as_nanoarrow_array,vctrs_unspecified)
Expand Down Expand Up @@ -100,6 +101,7 @@ S3method(infer_nanoarrow_schema,integer)
S3method(infer_nanoarrow_schema,integer64)
S3method(infer_nanoarrow_schema,list)
S3method(infer_nanoarrow_schema,logical)
S3method(infer_nanoarrow_schema,matrix)
S3method(infer_nanoarrow_schema,nanoarrow_array)
S3method(infer_nanoarrow_schema,nanoarrow_array_stream)
S3method(infer_nanoarrow_schema,nanoarrow_vctr)
Expand Down
36 changes: 36 additions & 0 deletions r/R/as-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,42 @@ as_nanoarrow_array.blob <- function(x, ..., schema = NULL) {
as_nanoarrow_array(unclass(x), schema = schema)
}

#' @export
as_nanoarrow_array.matrix <- function(x, ..., schema = NULL) {
if (is.null(schema)) {
schema <- infer_nanoarrow_schema(x)
} else {
schema <- as_nanoarrow_schema(schema)
}

expected_format <- paste0("+w:", ncol(x))
if (expected_format != schema$format) {
stop(
sprintf(
"Expected schema for matrix with fixed-size list of %d elements but got %s",
ncol(x),
nanoarrow_schema_formatted(schema)
)
)
}

# Raw unclass() doesn't work for matrix()
row_major_data <- t(x)
attributes(row_major_data) <- NULL

child_array <- as_nanoarrow_array(row_major_data, schema = schema$children[[1]])
array <- nanoarrow_array_init(schema)
nanoarrow_array_modify(
array,
list(
length = nrow(x),
null_count = 0,
buffers = list(NULL),
children = list(child_array)
)
)
}

#' @export
as_nanoarrow_array.data.frame <- function(x, ..., schema = NULL) {
# We need to override this to prevent the list implementation from handling it
Expand Down
2 changes: 2 additions & 0 deletions r/R/convert-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
#' be converted to [blob::blob()].
#' - [vctrs::list_of()]: List, large list, and fixed-size list types can be
#' converted to [vctrs::list_of()].
#' - [matrix()]: Fixed-size list types can be converted to
#' `matrix(ptype, ncol = fixed_size)`.
#' - [data.frame()]: Struct types can be converted to [data.frame()].
#' - [vctrs::unspecified()]: Any type can be converted to [vctrs::unspecified()];
#' however, a warning will be raised if any non-null values are encountered.
Expand Down
8 changes: 8 additions & 0 deletions r/R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ infer_nanoarrow_schema.vctrs_unspecified <- function(x, ...) {
na_na()
}

#' @export
infer_nanoarrow_schema.matrix <- function(x, ...) {
na_fixed_size_list(
infer_nanoarrow_schema(unclass(x[integer(0)])),
list_size = ncol(x)
)
}

#' @export
infer_nanoarrow_schema.vctrs_list_of <- function(x, ...) {
child_type <- infer_nanoarrow_schema(attr(x, "ptype"))
Expand Down
2 changes: 2 additions & 0 deletions r/man/convert_array.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 30 additions & 15 deletions r/src/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
static R_xlen_t nanoarrow_vec_size(SEXP vec_sexp, struct PTypeView* ptype_view) {
if (ptype_view->vector_type == VECTOR_TYPE_DATA_FRAME) {
return nanoarrow_data_frame_size(vec_sexp);
} else if (Rf_isMatrix(vec_sexp)) {
return Rf_nrows(vec_sexp);
} else {
return Rf_xlength(vec_sexp);
}
Expand Down Expand Up @@ -149,12 +151,7 @@ static void set_converter_data_frame(SEXP converter_xptr, struct RConverter* con
}

static void set_converter_list_of(SEXP converter_xptr, struct RConverter* converter,
SEXP ptype) {
SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
if (child_ptype == R_NilValue) {
Rf_error("Expected attribute 'ptype' for conversion to list_of");
}

SEXP ptype, SEXP child_ptype) {
converter->children = (struct RConverter**)ArrowMalloc(1 * sizeof(struct RConverter*));
if (converter->children == NULL) {
Rf_error("Failed to allocate converter children array");
Expand Down Expand Up @@ -230,15 +227,25 @@ SEXP nanoarrow_converter_from_ptype(SEXP ptype) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);

if (Rf_isObject(ptype)) {
if (Rf_isMatrix(ptype)) {
converter->ptype_view.vector_type = VECTOR_TYPE_MATRIX;
SEXP child_ptype = PROTECT(Rf_allocVector(TYPEOF(ptype), 0));
set_converter_list_of(converter_xptr, converter, ptype, child_ptype);
UNPROTECT(1);
} else if (Rf_isObject(ptype)) {
if (nanoarrow_ptype_is_data_frame(ptype)) {
converter->ptype_view.vector_type = VECTOR_TYPE_DATA_FRAME;
set_converter_data_frame(converter_xptr, converter, ptype);
} else if (Rf_inherits(ptype, "blob")) {
converter->ptype_view.vector_type = VECTOR_TYPE_BLOB;
} else if (Rf_inherits(ptype, "vctrs_list_of")) {
converter->ptype_view.vector_type = VECTOR_TYPE_LIST_OF;
set_converter_list_of(converter_xptr, converter, ptype);
SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
if (child_ptype == R_NilValue) {
Rf_error("Expected attribute 'ptype' for conversion to list_of");
}

set_converter_list_of(converter_xptr, converter, ptype, child_ptype);
} else if (Rf_inherits(ptype, "vctrs_unspecified")) {
converter->ptype_view.vector_type = VECTOR_TYPE_UNSPECIFIED;
} else if (Rf_inherits(ptype, "Date")) {
Expand Down Expand Up @@ -300,7 +307,8 @@ int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr) {
ArrowArrayViewInitFromSchema(&converter->array_view, schema, &converter->error));

if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME ||
converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
set_converter_children_schema(converter_xptr, schema_xptr);
}

Expand All @@ -318,7 +326,8 @@ int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr) {
converter->src.length = 0;

if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME ||
converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
set_converter_children_array(converter_xptr, array_xptr);
}

Expand All @@ -343,17 +352,23 @@ void sync_after_converter_reallocate(SEXP converter_xptr, struct RConverter* con
converter->children[i], VECTOR_ELT(result_sexp, i),
capacity);
}
} else if (converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
// Reserve for the child converter here, which ensures that a matrix column in
// a data.frame() will get allocated properly.
SEXP child_converters = VECTOR_ELT(converter_shelter, 3);
SEXP item_converter_xptr = VECTOR_ELT(child_converters, 0);
nanoarrow_converter_reserve(item_converter_xptr,
capacity * Rf_ncols(converter->ptype_view.ptype));
}
}

int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
void nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP current_result = VECTOR_ELT(converter_shelter, 4);

if (current_result != R_NilValue) {
ArrowErrorSet(&converter->error, "Reallocation in converter is not implemented");
return ENOTSUP;
Rf_error("Reallocation in converter is not implemented");
}

SEXP result_sexp;
Expand All @@ -368,7 +383,6 @@ int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
sync_after_converter_reallocate(converter_xptr, converter, result_sexp,
additional_size);
UNPROTECT(1);
return NANOARROW_OK;
}

R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
Expand Down Expand Up @@ -401,7 +415,7 @@ R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
int nanoarrow_converter_materialize_all(SEXP converter_xptr) {
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
R_xlen_t remaining = converter->array_view.array->length;
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, remaining));
nanoarrow_converter_reserve(converter_xptr, remaining);
if (nanoarrow_converter_materialize_n(converter_xptr, remaining) != remaining) {
return ERANGE;
} else {
Expand All @@ -415,6 +429,7 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) {
SEXP current_result = VECTOR_ELT(converter_shelter, 4);

NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(converter_xptr));
current_result = VECTOR_ELT(converter_shelter, 4);

// Check result size. A future implementation could also shrink the length
// or reallocate a shorter vector.
Expand Down
4 changes: 2 additions & 2 deletions r/src/convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr);

// Reserve space in the R vector output for additional elements. In theory
// this could be used to provide growable behaviour; however, this is not
// implemented. Returns an errno code.
int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);
// implemented.
void nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);

// Materialize the next n elements into the output. Returns the number of elements
// that were actually materialized which may be less than n.
Expand Down
4 changes: 1 addition & 3 deletions r/src/convert_array_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,7 @@ SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp,
nanoarrow_converter_stop(converter_xptr);
}

if (nanoarrow_converter_reserve(converter_xptr, size) != NANOARROW_OK) {
nanoarrow_converter_stop(converter_xptr);
}
nanoarrow_converter_reserve(converter_xptr, size);

int64_t n_batches = 0;
do {
Expand Down
88 changes: 83 additions & 5 deletions r/src/materialize.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,12 @@ int nanoarrow_ptype_is_nanoarrow_vctr(SEXP ptype) {
SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
SEXP result;

if (Rf_isObject(ptype)) {
if (Rf_isMatrix(ptype)) {
// The actual value is built in the child converter but we can't have
// a NULL here because that confuses the internals into thinking that
// the allocate was never called.
result = PROTECT(Rf_allocVector(TYPEOF(ptype), 0));
} else if (Rf_isObject(ptype)) {
// There may be a more accurate test that more precisely captures the case
// where a user has specified a valid ptype that doesn't work in a preallocate
// + fill conversion.
Expand Down Expand Up @@ -301,11 +306,12 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) {

int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
SEXP result = VECTOR_ELT(converter_shelter, 4);

// Materialize never called (e.g., empty stream)
if (result == R_NilValue) {
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0));
nanoarrow_converter_reserve(converter_xptr, 0);
result = VECTOR_ELT(converter_shelter, 4);
}

Expand Down Expand Up @@ -357,6 +363,22 @@ int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
SET_VECTOR_ELT(result, i, child_result);
UNPROTECT(1);
}
} else if (converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
SEXP item_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);
NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(item_converter_xptr));
SEXP item_result = PROTECT(nanoarrow_converter_release_result(item_converter_xptr));

SEXP matrix_symbol = PROTECT(Rf_install("matrix"));
SEXP nrow_sexp = PROTECT(
Rf_ScalarInteger(Rf_xlength(item_result) / converter->schema_view.fixed_size));
SEXP ncol_sexp = PROTECT(Rf_ScalarInteger(converter->schema_view.fixed_size));
SEXP byrow_sexp = PROTECT(Rf_ScalarLogical(TRUE));
SEXP matrix_call =
PROTECT(Rf_lang5(matrix_symbol, item_result, nrow_sexp, ncol_sexp, byrow_sexp));
SEXP final_result = PROTECT(Rf_eval(matrix_call, R_BaseNamespace));
SET_VECTOR_ELT(converter_shelter, 4, final_result);
UNPROTECT(7);
}

return NANOARROW_OK;
Expand Down Expand Up @@ -496,9 +518,7 @@ static int nanoarrow_materialize_data_frame(struct RConverter* converter,

static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr,
int64_t offset, int64_t length) {
if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) {
nanoarrow_converter_stop(converter_xptr);
}
nanoarrow_converter_reserve(converter_xptr, length);

converter->src.offset = offset;
converter->src.length = length;
Expand Down Expand Up @@ -581,6 +601,62 @@ static int nanoarrow_materialize_list_of(struct RConverter* converter,
return NANOARROW_OK;
}

static int nanoarrow_materialize_matrix(struct RConverter* converter,
SEXP converter_xptr) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
struct RConverter* child_converter = converter->children[0];
SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);

struct ArrayViewSlice* src = &converter->src;

// Make sure we error for dictionary types
if (src->array_view->array->dictionary != NULL) {
return EINVAL;
}

switch (src->array_view->storage_type) {
case NANOARROW_TYPE_FIXED_SIZE_LIST:
break;
default:
return EINVAL;
}

int64_t raw_src_offset = src->array_view->offset + src->offset;
int64_t list_length = src->array_view->layout.child_size_elements;
int64_t child_length = list_length * src->length;

if (list_length != Rf_ncols(converter->ptype_view.ptype)) {
Rf_error("Can't convert fixed_size_list(list_size=%d) to matrix with %d cols",
(int)list_length, Rf_ncols(converter->ptype_view.ptype));
}

// First, we update the child array offset to account for the parent offset and
// materialize the child array.
child_converter->src.offset += raw_src_offset * list_length;
child_converter->src.length = child_length;
if (nanoarrow_converter_materialize_n(child_converter_xptr, child_length) !=
child_length) {
return EINVAL;
}

// If we have parent nulls, we have to project them into the destination
if (src->array_view->null_count != 0 &&
src->array_view->buffer_views[0].data.data != NULL) {
// Here, dst.offset has already been incremented such that it's at the end
// of the chunk, but we need the original one for fill_vec_with_nulls().
int64_t original_dst_offset = child_converter->dst.offset - child_length;
for (int64_t i = 0; i < src->length; i++) {
if (ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
fill_vec_with_nulls(child_converter->dst.vec_sexp,
original_dst_offset + (i * list_length), list_length);
}
}
}

return NANOARROW_OK;
}

static int nanoarrow_materialize_base(struct RConverter* converter, SEXP converter_xptr) {
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
Expand Down Expand Up @@ -614,6 +690,8 @@ static int nanoarrow_materialize_base(struct RConverter* converter, SEXP convert
return nanoarrow_materialize_blob(src, dst, options);
case VECTOR_TYPE_LIST_OF:
return nanoarrow_materialize_list_of(converter, converter_xptr);
case VECTOR_TYPE_MATRIX:
return nanoarrow_materialize_matrix(converter, converter_xptr);
case VECTOR_TYPE_DATA_FRAME:
return nanoarrow_materialize_data_frame(converter, converter_xptr);
default:
Expand Down
1 change: 1 addition & 0 deletions r/src/materialize_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ enum VectorType {
VECTOR_TYPE_BLOB,
VECTOR_TYPE_LIST_OF,
VECTOR_TYPE_DATA_FRAME,
VECTOR_TYPE_MATRIX,
VECTOR_TYPE_OTHER
};

Expand Down
Loading

0 comments on commit 1eaa8d5

Please sign in to comment.