From 6b1a981aa4a6bfe3ed46d624e6c86c480b886ead Mon Sep 17 00:00:00 2001 From: Rafal Chojna Date: Mon, 2 Oct 2023 17:39:09 +0200 Subject: [PATCH] Implement a handler for file header. This allows clients to learn the file version, compression, writer version, etc. Before, this information was impossible to extract from the library. --- src/rdata.h | 27 +++++++++++++++++++++++++++ src/rdata_internal.h | 10 +++------- src/rdata_parser.c | 5 +++++ src/rdata_read.c | 38 ++++++++++++++++++++++++++++++++++---- src/rdata_write.c | 2 +- 5 files changed, 70 insertions(+), 12 deletions(-) diff --git a/src/rdata.h b/src/rdata.h index 2bf4243..c94aba6 100644 --- a/src/rdata.h +++ b/src/rdata.h @@ -42,6 +42,22 @@ typedef enum rdata_file_format_e { RDATA_SINGLE_OBJECT } rdata_file_format_t; +typedef enum rdata_compression_e { /* compression value reported to rdata_header_handler */ + RDATA_COMPRESSION_NONE, + RDATA_COMPRESSION_GZIP, + RDATA_COMPRESSION_BZIP2, + RDATA_COMPRESSION_LZMA +} rdata_compression_t; + +#pragma pack(push, 1) +typedef struct rdata_header_s { /* header fields passed to rdata_header_handler; laid out with 1-byte packing (see the surrounding #pragma pack) */ + char header[2]; /* two-byte marker; rdata_begin_file writes "X\n" here */ + uint32_t format_version; /* file format version */ + uint32_t writer_version; /* version of R that wrote the file */ + uint32_t reader_version; /* minimal version of R required to read the file back */ +} rdata_header_t; +#pragma pack(pop) + const char *rdata_error_message(rdata_error_t error_code); typedef int (*rdata_column_handler)(const char *name, rdata_type_t type, @@ -52,6 +68,15 @@ typedef int (*rdata_column_name_handler)(const char *value, int index, void *ctx typedef void (*rdata_error_handler)(const char *error_message, void *ctx); typedef int (*rdata_progress_handler)(double progress, void *ctx); +/* Callback type for passing meta-information from file header to the caller. 
Arguments: + * - compression - compression algorithm used on the file, + * - header_line - first 5 bytes of the file containing information about file type (RData vs RDS, ascii vs binary, etc.); the buffer is NOT NUL-terminated, + * - header - structure with information about file version, writer R version and minimal R version required to read the file back, + * - ctx - user context, same as in all other handlers. + * header_line and header point to locals of rdata_parse(): they must not be freed by the user or kept after rdata_parse() returns. A non-zero return value aborts parsing with RDATA_ERROR_USER_ABORT. + */ +typedef int (*rdata_header_handler)(rdata_compression_t compression, const char *header_line, const rdata_header_t *header, void *ctx); + #if defined(_MSC_VER) #include <BaseTsd.h> typedef SSIZE_T ssize_t; @@ -96,6 +121,7 @@ typedef struct rdata_parser_s { rdata_column_handler dim_handler; rdata_text_value_handler dim_name_handler; rdata_error_handler error_handler; + rdata_header_handler header_handler; rdata_io_t *io; } rdata_parser_t; @@ -111,6 +137,7 @@ rdata_error_t rdata_set_value_label_handler(rdata_parser_t *parser, rdata_text_v rdata_error_t rdata_set_dim_handler(rdata_parser_t *parser, rdata_column_handler dim_handler); rdata_error_t rdata_set_dim_name_handler(rdata_parser_t *parser, rdata_text_value_handler dim_name_handler); rdata_error_t rdata_set_error_handler(rdata_parser_t *parser, rdata_error_handler error_handler); +rdata_error_t rdata_set_header_handler(rdata_parser_t *parser, rdata_header_handler header_handler); rdata_error_t rdata_set_open_handler(rdata_parser_t *parser, rdata_open_handler open_handler); rdata_error_t rdata_set_close_handler(rdata_parser_t *parser, rdata_close_handler close_handler); rdata_error_t rdata_set_seek_handler(rdata_parser_t *parser, rdata_seek_handler seek_handler); diff --git a/src/rdata_internal.h b/src/rdata_internal.h index 98cd6a1..db17d60 100644 --- a/src/rdata_internal.h +++ b/src/rdata_internal.h @@ -6,13 +6,6 @@ #pragma pack(push, 1) -typedef struct rdata_v2_header_s { - char header[2]; - uint32_t format_version; - uint32_t writer_version; - uint32_t reader_version; -} 
rdata_v2_header_t; - typedef struct rdata_sexptype_header_s { unsigned int type:8; unsigned int object:1; @@ -78,3 +71,6 @@ typedef struct rdata_sexptype_info_s { #define RDATA_SEXPTYPE_LANGUAGE_OBJECT_ATTR 240 #define RDATA_SEXPTYPE_PAIRLIST_ATTR 239 #define RDATA_PSEUDO_SXP_ALTREP 238 + +/* Number of bytes read from the beginning of the file to determine the file format. */ +#define RDATA_HEADER_LENGTH 5 diff --git a/src/rdata_parser.c b/src/rdata_parser.c index 67f782e..27d8f31 100644 --- a/src/rdata_parser.c +++ b/src/rdata_parser.c @@ -63,6 +63,11 @@ rdata_error_t rdata_set_error_handler(rdata_parser_t *parser, rdata_error_handle return RDATA_OK; } +rdata_error_t rdata_set_header_handler(rdata_parser_t *parser, rdata_header_handler header_handler) { /* Stores the header callback on the parser; parser is dereferenced unchecked and must be non-NULL. Always returns RDATA_OK. */ + parser->header_handler = header_handler; + return RDATA_OK; +} + rdata_error_t rdata_set_open_handler(rdata_parser_t *parser, rdata_open_handler open_handler) { parser->io->open = open_handler; return RDATA_OK; diff --git a/src/rdata_read.c b/src/rdata_read.c index 8624193..0458332 100644 --- a/src/rdata_read.c +++ b/src/rdata_read.c @@ -60,7 +60,8 @@ typedef struct rdata_ctx_s { rdata_text_value_handler value_label_handler; rdata_column_handler dim_handler; rdata_text_value_handler dim_name_handler; - rdata_error_handler error_handler; + rdata_error_handler error_handler; + rdata_header_handler header_handler; void *user_ctx; #if HAVE_BZIP2 bz_stream *bz_strm; @@ -656,9 +657,10 @@ void free_rdata_ctx(rdata_ctx_t *ctx) { rdata_error_t rdata_parse(rdata_parser_t *parser, const char *filename, void *user_ctx) { int is_rdata = 0; rdata_error_t retval = RDATA_OK; - rdata_v2_header_t v2_header; + rdata_header_t v2_header; rdata_ctx_t *ctx = rdata_ctx_init(parser->io, filename); char *encoding = NULL; + rdata_compression_t compression = RDATA_COMPRESSION_NONE; if (ctx == NULL) { retval = RDATA_ERROR_OPEN; @@ -675,6 +677,7 @@ rdata_error_t rdata_parse(rdata_parser_t *parser, const char 
*filename, void *us ctx->dim_handler = parser->dim_handler; ctx->dim_name_handler = parser->dim_name_handler; ctx->error_handler = parser->error_handler; + ctx->header_handler = parser->header_handler; ctx->is_dimnames = false; @@ -682,8 +685,8 @@ rdata_error_t rdata_parse(rdata_parser_t *parser, const char *filename, void *us goto cleanup; } - char header_line[5]; - if (read_st(ctx, &header_line, sizeof(header_line)) != sizeof(header_line)) { + char header_line[RDATA_HEADER_LENGTH] = ""; + if (read_st(ctx, &header_line, RDATA_HEADER_LENGTH) != RDATA_HEADER_LENGTH) { retval = RDATA_ERROR_READ; goto cleanup; } @@ -704,6 +707,33 @@ rdata_error_t rdata_parse(rdata_parser_t *parser, const char *filename, void *us v2_header.reader_version = byteswap4(v2_header.reader_version); } + if (ctx->header_handler) { +#if HAVE_BZIP2 + if (ctx->bz_strm) { + compression = RDATA_COMPRESSION_BZIP2; + } +#endif +#if HAVE_APPLE_COMPRESSION + if (ctx->compression_strm) { + compression = RDATA_COMPRESSION_LZMA; + } +#endif +#if HAVE_ZLIB + if (ctx->z_strm) { + compression = RDATA_COMPRESSION_GZIP; + } +#endif +#if HAVE_LZMA + if (ctx->lzma_strm) { + compression = RDATA_COMPRESSION_LZMA; + } +#endif + if(ctx->header_handler(compression, header_line, &v2_header, ctx->user_ctx)) { + retval = RDATA_ERROR_USER_ABORT; + goto cleanup; + } + } + if (is_rdata && v2_header.format_version != header_line[3] - '0') { retval = RDATA_ERROR_PARSE; goto cleanup; diff --git a/src/rdata_write.c b/src/rdata_write.c index 4e668f2..ac7945a 100644 --- a/src/rdata_write.c +++ b/src/rdata_write.c @@ -243,7 +243,7 @@ rdata_error_t rdata_begin_file(rdata_writer_t *writer, void *user_ctx) { goto cleanup; } - rdata_v2_header_t v2_header; + rdata_header_t v2_header; memcpy(v2_header.header, "X\n", sizeof("X\n")-1); v2_header.format_version = 2; v2_header.reader_version = 131840;