Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle invalid multi-byte sequences in iconv encoding conversions #264

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions src/readstat.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,20 @@ typedef int (*readstat_value_label_handler)(const char *val_labels,
readstat_value_t value, const char *label, void *ctx);
typedef void (*readstat_error_handler)(const char *error_message, void *ctx);
typedef int (*readstat_progress_handler)(double progress, void *ctx);
/* Callback invoked by the readers when readstat_convert() reports
 * READSTAT_ERROR_CONVERT_BAD_STRING for a string value.
 *
 *   dst, dst_len  - output buffer (and its size in bytes) that the reader
 *                   subsequently uses as the string value
 *   src, src_len  - the raw bytes that failed to convert; callers pass a
 *                   pointer into a row buffer plus a width, so do not assume
 *                   src is NUL-terminated
 *   obs_index     - row number as passed by the reader (callers pass their
 *                   current row counter + 1)
 *   variable      - the variable the value belongs to
 *   ctx           - the user context pointer
 *
 * Return READSTAT_HANDLER_OK to continue with the contents of dst; any other
 * value makes the reader stop with READSTAT_ERROR_USER_ABORT. */
typedef int (*readstat_invalid_string_handler)(char *dst, size_t dst_len,
const char *src, size_t src_len, int obs_index, readstat_variable_t *variable,
void *ctx);

/* Built-in handler: prints the variable name, row and offending string with
 * printf() and returns READSTAT_HANDLER_ABORT. */
int readstat_invalid_string_info(char *dst, size_t dst_len, const char *src, size_t src_len,
int obs_index, readstat_variable_t *variable, void *ctx);
/* Built-in handler: strips trailing spaces and copies the raw bytes to dst
 * unchanged; aborts if they do not fit together with the terminating NUL. */
int readstat_invalid_string_copy(char *dst, size_t dst_len, const char *src, size_t src_len,
int obs_index, readstat_variable_t *variable, void *ctx);
/* Built-in handler: replaces the value with an empty string and continues. */
int readstat_invalid_string_skip(char *dst, size_t dst_len, const char *src, size_t src_len,
int obs_index, readstat_variable_t *variable, void *ctx);
/* Built-in handler: treats the bytes as UTF-8 and substitutes the Unicode
 * replacement character (EF BF BD) for each invalid byte. */
int readstat_invalid_string_utf8(char *dst, size_t dst_len, const char *src, size_t src_len,
int obs_index, readstat_variable_t *variable, void *ctx);
/* Built-in handler: re-converts the bytes from WINDOWS-1252 to UTF-8. */
int readstat_invalid_string_cp1252(char *dst, size_t dst_len, const char *src, size_t src_len,
int obs_index, readstat_variable_t *variable, void *ctx);

#if defined(_MSC_VER)
#include <BaseTsd.h>
Expand Down Expand Up @@ -334,14 +348,15 @@ typedef struct readstat_io_s {
} readstat_io_t;

typedef struct readstat_callbacks_s {
readstat_metadata_handler metadata;
readstat_note_handler note;
readstat_variable_handler variable;
readstat_fweight_handler fweight;
readstat_value_handler value;
readstat_value_label_handler value_label;
readstat_error_handler error;
readstat_progress_handler progress;
readstat_metadata_handler metadata;
readstat_note_handler note;
readstat_variable_handler variable;
readstat_fweight_handler fweight;
readstat_value_handler value;
readstat_value_label_handler value_label;
readstat_error_handler error;
readstat_progress_handler progress;
readstat_invalid_string_handler invalid_string;
} readstat_callbacks_t;

typedef struct readstat_parser_s {
Expand All @@ -365,6 +380,7 @@ readstat_error_t readstat_set_value_handler(readstat_parser_t *parser, readstat_
readstat_error_t readstat_set_value_label_handler(readstat_parser_t *parser, readstat_value_label_handler value_label_handler);
readstat_error_t readstat_set_error_handler(readstat_parser_t *parser, readstat_error_handler error_handler);
readstat_error_t readstat_set_progress_handler(readstat_parser_t *parser, readstat_progress_handler progress_handler);
readstat_error_t readstat_set_invalid_string_handler(readstat_parser_t *parser, readstat_invalid_string_handler invalid_string_handler);

readstat_error_t readstat_set_open_handler(readstat_parser_t *parser, readstat_open_handler open_handler);
readstat_error_t readstat_set_close_handler(readstat_parser_t *parser, readstat_close_handler close_handler);
Expand Down
111 changes: 109 additions & 2 deletions src/readstat_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, si
char *dst_end = dst;
size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);
if (status == (size_t)-1) {
if (errno == E2BIG) {
if (errno == E2BIG) { /* E2BIG indicates that the output buffer is not large enough */
return READSTAT_ERROR_CONVERT_LONG_STRING;
} else if (errno == EILSEQ) {
} else if (errno == EILSEQ) { /* EILSEQ indicates an invalid multibyte sequence */
return READSTAT_ERROR_CONVERT_BAD_STRING;
} else if (errno != EINVAL) { /* EINVAL indicates improper truncation; accept it */
return READSTAT_ERROR_CONVERT;
Expand All @@ -34,3 +34,110 @@ readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, si
}
return READSTAT_OK;
}

/* Invalid-string handler that reports the offending value and aborts.
 *
 * Prints the variable name, the row number and the raw source bytes to
 * stdout, then returns READSTAT_HANDLER_ABORT so the caller stops parsing.
 * dst, dst_len and ctx are unused. */
int readstat_invalid_string_info(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* src is a fixed-width field of src_len bytes taken from a row buffer and
     * is not guaranteed to be NUL-terminated, so bound the print explicitly
     * instead of relying on "%s" finding a terminator. */
    printf("Invalid string in variable %s, row %d: \"%.*s\"\n",
            variable->name, obs_index, (int)src_len, src);

    return READSTAT_HANDLER_ABORT;
}

/* Invalid-string handler that passes the raw bytes through untouched.
 *
 * Trailing ASCII spaces are removed first (the file formats pad with ASCII
 * spaces even when the declared encoding is not ASCII). The remaining bytes
 * are copied to dst and NUL-terminated.
 *
 * Returns READSTAT_HANDLER_OK on success, or READSTAT_HANDLER_ABORT when dst
 * cannot hold the trimmed bytes plus the terminator. */
int readstat_invalid_string_copy(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    size_t trimmed_len = src_len;

    /* walk back over the space padding */
    for (; trimmed_len > 0; trimmed_len--) {
        if (src[trimmed_len-1] != ' ') {
            break;
        }
    }

    /* need room for the payload and the terminating NUL */
    if (dst_len < trimmed_len + 1) {
        return READSTAT_HANDLER_ABORT;
    }

    memcpy(dst, src, trimmed_len);
    dst[trimmed_len] = '\0';

    return READSTAT_HANDLER_OK;
}

/* Invalid-string handler that discards the value.
 *
 * Writes an empty string to dst and returns READSTAT_HANDLER_OK so parsing
 * continues. Returns READSTAT_HANDLER_ABORT if dst has no room for the
 * terminating NUL, matching the behaviour of the copy handler. src, src_len,
 * obs_index, variable and ctx are unused. */
int readstat_invalid_string_skip(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* never write past a zero-length (or missing) output buffer */
    if (!dst || dst_len == 0) {
        return READSTAT_HANDLER_ABORT;
    }

    dst[0] = '\0';

    return READSTAT_HANDLER_OK;
}

/* Invalid-string handler that treats the input as UTF-8 and substitutes the
 * Unicode replacement character (U+FFFD, bytes EF BF BD) for every byte that
 * iconv rejects as an invalid multibyte sequence.
 *
 * Trailing ASCII spaces are stripped from src before conversion. The result
 * written to dst is always NUL-terminated on success.
 *
 * Returns READSTAT_HANDLER_OK on success; READSTAT_HANDLER_ABORT if dst is
 * too small, the converter cannot be opened, or iconv fails with an error
 * other than EILSEQ/EINVAL. The iconv descriptor is released on every path. */
int readstat_invalid_string_utf8(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    static const char replacement[3] = { (char)0xEF, (char)0xBF, (char)0xBD };
    int retval = READSTAT_HANDLER_ABORT;

    /* need at least one byte for the terminator; also keeps dst_len - 1
     * below from wrapping around */
    if (dst_len == 0) {
        return READSTAT_HANDLER_ABORT;
    }

    /* strip off spaces from the input because the programs use ASCII space
     * padding even with non-ASCII encoding. */
    while (src_len && src[src_len-1] == ' ') {
        src_len--;
    }

    iconv_t converter = iconv_open("UTF-8", "UTF-8");
    if (converter == (iconv_t)-1) {
        return READSTAT_HANDLER_ABORT;
    }

    size_t dst_left = dst_len - 1;
    char *dst_end = dst;
    size_t src_left = src_len;
    const char *src_end = src;

    while (src_left > 0) {
        size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src_end, &src_left, &dst_end, &dst_left);
        if (status != (size_t)-1) {
            continue;
        }

        if (errno == EILSEQ) { /* EILSEQ indicates an invalid multibyte sequence */
            if (dst_left < sizeof(replacement)) {
                goto cleanup;
            }
            /* emit U+FFFD and step over the single offending input byte */
            memcpy(dst_end, replacement, sizeof(replacement));
            dst_end += sizeof(replacement);
            dst_left -= sizeof(replacement);
            src_end += 1;
            src_left -= 1;
        } else if (errno == EINVAL) { /* EINVAL indicates improper truncation; accept it */
            break;
        } else { /* E2BIG (output buffer too small) or an unexpected error */
            goto cleanup;
        }
    }

    *dst_end = '\0';
    retval = READSTAT_HANDLER_OK;

cleanup:
    iconv_close(converter);
    return retval;
}

/* Invalid-string handler that re-converts the whole value as WINDOWS-1252,
 * a common cause of mislabelled encodings.
 *
 * Trailing ASCII spaces are stripped from src, then the bytes are converted
 * from WINDOWS-1252 to UTF-8 into dst and NUL-terminated.
 *
 * Returns READSTAT_HANDLER_OK on success; READSTAT_HANDLER_ABORT if dst has
 * no room for a terminator, the converter cannot be opened, or iconv reports
 * any error. The iconv descriptor is released on every path. */
int readstat_invalid_string_cp1252(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    int retval = READSTAT_HANDLER_ABORT;

    /* need at least one byte for the terminator; also keeps dst_len - 1
     * below from wrapping around */
    if (dst_len == 0) {
        return READSTAT_HANDLER_ABORT;
    }

    /* strip the ASCII space padding before converting */
    while (src_len && src[src_len-1] == ' ') {
        src_len--;
    }

    iconv_t converter = iconv_open("UTF-8", "WINDOWS-1252");
    if (converter == (iconv_t)-1) {
        return READSTAT_HANDLER_ABORT;
    }

    size_t dst_left = dst_len - 1;
    char *dst_end = dst;
    size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);

    if (status != (size_t)-1) {
        *dst_end = '\0';
        retval = READSTAT_HANDLER_OK;
    }

    iconv_close(converter);
    return retval;
}

5 changes: 5 additions & 0 deletions src/readstat_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ readstat_error_t readstat_set_progress_handler(readstat_parser_t *parser, readst
return READSTAT_OK;
}

/* Registers the callback used when a string value fails encoding conversion.
 * Stores invalid_string_handler in the parser's handler table and always
 * returns READSTAT_OK. */
readstat_error_t readstat_set_invalid_string_handler(
        readstat_parser_t *parser,
        readstat_invalid_string_handler invalid_string_handler) {
    parser->handlers.invalid_string = invalid_string_handler;

    return READSTAT_OK;
}

readstat_error_t readstat_set_fweight_handler(readstat_parser_t *parser, readstat_fweight_handler fweight_handler) {
parser->handlers.fweight = fweight_handler;
return READSTAT_OK;
Expand Down
11 changes: 10 additions & 1 deletion src/sas/readstat_sas7bdat_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -399,7 +399,16 @@ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable
if (col_info->type == READSTAT_TYPE_STRING) {
retval = readstat_convert(ctx->scratch_buffer, ctx->scratch_buffer_len,
col_data, col_info->width, ctx->converter);
if (retval != READSTAT_OK) {
if (retval == READSTAT_ERROR_CONVERT_BAD_STRING) {
if (!ctx->handle.invalid_string) {
goto cleanup;
} else if (ctx->handle.invalid_string(ctx->scratch_buffer, ctx->scratch_buffer_len,
col_data, col_info->width, ctx->parsed_row_count+1,
variable, ctx->user_ctx) != READSTAT_HANDLER_OK) {
retval = READSTAT_ERROR_USER_ABORT;
goto cleanup;
}
} else if (retval != READSTAT_OK) {
if (ctx->handle.error) {
snprintf(ctx->error_buf, sizeof(ctx->error_buf),
"ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
Expand Down
12 changes: 11 additions & 1 deletion src/sas/readstat_xport_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -565,8 +565,18 @@ static readstat_error_t xport_process_row(xport_ctx_t *ctx, const char *row, siz
}
retval = readstat_convert(string, 4*variable->storage_width+1,
&row[pos], variable->storage_width, ctx->converter);
if (retval != READSTAT_OK)
if (retval == READSTAT_ERROR_CONVERT_BAD_STRING) {
if (!ctx->handle.invalid_string) {
goto cleanup;
} else if (ctx->handle.invalid_string(string, 4*variable->storage_width+1,
&row[pos], variable->storage_width, ctx->parsed_row_count+1,
variable, ctx->user_ctx) != READSTAT_HANDLER_OK) {
retval = READSTAT_ERROR_USER_ABORT;
goto cleanup;
}
} else if (retval != READSTAT_OK) {
goto cleanup;
}

value.v.string_value = string;
} else {
Expand Down
11 changes: 10 additions & 1 deletion src/spss/readstat_por_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,16 @@ static readstat_error_t read_por_file_data(por_ctx_t *ctx) {
}
rs_retval = readstat_convert(output_string, sizeof(output_string),
input_string, strlen(input_string), ctx->converter);
if (rs_retval != READSTAT_OK) {
if (rs_retval == READSTAT_ERROR_CONVERT_BAD_STRING) {
if (!ctx->handle.invalid_string) {
goto cleanup;
} else if (ctx->handle.invalid_string(output_string, sizeof(output_string),
input_string, strlen(input_string), ctx->obs_count+1,
ctx->variables[i], ctx->user_ctx) != READSTAT_HANDLER_OK) {
rs_retval = READSTAT_ERROR_USER_ABORT;
goto cleanup;
}
} else if (rs_retval != READSTAT_OK) {
goto cleanup;
}
value.v.string_value = output_string;
Expand Down
12 changes: 11 additions & 1 deletion src/spss/readstat_sav_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -726,8 +726,18 @@ static readstat_error_t sav_process_row(unsigned char *buffer, size_t buffer_len
if (!ctx->variables[var_info->index]->skip) {
retval = readstat_convert(ctx->utf8_string, ctx->utf8_string_len,
ctx->raw_string, raw_str_used, ctx->converter);
if (retval != READSTAT_OK)
if (retval == READSTAT_ERROR_CONVERT_BAD_STRING) {
if (!ctx->handle.invalid_string) {
goto done;
} else if (ctx->handle.invalid_string(ctx->utf8_string, ctx->utf8_string_len,
ctx->raw_string, raw_str_used, ctx->current_row+1,
ctx->variables[var_info->index], ctx->user_ctx) != READSTAT_HANDLER_OK) {
retval = READSTAT_ERROR_USER_ABORT;
goto done;
}
} else if (retval != READSTAT_OK) {
goto done;
}
value.v.string_value = ctx->utf8_string;
if (ctx->handle.value(ctx->current_row, ctx->variables[var_info->index],
value, ctx->user_ctx) != READSTAT_HANDLER_OK) {
Expand Down
12 changes: 11 additions & 1 deletion src/stata/readstat_dta_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -619,8 +619,18 @@ static readstat_error_t dta_handle_row(const unsigned char *buf, dta_ctx_t *ctx)
size_t str_len = strnlen((const char *)&buf[offset], max_len);
retval = readstat_convert(str_buf, sizeof(str_buf),
(const char *)&buf[offset], str_len, ctx->converter);
if (retval != READSTAT_OK)
if (retval == READSTAT_ERROR_CONVERT_BAD_STRING) {
if (!ctx->handle.invalid_string) {
goto cleanup;
} else if (ctx->handle.invalid_string(str_buf, sizeof(str_buf),
(const char *)&buf[offset], str_len, ctx->current_row+1,
ctx->variables[j], ctx->user_ctx) != READSTAT_HANDLER_OK) {
retval = READSTAT_ERROR_USER_ABORT;
goto cleanup;
}
} else if (retval != READSTAT_OK) {
goto cleanup;
}
value.v.string_value = str_buf;
} else if (value.type == READSTAT_TYPE_STRING_REF) {
dta_strl_t key = dta_interpret_strl_vo_bytes(ctx, &buf[offset]);
Expand Down
11 changes: 10 additions & 1 deletion src/txt/readstat_txt_read.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,17 @@ static readstat_error_t handle_value(readstat_parser_t *parser, iconv_t converte
if (readstat_type_class(variable->type) == READSTAT_TYPE_CLASS_STRING) {
converted_value = malloc(4*len+1);
error = readstat_convert(converted_value, 4 * len + 1, bytes, len, converter);
if (error != READSTAT_OK)
if (error == READSTAT_ERROR_CONVERT_BAD_STRING) {
if (!parser->handlers.invalid_string) {
goto cleanup;
} else if (parser->handlers.invalid_string(converted_value, 4 * len + 1,
bytes, len, obs_index+1, variable, ctx) != READSTAT_HANDLER_OK) {
error = READSTAT_ERROR_USER_ABORT;
goto cleanup;
}
} else if (error != READSTAT_OK) {
goto cleanup;
}
value.v.string_value = converted_value;
} else {
char *endptr = NULL;
Expand Down