From 6f500cba7eced21202161acc91bfcd044fc95a21 Mon Sep 17 00:00:00 2001
From: Slobodan Ilic
Date: Mon, 24 Jun 2024 17:06:35 +0200
Subject: [PATCH] Change parser to full-ragel

---
Review notes (text in this section is ignored by `git am`):

What the patch does
  * Adds a second Ragel machine, `mr_parser`, plus `parse_mr_string()`.
    The machine fires the `mr_line` action on every '\n'; the action copies
    the bytes between `start + 1` and `p` into a scratch buffer, grows
    `*mr_sets` by one element and hands the copy to `parse_mr_line()`.
  * `sav_read_multiple_response_sets()` now calls `parse_mr_string()` instead
    of running its own `strtok()` loop.
  * The header exports `parse_mr_string()` in place of `parse_mr_line()`.

Changes made during review (inside the `mr_line` action, in both the .rl
source and the generated .c copy)
  * Reject `p == start`. The generated tables take the action transition
    from state 3 on a second consecutive '\n', and the original code then
    wrote `mln[-1]`.
  * Check the results of malloc() and realloc(); realloc() goes through a
    temporary so `*mr_sets` stays valid on failure.
  * free() the scratch copy after `parse_mr_line()`. The removed strtok()
    loop passed tokens that lived inside `mr_string`, which is freed at
    `cleanup`, so the parsed result does not need the line to stay alive.
  * The action grew by 17 lines: hunk counts, the diffstat and the `#line`
    directives that follow the action were shifted accordingly. The
    `index` blob ids below are still those of the original patch.

NOTE(review): `line_char = any - (line_end + line_start);` -- `+` is Ragel's
postfix repetition operator, so the subtrahend is the two-or-more byte
sequence "\n+ $" and nothing is removed from `any`. The generated tables
confirm it: state 2 stays in state 2 on every byte except '\n', and state 3
falls back to state 2 on any byte other than '\0' and '\n', so only the
first line is required to start with '$'. If excluding '$' and '\n' was the
intent, use `(line_end | line_start)` and regenerate the .c with ragel; the
tables are left untouched here.

NOTE(review): the `#include` targets in the first readstat_sav_read.c hunk
are missing in this copy of the patch; restore them from the original
commit before applying.

 src/spss/readstat_sav_parse_mr_name.c  | 195 ++++++++++++++++++++++++-
 src/spss/readstat_sav_parse_mr_name.h  |   2 +-
 src/spss/readstat_sav_parse_mr_name.rl |  69 +++++++-
 src/spss/readstat_sav_read.c           |  23 +---
 4 files changed, 259 insertions(+), 30 deletions(-)

diff --git a/src/spss/readstat_sav_parse_mr_name.c b/src/spss/readstat_sav_parse_mr_name.c
index f105a28..3fb8318 100644
--- a/src/spss/readstat_sav_parse_mr_name.c
+++ b/src/spss/readstat_sav_parse_mr_name.c
@@ -283,9 +283,200 @@ readstat_error_t extract_mr_data(const char *line, mr_set_t *result) {
 
 
 readstat_error_t parse_mr_line(const char *line, mr_set_t *result) {
-    readstat_error_t retval = READSTAT_OK;
     *result = (mr_set_t){0};
+    return extract_mr_data(line, result);
+}
+
+
+#line 292 "./src/spss/readstat_sav_parse_mr_name.c"
+static const char _mr_parser_actions[] = {
+	0, 1, 0
+};
+
+static const char _mr_parser_key_offsets[] = {
+	0, 0, 1, 2, 4
+};
+
+static const char _mr_parser_trans_keys[] = {
+	36, 10, 0, 10, 10, 0
+};
+
+static const char _mr_parser_single_lengths[] = {
+	0, 1, 1, 2, 1
+};
+
+static const char _mr_parser_range_lengths[] = {
+	0, 0, 0, 0, 0
+};
+
+static const char _mr_parser_index_offsets[] = {
+	0, 0, 2, 4, 7
+};
+
+static const char _mr_parser_indicies[] = {
+	0, 1, 2, 0, 3, 2, 0, 2, 
+	0, 0
+};
+
+static const char _mr_parser_trans_targs[] = {
+	2, 0, 3, 4
+};
+
+static const char _mr_parser_trans_actions[] = {
+	0, 0, 1, 0
+};
+
+static const int mr_parser_start = 1;
+
+static const int mr_parser_en_main = 1;
+
+
+#line 174 "./src/spss/readstat_sav_parse_mr_name.rl"
+
+
+readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines) {
+    readstat_error_t retval = READSTAT_OK;
+    int cs = 0;
+    char *p = (char *)line;
+    char *start = p;
+    char *pe = p + strlen(p) + 1;
+    *mr_sets = NULL;
+    *n_mr_lines = 0;
+
+    
+#line 348 "./src/spss/readstat_sav_parse_mr_name.c"
+	{
+	cs = mr_parser_start;
+	}
+
+#line 186 "./src/spss/readstat_sav_parse_mr_name.rl"
+    
+#line 355 "./src/spss/readstat_sav_parse_mr_name.c"
+	{
+	int _klen;
+	unsigned int _trans;
+	const char *_acts;
+	unsigned int _nacts;
+	const char *_keys;
+
+	if ( p == pe )
+		goto _test_eof;
+	if ( cs == 0 )
+		goto _out;
+_resume:
+	_keys = _mr_parser_trans_keys + _mr_parser_key_offsets[cs];
+	_trans = _mr_parser_index_offsets[cs];
+
+	_klen = _mr_parser_single_lengths[cs];
+	if ( _klen > 0 ) {
+		const char *_lower = _keys;
+		const char *_mid;
+		const char *_upper = _keys + _klen - 1;
+		while (1) {
+			if ( _upper < _lower )
+				break;
+
+			_mid = _lower + ((_upper-_lower) >> 1);
+			if ( (*p) < *_mid )
+				_upper = _mid - 1;
+			else if ( (*p) > *_mid )
+				_lower = _mid + 1;
+			else {
+				_trans += (unsigned int)(_mid - _keys);
+				goto _match;
+			}
+		}
+		_keys += _klen;
+		_trans += _klen;
+	}
+
+	_klen = _mr_parser_range_lengths[cs];
+	if ( _klen > 0 ) {
+		const char *_lower = _keys;
+		const char *_mid;
+		const char *_upper = _keys + (_klen<<1) - 2;
+		while (1) {
+			if ( _upper < _lower )
+				break;
+
+			_mid = _lower + (((_upper-_lower) >> 1) & ~1);
+			if ( (*p) < _mid[0] )
+				_upper = _mid - 2;
+			else if ( (*p) > _mid[1] )
+				_lower = _mid + 2;
+			else {
+				_trans += (unsigned int)((_mid - _keys)>>1);
+				goto _match;
+			}
+		}
+		_trans += _klen;
+	}
+
+_match:
+	_trans = _mr_parser_indicies[_trans];
+	cs = _mr_parser_trans_targs[_trans];
 
-    retval = extract_mr_data(line, result);
+	if ( _mr_parser_trans_actions[_trans] == 0 )
+		goto _again;
+
+	_acts = _mr_parser_actions + _mr_parser_trans_actions[_trans];
+	_nacts = (unsigned int) *_acts++;
+	while ( _nacts-- > 0 )
+	{
+		switch ( *_acts++ )
+		{
+	case 0:
+#line 140 "./src/spss/readstat_sav_parse_mr_name.rl"
+	{
+        /* p is on the newline ending this line; start is on its first byte. */
+        if (p <= start) {
+            retval = READSTAT_ERROR_BAD_MR_STRING;
+            goto cleanup;
+        }
+        size_t mln_len = (size_t)(p - start) - 1;
+        char *mln = malloc(mln_len + 1);
+        if (mln == NULL) {
+            retval = READSTAT_ERROR_MALLOC;
+            goto cleanup;
+        }
+        memcpy(mln, start + 1, mln_len);
+        mln[mln_len] = '\0';
+        mr_set_t *grown = realloc(*mr_sets, ((*n_mr_lines) + 1) * sizeof(mr_set_t));
+        if (grown == NULL) {
+            free(mln);
+            retval = READSTAT_ERROR_MALLOC;
+            goto cleanup;
+        }
+        *mr_sets = grown;
+        retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines]);
+        free(mln); /* scratch copy is only needed for the call above */
+        if (retval != READSTAT_OK) goto cleanup;
+        (*n_mr_lines)++;
+        start = p + 1;
+    }
+	break;
+#line 459 "./src/spss/readstat_sav_parse_mr_name.c"
+		}
+	}
+
+_again:
+	if ( cs == 0 )
+		goto _out;
+	if ( ++p != pe )
+		goto _resume;
+	_test_eof: {}
+	_out: {}
+	}
+
+#line 187 "./src/spss/readstat_sav_parse_mr_name.rl"
+
+    if (cs < 4 || p != pe) {
+        retval = READSTAT_ERROR_BAD_MR_STRING;
+        goto cleanup;
+    }
+
+    (void)mr_parser_en_main;
+
+cleanup:
     return retval;
 }
diff --git a/src/spss/readstat_sav_parse_mr_name.h b/src/spss/readstat_sav_parse_mr_name.h
index 8d52e3e..3975216 100644
--- a/src/spss/readstat_sav_parse_mr_name.h
+++ b/src/spss/readstat_sav_parse_mr_name.h
@@ -4,6 +4,6 @@
 #include "../readstat.h"
 #include "../readstat_malloc.h"
 
-readstat_error_t parse_mr_line(const char *line, mr_set_t *result);
+readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines);
 
 #endif // READSTAT_PARSE_MR_NAME_H
diff --git a/src/spss/readstat_sav_parse_mr_name.rl b/src/spss/readstat_sav_parse_mr_name.rl
index 29d1740..c9afae8 100644
--- a/src/spss/readstat_sav_parse_mr_name.rl
+++ b/src/spss/readstat_sav_parse_mr_name.rl
@@ -60,13 +60,13 @@
         mr_subvariables[mr_subvar_count++] = subvar;
     }
 
-    name = (alnum | '_')+ '=' > extract_mr_name;
+    nc = (alnum | '_'); # name character
+    name = nc+ '=' > extract_mr_name;
     type = ('C' | 'D'){1} > extract_mr_type;
     counted_value = digit* ' ' > extract_counted_value;
     label = digit+ ' '+ > extract_label;
 
-    nc = (alnum | '_'); # name character
-    end = (space | '\0'); # token terminator
+    end = (space | '\0'); # subvar token terminator
     subvariable = (nc+ end >extract_subvar);
 
     main := name type counted_value label subvariable+;
@@ -130,9 +130,68 @@ cleanup:
 
 
 readstat_error_t parse_mr_line(const char *line, mr_set_t *result) {
-    readstat_error_t retval = READSTAT_OK;
     *result = (mr_set_t){0};
+    return extract_mr_data(line, result);
+}
+
+%%{
+    machine mr_parser;
+
+    action mr_line {
+        /* p is on the newline ending this line; start is on its first byte. */
+        if (p <= start) {
+            retval = READSTAT_ERROR_BAD_MR_STRING;
+            goto cleanup;
+        }
+        size_t mln_len = (size_t)(p - start) - 1;
+        char *mln = malloc(mln_len + 1);
+        if (mln == NULL) {
+            retval = READSTAT_ERROR_MALLOC;
+            goto cleanup;
+        }
+        memcpy(mln, start + 1, mln_len);
+        mln[mln_len] = '\0';
+        mr_set_t *grown = realloc(*mr_sets, ((*n_mr_lines) + 1) * sizeof(mr_set_t));
+        if (grown == NULL) {
+            free(mln);
+            retval = READSTAT_ERROR_MALLOC;
+            goto cleanup;
+        }
+        *mr_sets = grown;
+        retval = parse_mr_line(mln, &(*mr_sets)[*n_mr_lines]);
+        free(mln); /* scratch copy is only needed for the call above */
+        if (retval != READSTAT_OK) goto cleanup;
+        (*n_mr_lines)++;
+        start = p + 1;
+    }
+    line_start = '$';
+    line_end = '\n';
+    line_char = any - (line_end + line_start);
+    mr_line = line_start line_char* line_end > mr_line;
+    main := mr_line+ '\0';
+
+    write data nofinal noerror;
+}%%
 
-    retval = extract_mr_data(line, result);
+readstat_error_t parse_mr_string(const char *line, mr_set_t **mr_sets, size_t *n_mr_lines) {
+    readstat_error_t retval = READSTAT_OK;
+    int cs = 0;
+    char *p = (char *)line;
+    char *start = p;
+    char *pe = p + strlen(p) + 1;
+    *mr_sets = NULL;
+    *n_mr_lines = 0;
+
+    %% write init;
+    %% write exec;
+
+    if (cs < %%{ write first_final; }%% || p != pe) {
+        retval = READSTAT_ERROR_BAD_MR_STRING;
+        goto cleanup;
+    }
+
+    (void)mr_parser_en_main;
+
+cleanup:
     return retval;
 }
diff --git a/src/spss/readstat_sav_read.c b/src/spss/readstat_sav_read.c
index f9bd72b..fcafa0f 100644
--- a/src/spss/readstat_sav_read.c
+++ b/src/spss/readstat_sav_read.c
@@ -1,7 +1,6 @@
 
 #include
 #include
-#include
 #include
 #include
 #include
@@ -30,10 +29,6 @@
 #define DATA_BUFFER_SIZE            65536
 #define VERY_LONG_STRING_MAX_LENGTH INT_MAX
 
-// #ifdef _WIN32
-// #define strtok_r(s,d,p) strtok_s(s,d,p)
-// #endif
-
 /* Others defined in table below */
 
 /* See http://msdn.microsoft.com/en-us/library/dd317756(VS.85).aspx */
@@ -172,23 +167,7 @@ static readstat_error_t sav_read_multiple_response_sets(size_t data_len, sav_ctx
         goto cleanup;
     }
 
-    // char *saveptr;
-    // char *token = strtok_r(mr_string, "$\n", &saveptr);
-    char *token = strtok(mr_string, "$\n");
-
-    int num_lines = 0;
-    while (token != NULL) {
-        if ((ctx->mr_sets = readstat_realloc(ctx->mr_sets, (num_lines + 1) * sizeof(mr_set_t))) == NULL) {
-            retval = READSTAT_ERROR_MALLOC;
-            goto cleanup;
-        }
-        retval = parse_mr_line(token, &ctx->mr_sets[num_lines]);
-        if (retval != READSTAT_OK) goto cleanup;
-        num_lines++;
-        // token = strtok_r(NULL, "$\n", &saveptr);
-        token = strtok(NULL, "$\n");
-    }
-    ctx->multiple_response_sets_length = num_lines;
+    retval = parse_mr_string(mr_string, &ctx->mr_sets, &ctx->multiple_response_sets_length);
 
 cleanup:
     free(mr_string);