diff --git a/regression_testing/cases/github-cases/case-1008.conf b/regression_testing/cases/github-cases/case-1008.conf new file mode 100755 index 00000000..9e59ba03 --- /dev/null +++ b/regression_testing/cases/github-cases/case-1008.conf @@ -0,0 +1 @@ +force-output: yes diff --git a/regression_testing/cases/github-cases/case-1008@1.html b/regression_testing/cases/github-cases/case-1008@1.html new file mode 100644 index 00000000..5c38c196 --- /dev/null +++ b/regression_testing/cases/github-cases/case-1008@1.html @@ -0,0 +1 @@ +
+ + ++ + + + diff --git a/regression_testing/cases/github-expects/case-1008.txt b/regression_testing/cases/github-expects/case-1008.txt new file mode 100644 index 00000000..0d7ddfd9 --- /dev/null +++ b/regression_testing/cases/github-expects/case-1008.txt @@ -0,0 +1,26 @@ +line 1 column 1 - Warning: missing '>' for end of tag +line 1 column 1 - Warning: missing declaration +line 1 column 1 - Warning: inserting implicit +line 1 column 6 - Warning: missing +line 1 column 1 - Warning: missing+line 1 column 1 - Warning: inserting missing 'title' element +line 1 column 6 - Warning: trimming empty +line 1 column 1 - Warning: trimming empty+Info: Document content looks like HTML5 +Tidy found 8 warnings and 0 errors! + +One or more empty elements were present in the source document but +dropped on output. If these elements are necessary or you don't want +this behavior, then consider setting the option "drop-empty-elements" +to no. + +About HTML Tidy: https://github.com/htacg/tidy-html5 +Bug reports and comments: https://github.com/htacg/tidy-html5/issues +Official mailing list: https://lists.w3.org/Archives/Public/public-htacg/ +Latest HTML specification: https://html.spec.whatwg.org/multipage/ +Validate your HTML documents: https://validator.w3.org/nu/ +Lobby your company to join the W3C: https://www.w3.org/Consortium + +Do you speak a language other than English, or a different variant of +English? Consider helping us to localize HTML Tidy. For details please see +https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md diff --git a/src/clean.c b/src/clean.c index e0cd3baa..48e3a527 100644 --- a/src/clean.c +++ b/src/clean.c @@ -1824,13 +1824,23 @@ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node) c = (byte) lexer->lexbuf[i]; /* look for UTF-8 multibyte character */ + int bytes = 0; if ( c > 0x7F ) - i += TY_(GetUTF8)( lexer->lexbuf + i, &c ); + bytes = TY_(GetUTF8)( lexer->lexbuf + i, &c ); if ( c == 160 ) c = ' '; - p = TY_(PutUTF8)(p, c); + /* don't copy replacement char on invalid UTF-8, as it might */ + /* be larger than original char and overflow the buffer */ + if(bytes > 0) { + p = TY_(PutUTF8)(p, c); + } else { + *p = lexer->lexbuf[i]; + p++; + } + + i += bytes; } node->end = p - lexer->lexbuf; } diff --git a/src/mappedio.c b/src/mappedio.c index c2c403d9..c29d354a 100644 --- a/src/mappedio.c +++ b/src/mappedio.c @@ -156,7 +156,7 @@ static int TIDY_CALL mapped_getByte( void *sourceData ) static Bool TIDY_CALL mapped_eof( void *sourceData ) { MappedFileSource *data = sourceData; - return ( data->pos >= data->size ); + return ( data->pos + (data->iter - data->view) >= data->size ); } static void TIDY_CALL mapped_ungetByte( void *sourceData, byte ARG_UNUSED(bt) ) diff --git a/src/parser.c b/src/parser.c index fd066113..3b77753d 100644 --- a/src/parser.c +++ b/src/parser.c @@ -4458,9 +4458,10 @@ Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) DEBUG_LOG_EXIT; return NULL; } + + TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */ } - TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */ while ( state != STATE_COMPLETE ) { diff --git a/src/tmbstr.c b/src/tmbstr.c index e3526987..447c6dd2 100644 --- a/src/tmbstr.c +++ b/src/tmbstr.c @@ -255,7 +255,7 @@ void TY_(strrep)(tmbstr buffer, ctmbstr str, ctmbstr rep) if(p) { char buf[1024]; - memset(buf,'\0',strlen(buf)); + memset(buf,'\0',sizeof(buf)); if(buffer == p) {