From df5bc80e2de84a8dd52799d99f4f3211c43dd987 Mon Sep 17 00:00:00 2001
From: Alex Rebert <alexandre.rebert@gmail.com>
Date: Tue, 26 Oct 2021 23:54:14 -0400
Subject: [PATCH 1/4] Fix memset'ing of buffer in strrep

`strlen(buf)` is not guaranteed to return the size of the buffer: `buf`
is still uninitialized at that point, so it might contain a null byte
anywhere (or none at all). Use `sizeof(buf)` instead.

Fixes #1001
---
 src/tmbstr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/tmbstr.c b/src/tmbstr.c
index e3526987c..447c6dd29 100644
--- a/src/tmbstr.c
+++ b/src/tmbstr.c
@@ -255,7 +255,7 @@ void TY_(strrep)(tmbstr buffer, ctmbstr str, ctmbstr rep)
         if(p)
         {
             char buf[1024];
-            memset(buf,'\0',strlen(buf));
+            memset(buf,'\0',sizeof(buf));
 
             if(buffer == p)
             {

From 8b8b3deb451ea96b51694d8a43dfb87e36d36b47 Mon Sep 17 00:00:00 2001
From: Alex Rebert <alexandre.rebert@gmail.com>
Date: Wed, 27 Oct 2021 14:46:17 -0400
Subject: [PATCH 2/4] Fix infinite loop in ParsePre

The loop was introduced in 91f29ea7 when switching to a non-recursive
algorithm. `InlineDup` should not be called when `ParsePre` restarts.

This should fix the oss-fuzz build failure @ https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=36721
---
 .../cases/github-cases/case-1008.conf         |  1 +
 .../cases/github-cases/case-1008@1.html       |  1 +
 .../cases/github-expects/case-1008.html       |  8 ++++++
 .../cases/github-expects/case-1008.txt        | 26 +++++++++++++++++++
 src/parser.c                                  |  3 ++-
 5 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100755 regression_testing/cases/github-cases/case-1008.conf
 create mode 100644 regression_testing/cases/github-cases/case-1008@1.html
 create mode 100644 regression_testing/cases/github-expects/case-1008.html
 create mode 100644 regression_testing/cases/github-expects/case-1008.txt

diff --git a/regression_testing/cases/github-cases/case-1008.conf b/regression_testing/cases/github-cases/case-1008.conf
new file mode 100755
index 000000000..9e59ba031
--- /dev/null
+++ b/regression_testing/cases/github-cases/case-1008.conf
@@ -0,0 +1 @@
+force-output: yes
diff --git a/regression_testing/cases/github-cases/case-1008@1.html b/regression_testing/cases/github-cases/case-1008@1.html
new file mode 100644
index 000000000..5c38c1966
--- /dev/null
+++ b/regression_testing/cases/github-cases/case-1008@1.html
@@ -0,0 +1 @@
+<pre <u
diff --git a/regression_testing/cases/github-expects/case-1008.html b/regression_testing/cases/github-expects/case-1008.html
new file mode 100644
index 000000000..c1956a414
--- /dev/null
+++ b/regression_testing/cases/github-expects/case-1008.html
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title></title>
+</head>
+<body>
+</body>
+</html>
diff --git a/regression_testing/cases/github-expects/case-1008.txt b/regression_testing/cases/github-expects/case-1008.txt
new file mode 100644
index 000000000..0d7ddfd9e
--- /dev/null
+++ b/regression_testing/cases/github-expects/case-1008.txt
@@ -0,0 +1,26 @@
+line 1 column 1 - Warning: <pre> missing '>' for end of tag
+line 1 column 1 - Warning: missing <!DOCTYPE> declaration
+line 1 column 1 - Warning: inserting implicit <body>
+line 1 column 6 - Warning: missing </u>
+line 1 column 1 - Warning: missing </pre>
+line 1 column 1 - Warning: inserting missing 'title' element
+line 1 column 6 - Warning: trimming empty <u>
+line 1 column 1 - Warning: trimming empty <pre>
+Info: Document content looks like HTML5
+Tidy found 8 warnings and 0 errors!
+
+One or more empty elements were present in the source document but
+dropped on output. If these elements are necessary or you don't want
+this behavior, then consider setting the option "drop-empty-elements"
+to no.
+
+About HTML Tidy: https://github.com/htacg/tidy-html5
+Bug reports and comments: https://github.com/htacg/tidy-html5/issues
+Official mailing list: https://lists.w3.org/Archives/Public/public-htacg/
+Latest HTML specification: https://html.spec.whatwg.org/multipage/
+Validate your HTML documents: https://validator.w3.org/nu/
+Lobby your company to join the W3C: https://www.w3.org/Consortium
+
+Do you speak a language other than English, or a different variant of 
+English? Consider helping us to localize HTML Tidy. For details please see 
+https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md
diff --git a/src/parser.c b/src/parser.c
index fd0661133..3b77753d8 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -4458,9 +4458,10 @@ Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode)
             DEBUG_LOG_EXIT;
             return NULL;
         }
+
+        TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
     }
 
-    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
 
     while ( state != STATE_COMPLETE )
     {

From 216d8c3031db15b47d672280a5b2e562cb826932 Mon Sep 17 00:00:00 2001
From: Alex Rebert <alexandre.rebert@gmail.com>
Date: Wed, 27 Oct 2021 14:48:58 -0400
Subject: [PATCH 3/4] Fix out-of-bounds write in NormalizeSpaces

NormalizeSpaces decodes and re-encodes UTF-8 characters while looking to
replace non-breaking spaces with regular spaces. When the UTF-8 decoding
hits an error, a replacement character (0xFFFD) is returned and
re-encoded as a 3-byte UTF-8 character. In some cases, this increases
the size of strings, leading to writing past the end of the allocated
buffer.

Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13191.
---
 src/clean.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/clean.c b/src/clean.c
index e0cd3baae..48e3a527a 100644
--- a/src/clean.c
+++ b/src/clean.c
@@ -1824,13 +1824,23 @@ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
                 c = (byte) lexer->lexbuf[i];
 
                 /* look for UTF-8 multibyte character */
+                int bytes = 0;
                 if ( c > 0x7F )
-                    i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
+                    bytes = TY_(GetUTF8)( lexer->lexbuf + i, &c );
 
                 if ( c == 160 )
                     c = ' ';
 
-                p = TY_(PutUTF8)(p, c);
+                /* don't copy replacement char on invalid UTF-8, as it might */
+                /* be larger than original char and overflow the buffer */
+                if(bytes > 0) {
+                    p = TY_(PutUTF8)(p, c);
+                } else {
+                    *p = lexer->lexbuf[i];
+                    p++;
+                }
+
+                i += bytes;
             }
             node->end = p - lexer->lexbuf;
         }

From 5f3eb6605ad343049d96d38ec4f2ccb4688bcfec Mon Sep 17 00:00:00 2001
From: Alex Rebert <alexandre.rebert@gmail.com>
Date: Tue, 1 Feb 2022 20:20:34 -0500
Subject: [PATCH 4/4] mapped_io: fix mapped_eof on windows

The mmapped IO implementation on Windows does not always increment `pos`
on each `getByte`. It relies on an incrementing pointer `iter`, and
updates `pos` only when a new chunk gets mapped.

However, `mapped_eof` was only considering `pos` and not `iter`, and
therefore was sometimes incorrectly returning false until a later call
to `getByte` would update `pos`.
---
 src/mappedio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mappedio.c b/src/mappedio.c
index c2c403d9a..c29d354a0 100644
--- a/src/mappedio.c
+++ b/src/mappedio.c
@@ -156,7 +156,7 @@ static int TIDY_CALL mapped_getByte( void *sourceData )
 static Bool TIDY_CALL mapped_eof( void *sourceData )
 {
     MappedFileSource *data = sourceData;
-    return ( data->pos >= data->size );
+    return ( data->pos + (data->iter - data->view) >= data->size );
 }
 
 static void TIDY_CALL mapped_ungetByte( void *sourceData, byte ARG_UNUSED(bt) )