From 1cbe050eea371cb45bd2c54ea6df5c9b163fbc5c Mon Sep 17 00:00:00 2001
From: Robin Leroy <eggrobin@unicode.org>
Date: Wed, 5 Jun 2024 05:15:23 +0200
Subject: [PATCH] HST invariants (#850)

Fix #848
---
 .../org/unicode/text/UCD/UnicodeInvariantTest.txt  | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
index 3eb759238..3b3f3c35a 100644
--- a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
+++ b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
@@ -863,6 +863,20 @@ Let $TwoVietnameseReadingMarks = [\p{U15.1.0:ccc=6}]
 # an LV or V, respectively.
 [\p{NFC_QC=Maybe}&\p{ccc=0}] ⊆ [\p{GCB=Extend}\p{GCB=T}\p{GCB=V}]
 
+# ICU relies on the following invariant to avoid carrying data for HST, which
+# would be mostly redundant with GCB.  If it breaks, that should be noted on
+# the landing page, and ICU-TC should be notified.
+# See https://github.com/unicode-org/icu/pull/3026.
+\p{HST=V} = [\p{GCB=V} & [\u0000-\uFFFF]]
+# A more principled (if less practically useful) statement is that the
+# dual-conjoining Hangul characters are exactly the Hangul vowels.
+\p{HST=V} = [\p{GCB=V} & \p{Script=Hangul}]
+# The other types are still exactly equal to their GCB counterparts.
+\p{HST=L}   = \p{GCB=L}
+\p{HST=LV}  = \p{GCB=LV}
+\p{HST=LVT} = \p{GCB=LVT}
+\p{HST=T}   = \p{GCB=T}
+
 ##########################
 # Emoji
 ##########################