Fix line numbers for invariant test failures

unicode-org · eggrobin · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024
commit 128ce0abe15932cd6ef70e2163fd0304593734be
diff --git a/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java b/unicodetools/src/main/java/org/unicode/text/UCD/TestUnicodeInvariants.java
@@ -184,7 +184,7 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                 }
                 final var noComments = new StringBuilder();
                 final List<String> lines = new ArrayList<>();
-                final List<Integer> lineBeginnings = new ArrayList();
+                final List<Integer> lineBeginnings = new ArrayList<>();
                 try (final BufferedReader in = getInputReader(inputFile)) {
                     in.lines()
                             .forEach(
@@ -234,7 +234,24 @@ public static int testInvariants(String inputFile, String suffix, boolean doRang
                         position -> {
                             for (int i = 0; i < lineBeginnings.size(); ++i) {
                                 if (lineBeginnings.get(i) > position.getIndex()) {
-                                    return i; // 1-based line number.
+                                    // The error is before the beginning of line i (0-based), thus
+                                    // on line i (1-based).
+                                    return i;
+                                } else if (lineBeginnings.get(i) == position.getIndex()) {
+                                    // The position is at a beginning of line; this happens when a
+                                    // statement has been successfully parsed, but then fails for
+                                    // nonsyntactic reasons.
+                                    // The parse position is then the beginning of the next
+                                    // statement.
+                                    // Backtrack to the last nonempty line (ignoring comments),
+                                    // which is the last line of the failing statement.
+                                    int indexInTrimmedSource = position.getIndex();
+                                    while (lineBeginnings.get(i) == indexInTrimmedSource
+                                            && indexInTrimmedSource > 0) {
+                                        --indexInTrimmedSource;
+                                        --i;
+                                    }
+                                    return i + 1;
                                 }
                             }
                             return lineBeginnings.size();

diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt
 # https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type

 # “Unassigned characters, private use characters, surrogates, non-whitespace control characters.”
 \p{Identifier_Type=Not_Character} = [\p{gc=Cn}\p{gc=Co}\p{gc=Cs}\p{gc=Cc}-\p{White_Space}]

 # “Multiple values are not assigned to characters with strong restrictions:
 # Not_Character, Deprecated, Default_Ignorable, Not_NFKC.”
diff --git a/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt b/unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt
 ## Modifier_Combining_Mark stability.  Not yet testable, since MCM does not exist before 16.0.
 # NOTICE TO THE MAINTAINER: The following line is intended to fail after 16.0;
 # When it does, remove it and uncomment the line after it.
 \p{gc=Cn} = \p{U16:gc=Cn}  # ☚ REMOVE ME WHEN I FAIL…
 # … AND UNCOMMENT ME. ☛ # In \P{U-1:gc=Cn}, Modifier_Combining_Mark = U-1:Modifier_Combining_Mark

 ##########################
 Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/}

 # See https://www.unicode.org/review/pri502/feedback.html#ID20240523095709.
 $cjkStrokes ⊆ \p{scx=Hani}

 # The Equivalent_Unified_Ideograph property is applicable to CJK strokes and radicals.
 # Its value is a single unified ideograph.
 [\p{gc=Lo}&\P{InPC=NA}&\P{InPC=/(Left|Right)/}] ⊆ [\p{InSC=Consonant_Preceding_Repha}\p{InSC=Consonant_Prefixed}]

 # Script Extensions (mostly testing the proper handling of multivalued properties).
 \p{sc=Deva} ⊂ \p{scx=Deva}
 [\p{scx=Deva} & \p{scx=Beng}] ⊃ []

 # DoNotEmit.txt.
diff --git a/unicodetools/data/ucd/dev/DerivedAge.txt b/unicodetools/data/ucd/dev/DerivedAge.txt
 # DerivedAge-17.0.0.txt
 # Date: 2024-09-16, 15:55:43 GMT
 # © 2024 Unicode®, Inc.
 # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
	# https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type

	# “Unassigned characters, private use characters, surrogates, non-whitespace control characters.”
	\p{Identifier_Type=Not_Character} = [\p{gc=Cn}\p{gc=Co}\p{gc=Cs}\p{gc=Cc}-\p{White_Space}]
Check notice on line 9 in unicodetools/src/main/resources/org/unicode/text/UCD/SecurityInvariantTest.txt GitHub Actions / Check security data invariants Invariant test failure `Expected empty, got: 311 [\U00012550-\U00012686] In \p{Identifier_Type=Not_Character} But Not In [\p{gc=Cn}\p{gc=Co}\p{gc=Cs}\p{gc=Cc}-\p{White_Space}] 12550..12686 # [311] (\U00012550..\U00012686) CUNEIFORM NUMERIC SIGN ONE N01..CUNEIFORM NUMERIC SIGN ONE N36 FLAT`

	# “Multiple values are not assigned to characters with strong restrictions:
	# Not_Character, Deprecated, Default_Ignorable, Not_NFKC.”
	## Modifier_Combining_Mark stability. Not yet testable, since MCM does not exist before 16.0.
	# NOTICE TO THE MAINTAINER: The following line is intended to fail after 16.0;
	# When it does, remove it and uncomment the line after it.
	\p{gc=Cn} = \p{U16:gc=Cn} # ☚ REMOVE ME WHEN I FAIL…
Check failure on line 752 in unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt GitHub Actions / Check UCD consistency, invariants, smoke-test generators Invariant test failure `Expected empty, got: 311 [\U00012550-\U00012686] In \p{U16:gc=Cn} But Not In \p{gc=Cn} 12550..12686 # [311] (\U00012550..\U00012686) CUNEIFORM NUMERIC SIGN ONE N01..CUNEIFORM NUMERIC SIGN ONE N36 FLAT`
	# … AND UNCOMMENT ME. ☛ # In \P{U-1:gc=Cn}, Modifier_Combining_Mark = U-1:Modifier_Combining_Mark

	##########################
	Let $nonIdeographicStrokes := \p{Name=/^CJK STROKE (T|WG|XG|BXG|SW|HZZ|HP|HZWG|SZWG|HZT|HZZP|HPWG|HZW|HZZZ|PG|Q|HXG|SZP)$/}

	# See https://www.unicode.org/review/pri502/feedback.html#ID20240523095709.
	$cjkStrokes ⊆ \p{scx=Hani}
Check failure on line 1261 in unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt GitHub Actions / Check UCD consistency, invariants, smoke-test generators Parse error `Internal error: Script_Extensions (Catalog) doesn't contain Pcun: [] $cjkStrokes ⊆ ☜\p{scx=Hani}`

	# The Equivalent_Unified_Ideograph property is applicable to CJK strokes and radicals.
	# Its value is a single unified ideograph.
	[\p{gc=Lo}&\P{InPC=NA}&\P{InPC=/(Left|Right)/}] ⊆ [\p{InSC=Consonant_Preceding_Repha}\p{InSC=Consonant_Prefixed}]

	# Script Extensions (mostly testing the proper handling of multivalued properties).
	\p{sc=Deva} ⊂ \p{scx=Deva}
Check failure on line 1322 in unicodetools/src/main/resources/org/unicode/text/UCD/UnicodeInvariantTest.txt GitHub Actions / Check UCD consistency, invariants, smoke-test generators Parse error `Internal error: Script_Extensions (Catalog) doesn't contain Pcun: [] \p{sc=Deva} ⊂ ☜\p{scx=Deva}`
	[\p{scx=Deva} & \p{scx=Beng}] ⊃ []

	# DoNotEmit.txt.
	# DerivedAge-17.0.0.txt
Check warning on line 1 in unicodetools/data/ucd/dev/DerivedAge.txt GitHub Actions / Draft unless approved Not in the 17.0 pipeline `These characters are neither accepted for Unicode 17.0, nor for any specific version of Unicode, nor are they provisionally assigned. The Age property values for new characters are likely incorrect right now. They will be recomputed after the UTC accepts their encoding and this pull request is updated for the target version.`
	# Date: 2024-09-16, 15:55:43 GMT
	# © 2024 Unicode®, Inc.
	# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.