Fix #77 Support for UTF-8 string literals (#260)

titzer · Aug 2, 2024 · 381cd65 · 381cd65
1 parent f962058
commit 381cd65
Show file tree

Hide file tree

Showing 3 changed files with 57 additions and 36 deletions.
diff --git a/lib/util/Strings.v3 b/lib/util/Strings.v3
@@ -135,13 +135,18 @@ component Strings {
 		var i = pos + 1, buf = StringBuilder.new();
 		while (i < max) {
 			var ch = a[i];
-			if (ch < ' ' || ch > 127) return (pos - i, null);
+			if (ch < ' ') return (pos - i, null);
 			if (ch == '\"') return (1 + i - pos, buf.toString());
 			if (ch == '\\') {
 				var p = Chars.parseEscape(a, i + 1);
 				if (p.0 <= 0) return (pos - i + p.0, null);
 				else buf.putc(p.1);
 				i = i + p.0;
+			} else if (ch > 127) {
+				var p = Utf8.decode1(a[i ... max]);
+				if (p.0 <= 0) return (pos - i, null);
+				else buf.putr(a[i ..+ p.0]);
+				i += p.0 - 1;
 			} else {
 				buf.putc(ch);
 			}

diff --git a/lib/util/Utf8.v3 b/lib/util/Utf8.v3
@@ -46,47 +46,57 @@ component Utf8 {
 		if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return false; // UTF-16 surrogates
 		return true;
 	}
+	// Decode one Unicode codepoint from the start of the UTF-8 byte range {str}. Returns a
+	// pair of the status (# of bytes read if successful; on failure, the negated # of bytes
+	// read so far, which callers test as <= 0), and the value (0 on failure).
+	def decode1(str: Range<byte>) -> (/*status:*/int, /*value:*/u32) {
+		var i = 0;
+		var end = str.length;
+		var b = str[i++], codepoint: u32 = b; // NOTE(review): reads str[0] before any length check; the callers visible in this change only pass non-empty ranges -- confirm an empty range cannot reach here
+		if ((b & 0x80) == 0) { // high bit clear: the byte is a complete single-byte character
+			return (i, codepoint);
+		}
+
+		// Compute how many continuation bytes there are, and {min}, the lower bound used by the overlong check below.
+		var extra = 0, min = 0u;
+		if ((b & 0b11100000u) == 0b11000000u) { // lead byte 110xxxxx
+			extra = 1;
+			min = 0x80u;
+		} else if ((b & 0b11110000u) == 0b11100000u) { // lead byte 1110xxxx
+			extra = 2;
+			min = 0x800u;
+		} else if ((b & 0b11111000u) == 0b11110000u) { // lead byte 11110xxx
+			extra = 3;
+			min = 0x10000u;
+		} else {
+			return (0 - i, 0); // completely invalid character: not one of the lead-byte patterns above
+		}
+
+		// Check the continuation bytes and compute the codepoint.
+		var next = i + extra;
+		codepoint = codepoint & ~(0xFFFFFFFFu << u5.view(6 - extra)); // keep only the low (6 - extra) payload bits of the lead byte
+		if (next > end) return (0 - i, 0); // truncated: {str} is shorter than the lead byte announces
+		while (i < next) {
+			var b = str[i++];
+			if ((b & 0b11000000u) != 0b10000000u) return (0 - i, 0); // continuation bytes must match 10xxxxxx
+			codepoint = codepoint << 6 | (b & 0b00111111u); // append the 6 payload bits
+		}
+		// Check for invalid codepoints.
+		if (!isValidCodepoint(codepoint)) return (0 - i, 0);
+		// Check for overlong character.
+		if (codepoint < min) return (0 - i, 0);
+		return (i, codepoint); // {i} now equals 1 + extra, the full length of the sequence
+	}
 	// Decode a UTF-8 sequences of bytes to Unicode codepoints. Applies the {f} function for
 	// each codepoint. Returns {-1} if successful, or the byte offset of the invalid
 	// character otherwise.
 	def decodeUnicode(str: Array<byte>, start: int, end: int, f: u32 -> void) -> int {
 		var i = start;
 		while (i < end) {
-			var b = str[i++], codepoint: u32 = b;
-			if ((b & 0x80) == 0) {
-				f(codepoint);
-				continue;
-			}
-
-			// Compute how many continuation bytes there are.
-			var extra = 0, min = 0u;
-			if ((b & 0b11100000u) == 0b11000000u) {
-				extra = 1;
-				min = 0x80u;
-			} else if ((b & 0b11110000u) == 0b11100000u) {
-				extra = 2;
-				min = 0x800u;
-			} else if ((b & 0b11111000u) == 0b11110000u) {
-				extra = 3;
-				min = 0x10000u;
-			} else {
-				return i; // completely invalid character
-			}
-
-			// Check the continuation bytes and compute the codepoint.
-			var next = i + extra;
-			codepoint = codepoint & ~(0xFFFFFFFFu << u5.view(6 - extra));
-			if (next > end) return i;
-			while (i < next) {
-				var b = str[i++];
-				if ((b & 0b11000000u) != 0b10000000u) return i;
-				codepoint = codepoint << 6 | (b & 0b00111111u);
-			}
-			// Check for invalid codepoints.
-			if (!isValidCodepoint(codepoint)) return i;
-			// Check for overlong character.
-			if (codepoint < min) return i;
-			f(codepoint);
+			var p = decode1(str[i ... end]); // decode a single codepoint from the bytes in [i, end)
+			if (p.0 <= 0) return i - p.0; // failure status is the negated count of bytes decode1 read, so this yields an absolute offset into {str}
+			f(p.1);
+			i += p.0; // advance past the bytes that were just decoded
 		}
 		return -1;
 	}

diff --git a/test/core/parser/str03.v3 b/test/core/parser/str03.v3
@@ -0,0 +1,6 @@
+//@parse
+class A {
+	var m: string = "--你好，世界！--"; // non-ASCII (CJK) text between ASCII dashes
+	var n: string = "😀"; // a literal consisting of a single emoji
+	var o: string = "👍 💯 🥇"; // several emoji separated by ASCII spaces
+}