Skip to content

Commit

Permalink
Fix #77 Support for UTF-8 string literals (#260)
Browse files Browse the repository at this point in the history
  • Loading branch information
yoyocat authored Aug 2, 2024
1 parent f962058 commit 381cd65
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 36 deletions.
7 changes: 6 additions & 1 deletion lib/util/Strings.v3
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,18 @@ component Strings {
var i = pos + 1, buf = StringBuilder.new();
while (i < max) {
var ch = a[i];
if (ch < ' ' || ch > 127) return (pos - i, null);
if (ch < ' ') return (pos - i, null);
if (ch == '\"') return (1 + i - pos, buf.toString());
if (ch == '\\') {
var p = Chars.parseEscape(a, i + 1);
if (p.0 <= 0) return (pos - i + p.0, null);
else buf.putc(p.1);
i = i + p.0;
} else if (ch > 127) {
var p = Utf8.decode1(a[i ... max]);
if (p.0 <= 0) return (pos - i, null);
else buf.putr(a[i ..+ p.0]);
i += p.0 - 1;
} else {
buf.putc(ch);
}
Expand Down
80 changes: 45 additions & 35 deletions lib/util/Utf8.v3
Original file line number Diff line number Diff line change
Expand Up @@ -46,47 +46,57 @@ component Utf8 {
if (codepoint >= 0xD800 && codepoint <= 0xDFFF) return false; // UTF-16 surrogates
return true;
}
// Decode the single Unicode codepoint encoded at the start of a UTF-8 byte range.
// Returns a pair of the status and the value. On success the status is the number
// of bytes consumed and the value is the codepoint. On failure the status is <= 0,
// its magnitude is the number of bytes that had been read when the error was
// detected, and the value is 0.
def decode1(str: Range<byte>) -> (/*status:*/int, /*value:*/u32) {
	var end = str.length;
	// An empty range has no lead byte to read; report failure after 0 bytes
	// rather than indexing out of bounds.
	if (end == 0) return (0, 0);
	var i = 0;
	var b = str[i++], codepoint: u32 = b;
	if ((b & 0x80) == 0) {
		// High bit clear: a one-byte (ASCII) character.
		return (i, codepoint);
	}

	// Compute how many continuation bytes there are, and the smallest codepoint
	// that legitimately needs a sequence of that length.
	var extra = 0, min = 0u;
	if ((b & 0b11100000u) == 0b11000000u) {
		extra = 1;
		min = 0x80u;
	} else if ((b & 0b11110000u) == 0b11100000u) {
		extra = 2;
		min = 0x800u;
	} else if ((b & 0b11111000u) == 0b11110000u) {
		extra = 3;
		min = 0x10000u;
	} else {
		return (0 - i, 0); // completely invalid character
	}

	// Check the continuation bytes and compute the codepoint.
	var next = i + extra;
	// Keep only the payload bits of the lead byte (the low {6 - extra} bits).
	codepoint = codepoint & ~(0xFFFFFFFFu << u5.view(6 - extra));
	// The sequence is truncated if the range ends before its last continuation byte.
	if (next > end) return (0 - i, 0);
	while (i < next) {
		var c = str[i++];
		// Every continuation byte must have the form 10xxxxxx.
		if ((c & 0b11000000u) != 0b10000000u) return (0 - i, 0);
		codepoint = codepoint << 6 | (c & 0b00111111u);
	}
	// Check for invalid codepoints.
	if (!isValidCodepoint(codepoint)) return (0 - i, 0);
	// Check for overlong character.
	if (codepoint < min) return (0 - i, 0);
	return (i, codepoint);
}
// Decode a UTF-8 sequence of bytes in {str[start ... end]} to Unicode codepoints.
// Applies the {f} function to each codepoint in order. Returns {-1} if the whole
// span decoded successfully. Otherwise decoding stops at the first malformed
// character and the result is a byte offset into {str}: the offset just past the
// last byte that had been read when the error was detected.
def decodeUnicode(str: Array<byte>, start: int, end: int, f: u32 -> void) -> int {
	var i = start;
	while (i < end) {
		// The loop condition guarantees the range handed to {decode1} is non-empty.
		var p = decode1(str[i ... end]);
		// On failure {p.0} is minus the number of bytes read, so {i - p.0} is the
		// absolute offset one past the last byte examined.
		if (p.0 <= 0) return i - p.0;
		f(p.1);
		// On success {p.0} is the number of bytes the character occupied.
		i += p.0;
	}
	return -1;
}
Expand Down
6 changes: 6 additions & 0 deletions test/core/parser/str03.v3
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
//@parse
// Parser test fixture: field initializers whose string literals contain
// non-ASCII characters written directly in the source text.
class A {
// ASCII dashes surrounding CJK characters and fullwidth punctuation.
var m: string = "--你好,世界!--";
// A single emoji character.
var n: string = "😀";
// Several emoji separated by ASCII spaces.
var o: string = "👍 💯 🥇";
}

0 comments on commit 381cd65

Please sign in to comment.