Skip to content

Commit

Permalink
Optimize String _find_upper and _find_lower by handling low-bit c…
Browse files Browse the repository at this point in the history
…haracters (including normal latin) explicitly.

Move `_find_upper` and `_find_lower` to non-generated `char_utils.h`.
  • Loading branch information
Ivorforce committed Jan 30, 2025
1 parent ee4acfb commit 97a14e5
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 214 deletions.
55 changes: 55 additions & 0 deletions core/string/char_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
#include "core/typedefs.h"

#include "char_range.inc"
#include "ucaps.h"

#include <array> // for std::size

#define BSEARCH_CHAR_RANGE(m_array) \
int low = 0; \
Expand Down Expand Up @@ -132,4 +135,56 @@ constexpr bool is_underscore(char32_t p_char) {
return (p_char == '_');
}

// Returns the simple uppercase mapping of `ch`, or `ch` unchanged when it has
// no uppercase form.
//
// Code points below 0x00FF are resolved with explicit checks, because
// `caps_table` only contains entries from 0x00FF upwards. Everything else is
// binary-searched in `caps_table`, whose rows are { lowercase, uppercase }
// pairs and are expected to be sorted by the first column.
static inline char32_t _find_upper(const char32_t ch) {
	if (ch < 0x00FF) {
		// Optimize common latin characters by skipping binary search.
		if (ch == 0x00B5) {
			// MICRO SIGN is the only low code point whose uppercase form
			// (GREEK CAPITAL LETTER MU) is not exactly 0x0020 below it.
			return 0x039C;
		}
		// 0x00E0..0x00FE are the Latin-1 lowercase letters, except 0x00F7
		// (DIVISION SIGN), which has no case and must stay untouched.
		const bool is_lower = (ch >= 'a' && ch <= 'z') || (ch >= 0x00E0 && ch <= 0x00FE && ch != 0x00F7);
		return is_lower ? (ch - 0x0020) : ch;
	} // else binary search

	int low = 0;
	int high = static_cast<int>(std::size(caps_table)) - 1;

	while (low <= high) {
		const int middle = low + (high - low) / 2;
		const char32_t key = caps_table[middle][0];

		if (ch < key) {
			high = middle - 1; // Search low end of array.
		} else if (key < ch) {
			low = middle + 1; // Search high end of array.
		} else {
			return caps_table[middle][1];
		}
	}

	// Not in the table: the character has no uppercase mapping.
	return ch;
}

// Returns the simple lowercase mapping of `ch`, or `ch` unchanged when it has
// no lowercase form.
//
// Code points below 0x0100 are resolved with explicit checks, because
// `reverse_caps_table` only contains entries from 0x0100 upwards. Everything
// else is binary-searched in `reverse_caps_table`, whose rows are
// { uppercase, lowercase } pairs and are expected to be sorted by the first
// column.
static inline char32_t _find_lower(const char32_t ch) {
	if (ch < 0x0100) {
		// Optimize common latin characters by skipping binary search.
		// 0x00C0..0x00DE are the Latin-1 uppercase letters, except 0x00D7
		// (MULTIPLICATION SIGN), which has no case and must stay untouched.
		const bool is_upper = (ch >= 'A' && ch <= 'Z') || (ch >= 0x00C0 && ch <= 0x00DE && ch != 0x00D7);
		return is_upper ? (ch + 0x0020) : ch;
	} // else binary search

	int low = 0;
	int high = static_cast<int>(std::size(reverse_caps_table)) - 1;

	while (low <= high) {
		const int middle = low + (high - low) / 2;
		const char32_t key = reverse_caps_table[middle][0];

		if (ch < key) {
			high = middle - 1; // Search low end of array.
		} else if (key < ch) {
			low = middle + 1; // Search high end of array.
		} else {
			return reverse_caps_table[middle][1];
		}
	}

	// Not in the table: the character has no lowercase mapping.
	return ch;
}

#endif // CHAR_UTILS_H
160 changes: 2 additions & 158 deletions core/string/ucaps.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,67 +33,7 @@

// This file was generated using the `misc/scripts/ucaps_fetch.py` script.

#define LTU_LEN 1477
#define UTL_LEN 1460

static const int caps_table[LTU_LEN][2] = {
{ 0x0061, 0x0041 },
{ 0x0062, 0x0042 },
{ 0x0063, 0x0043 },
{ 0x0064, 0x0044 },
{ 0x0065, 0x0045 },
{ 0x0066, 0x0046 },
{ 0x0067, 0x0047 },
{ 0x0068, 0x0048 },
{ 0x0069, 0x0049 },
{ 0x006A, 0x004A },
{ 0x006B, 0x004B },
{ 0x006C, 0x004C },
{ 0x006D, 0x004D },
{ 0x006E, 0x004E },
{ 0x006F, 0x004F },
{ 0x0070, 0x0050 },
{ 0x0071, 0x0051 },
{ 0x0072, 0x0052 },
{ 0x0073, 0x0053 },
{ 0x0074, 0x0054 },
{ 0x0075, 0x0055 },
{ 0x0076, 0x0056 },
{ 0x0077, 0x0057 },
{ 0x0078, 0x0058 },
{ 0x0079, 0x0059 },
{ 0x007A, 0x005A },
{ 0x00B5, 0x039C },
{ 0x00E0, 0x00C0 },
{ 0x00E1, 0x00C1 },
{ 0x00E2, 0x00C2 },
{ 0x00E3, 0x00C3 },
{ 0x00E4, 0x00C4 },
{ 0x00E5, 0x00C5 },
{ 0x00E6, 0x00C6 },
{ 0x00E7, 0x00C7 },
{ 0x00E8, 0x00C8 },
{ 0x00E9, 0x00C9 },
{ 0x00EA, 0x00CA },
{ 0x00EB, 0x00CB },
{ 0x00EC, 0x00CC },
{ 0x00ED, 0x00CD },
{ 0x00EE, 0x00CE },
{ 0x00EF, 0x00CF },
{ 0x00F0, 0x00D0 },
{ 0x00F1, 0x00D1 },
{ 0x00F2, 0x00D2 },
{ 0x00F3, 0x00D3 },
{ 0x00F4, 0x00D4 },
{ 0x00F5, 0x00D5 },
{ 0x00F6, 0x00D6 },
{ 0x00F8, 0x00D8 },
{ 0x00F9, 0x00D9 },
{ 0x00FA, 0x00DA },
{ 0x00FB, 0x00DB },
{ 0x00FC, 0x00DC },
{ 0x00FD, 0x00DD },
{ 0x00FE, 0x00DE },
static const char32_t caps_table[][2] = {
{ 0x00FF, 0x0178 },
{ 0x0101, 0x0100 },
{ 0x0103, 0x0102 },
Expand Down Expand Up @@ -1516,63 +1456,7 @@ static const int caps_table[LTU_LEN][2] = {
{ 0x1E943, 0x1E921 },
};

static const int reverse_caps_table[UTL_LEN][2] = {
{ 0x0041, 0x0061 },
{ 0x0042, 0x0062 },
{ 0x0043, 0x0063 },
{ 0x0044, 0x0064 },
{ 0x0045, 0x0065 },
{ 0x0046, 0x0066 },
{ 0x0047, 0x0067 },
{ 0x0048, 0x0068 },
{ 0x0049, 0x0069 },
{ 0x004A, 0x006A },
{ 0x004B, 0x006B },
{ 0x004C, 0x006C },
{ 0x004D, 0x006D },
{ 0x004E, 0x006E },
{ 0x004F, 0x006F },
{ 0x0050, 0x0070 },
{ 0x0051, 0x0071 },
{ 0x0052, 0x0072 },
{ 0x0053, 0x0073 },
{ 0x0054, 0x0074 },
{ 0x0055, 0x0075 },
{ 0x0056, 0x0076 },
{ 0x0057, 0x0077 },
{ 0x0058, 0x0078 },
{ 0x0059, 0x0079 },
{ 0x005A, 0x007A },
{ 0x00C0, 0x00E0 },
{ 0x00C1, 0x00E1 },
{ 0x00C2, 0x00E2 },
{ 0x00C3, 0x00E3 },
{ 0x00C4, 0x00E4 },
{ 0x00C5, 0x00E5 },
{ 0x00C6, 0x00E6 },
{ 0x00C7, 0x00E7 },
{ 0x00C8, 0x00E8 },
{ 0x00C9, 0x00E9 },
{ 0x00CA, 0x00EA },
{ 0x00CB, 0x00EB },
{ 0x00CC, 0x00EC },
{ 0x00CD, 0x00ED },
{ 0x00CE, 0x00EE },
{ 0x00CF, 0x00EF },
{ 0x00D0, 0x00F0 },
{ 0x00D1, 0x00F1 },
{ 0x00D2, 0x00F2 },
{ 0x00D3, 0x00F3 },
{ 0x00D4, 0x00F4 },
{ 0x00D5, 0x00F5 },
{ 0x00D6, 0x00F6 },
{ 0x00D8, 0x00F8 },
{ 0x00D9, 0x00F9 },
{ 0x00DA, 0x00FA },
{ 0x00DB, 0x00FB },
{ 0x00DC, 0x00FC },
{ 0x00DD, 0x00FD },
{ 0x00DE, 0x00FE },
static const char32_t reverse_caps_table[][2] = {
{ 0x0100, 0x0101 },
{ 0x0102, 0x0103 },
{ 0x0104, 0x0105 },
Expand Down Expand Up @@ -2979,44 +2863,4 @@ static const int reverse_caps_table[UTL_LEN][2] = {
{ 0x1E921, 0x1E943 },
};

// Binary-searches `caps_table` for `ch` in the first column and returns the
// paired value from the second column; returns `ch` itself when no row matches.
static int _find_upper(int ch) {
	int lo = 0;
	int hi = LTU_LEN - 1;

	while (lo <= hi) {
		const int mid = (lo + hi) / 2;
		const int key = caps_table[mid][0];

		if (key == ch) {
			return caps_table[mid][1];
		}

		if (ch < key) {
			hi = mid - 1; // Continue in the lower half.
		} else {
			lo = mid + 1; // Continue in the upper half.
		}
	}

	return ch;
}

// Binary-searches `reverse_caps_table` for `ch` in the first column and
// returns the paired value from the second column; returns `ch` itself when no
// row matches.
static int _find_lower(int ch) {
	int lo = 0;
	int hi = UTL_LEN - 1;

	while (lo <= hi) {
		const int mid = (lo + hi) / 2;
		const int key = reverse_caps_table[mid][0];

		if (key == ch) {
			return reverse_caps_table[mid][1];
		}

		if (ch < key) {
			hi = mid - 1; // Continue in the lower half.
		} else {
			lo = mid + 1; // Continue in the upper half.
		}
	}

	return ch;
}

#endif // UCAPS_H
74 changes: 18 additions & 56 deletions misc/scripts/ucaps_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,12 @@
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/UnicodeData.txt"


lower_to_upper: List[Tuple[str, str]] = []
upper_to_lower: List[Tuple[str, str]] = []


def parse_unicode_data() -> None:
def fetch_unicode_data() -> Tuple[List[Tuple[str, str]], List[Tuple[str, str]]]:
lines: List[str] = [line.decode("utf-8") for line in urlopen(URL)]

lower_to_upper = []
upper_to_lower = []

for line in lines:
split_line: List[str] = line.split(";")

Expand All @@ -37,11 +36,15 @@ def parse_unicode_data() -> None:
if lowercase_mapping:
upper_to_lower.append((f"0x{code_value}", f"0x{lowercase_mapping}"))

return lower_to_upper, upper_to_lower

def make_cap_table(table_name: str, len_name: str, table: List[Tuple[str, str]]) -> str:
result: str = f"static const int {table_name}[{len_name}][2] = {{\n"

def make_cap_table(table_name: str, table: List[Tuple[str, str]], starting_from: int) -> str:
result: str = f"static const char32_t {table_name}[][2] = {{\n"

for first, second in table:
if int(first, 16) < starting_from:
continue
result += f"\t{{ {first}, {second} }},\n"

result += "};\n\n"
Expand All @@ -50,65 +53,24 @@ def make_cap_table(table_name: str, len_name: str, table: List[Tuple[str, str]])


def generate_ucaps_fetch() -> None:
parse_unicode_data()
lower_to_upper, upper_to_lower = fetch_unicode_data()

source: str = generate_copyright_header("ucaps.h")

source += f"""
source += """
#ifndef UCAPS_H
#define UCAPS_H
// This file was generated using the `misc/scripts/ucaps_fetch.py` script.
#define LTU_LEN {len(lower_to_upper)}
#define UTL_LEN {len(upper_to_lower)}\n\n"""

source += make_cap_table("caps_table", "LTU_LEN", lower_to_upper)
source += make_cap_table("reverse_caps_table", "UTL_LEN", upper_to_lower)

source += """static int _find_upper(int ch) {
\tint low = 0;
\tint high = LTU_LEN - 1;
\tint middle;
\twhile (low <= high) {
\t\tmiddle = (low + high) / 2;
\t\tif (ch < caps_table[middle][0]) {
\t\t\thigh = middle - 1; // Search low end of array.
\t\t} else if (caps_table[middle][0] < ch) {
\t\t\tlow = middle + 1; // Search high end of array.
\t\t} else {
\t\t\treturn caps_table[middle][1];
\t\t}
\t}
\treturn ch;
}
static int _find_lower(int ch) {
\tint low = 0;
\tint high = UTL_LEN - 1;
\tint middle;
\twhile (low <= high) {
\t\tmiddle = (low + high) / 2;
\t\tif (ch < reverse_caps_table[middle][0]) {
\t\t\thigh = middle - 1; // Search low end of array.
\t\t} else if (reverse_caps_table[middle][0] < ch) {
\t\t\tlow = middle + 1; // Search high end of array.
\t\t} else {
\t\t\treturn reverse_caps_table[middle][1];
\t\t}
\t}
\treturn ch;
}
#endif // UCAPS_H
"""

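# Entries below `starting_from` are left out of the generated tables: those low code points are handled by explicit range checks in `_find_upper` / `_find_lower` (core/string/char_utils.h).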
source += make_cap_table("caps_table", lower_to_upper, starting_from=0x00FF)
source += make_cap_table("reverse_caps_table", upper_to_lower, starting_from=0x0100)

source += "#endif // UCAPS_H\n"

ucaps_path: str = os.path.join(os.path.dirname(__file__), "../../core/string/ucaps.h")
with open(ucaps_path, "w", newline="\n") as f:
f.write(source)
Expand Down

0 comments on commit 97a14e5

Please sign in to comment.