Merge pull request #39 from project-tsurugi/feat-truncate-utf8

feat: introduce `takatori::value::truncate_utf8`.
project-tsurugi · Aug 27, 2024 · f2c7268 · f2c7268
2 parents 89801e4 + a876b41
commit f2c7268
Show file tree

Hide file tree

Showing 4 changed files with 290 additions and 0 deletions.
diff --git a/include/takatori/value/character.h b/include/takatori/value/character.h
@@ -95,6 +95,29 @@ bool operator!=(character const& a, character const& b) noexcept;
  */
 std::ostream& operator<<(std::ostream& out, character const& value);
 
+/**
+ * @brief truncates the given UTF-8 encoded string to the specified size in bytes.
+ * @details
+ *      The returned string is a well-formed UTF-8 sequence whenever the input
+ *      string is well-formed. The input is truncated so that its byte size does
+ *      not exceed the specified limit, without splitting a multi-byte UTF-8
+ *      character (so the result may be up to 3 bytes smaller than the limit).
+ *
+ *      If the input is not a well-formed UTF-8 string, the function still truncates
+ *      it to at most the specified size (again, possibly up to 3 bytes smaller).
+ *
+ *      This function does not modify the original string and returns a view into the
+ *      leading portion of it.
+ *
+ *      If the size of the input string is less than or equal to the specified size,
+ *      the function returns the original string as is.
+ *
+ * @param str the input UTF-8 encoded string.
+ * @param size the maximum size in bytes.
+ * @return a view of the truncated string, which will be no larger than the specified size.
+ */
+std::string_view truncate_utf8(std::string_view str, std::size_t size);
+
 /**
  * @brief type_of for character.
  */

diff --git a/src/takatori/value/character.cpp b/src/takatori/value/character.cpp
@@ -50,4 +50,75 @@ std::ostream& operator<<(std::ostream& out, character const& value) {
     return out << "character(" << value.get() << ")";
 }
 
+std::string_view truncate_utf8(std::string_view str, std::size_t size) { // cuts str to at most `size` bytes, avoiding a cut inside a UTF-8 sequence
+    if (str.size() <= size) { // already fits: nothing to cut
+        return str;
+    }
+
+    constexpr unsigned char mask1 = 0b1000'0000; // keeps the top 1 bit
+    constexpr unsigned char mask2 = 0b1100'0000; // keeps the top 2 bits
+    constexpr unsigned char mask3 = 0b1110'0000; // keeps the top 3 bits
+    constexpr unsigned char mask4 = 0b1111'0000; // keeps the top 4 bits
+    constexpr unsigned char mask5 = 0b1111'1000; // keeps the top 5 bits
+
+    constexpr unsigned char body2 = 0b1000'0000; // 10xxxxxx under mask2: continuation byte
+    constexpr unsigned char head1_1 = 0b0000'0000; // 0xxxxxxx under mask1: 1-byte sequence
+    constexpr unsigned char head3_2 = 0b1100'0000; // 110xxxxx under mask3: head of a 2-byte sequence
+    constexpr unsigned char head4_3 = 0b1110'0000; // 1110xxxx under mask4: head of a 3-byte sequence
+    constexpr unsigned char head5_4 = 0b1111'0000; // 11110xxx under mask5: head of a 4-byte sequence
+
+    constexpr std::size_t max_sequence_size = 4;
+
+    // scan backward from the cut point for the head byte of the last UTF-8 sequence.
+    // 3 steps suffice: a 4-byte sequence whose head is 4 bytes back ends exactly at the cut point.
+    for (std::size_t backward_offset = 1; backward_offset <= max_sequence_size - 1; ++backward_offset) {
+        if (backward_offset > size) {
+            // reached the beginning of the string without finding a head byte
+            break;
+        }
+        auto position = size - backward_offset;
+        auto byte = static_cast<unsigned char>(str[position]);
+
+        if ((byte & mask2) == body2) {
+            // found a continuation byte (not a head), keep scanning backward.
+            continue;
+        }
+
+        if ((byte & mask1) == head1_1) {
+            // found a 1-byte UTF-8 sequence: keep it and drop any stray continuation bytes after it
+            return str.substr(0, position + 1);
+        }
+        if ((byte & mask3) == head3_2) {
+            // found the head of a 2-byte UTF-8 sequence
+            if (backward_offset >= 2) { // the whole sequence lies before the cut point
+                return str.substr(0, position + 2);
+            }
+            // the sequence crosses the cut point: drop it entirely
+            return str.substr(0, position);
+        }
+        if ((byte & mask4) == head4_3) {
+            // found the head of a 3-byte UTF-8 sequence
+            if (backward_offset >= 3) { // the whole sequence lies before the cut point
+                return str.substr(0, position + 3);
+            }
+            // the sequence crosses the cut point: drop it entirely
+            return str.substr(0, position);
+        }
+        if ((byte & mask5) == head5_4) {
+            // found the head of a 4-byte UTF-8 sequence at most 3 bytes before the cut point,
+            // so the sequence always crosses the cut point: drop it entirely
+            return str.substr(0, position);
+        }
+        // The byte matches none of the patterns above, i.e. it is in 0xF8..0xFF,
+        // which is neither a continuation byte nor a head byte of a 1- to 4-byte sequence.
+        // Hence the input string is not a well-formed UTF-8 string,
+        // and there is no sequence boundary to respect around this byte.
+        // We just trim to the originally specified size.
+        break;
+    }
+
+    // no head byte decided the cut (e.g. 3 continuation bytes: a 4-byte sequence ending exactly at the cut point, or broken input): cut at `size`
+    return str.substr(0, size);
+}
+
 } // namespace takatori::value
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -47,6 +47,7 @@ add_test_executable(takatori/value/octet_value_test.cpp)
 add_test_executable(takatori/value/bit_value_test.cpp)
 add_test_executable(takatori/value/extension_value_test.cpp)
 add_test_executable(takatori/value/value_dispatch_test.cpp)
+add_test_executable(takatori/value/truncate_utf8_test.cpp)
 
 # decimal
 add_test_executable(takatori/decimal/triple_test.cpp)

diff --git a/test/takatori/value/truncate_utf8_test.cpp b/test/takatori/value/truncate_utf8_test.cpp
@@ -0,0 +1,195 @@
+#include <takatori/value/character.h>
+
+#include <gtest/gtest.h>
+
+#include <string_view>
+
+namespace takatori::value {
+
+class truncate_utf8_test : public ::testing::Test {}; // empty fixture: the cases below share no state
+
+TEST_F(truncate_utf8_test, in_size) { // limit equals the input size: input is returned unchanged
+    std::string_view input {
+            "01234",
+    };
+    auto r = truncate_utf8(input, 5);
+    EXPECT_EQ(r, input);
+}
+
+TEST_F(truncate_utf8_test, trim1_1b) { // ASCII only, limit is 1 byte short: plain truncation
+    std::string_view input {
+            "01234A",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 1));
+}
+
+TEST_F(truncate_utf8_test, trim1_2b_lo) { // cut after the head of the 2-byte sequence C2 80: whole sequence dropped
+    std::string_view input {
+            "01234\xc2\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 2));
+}
+
+TEST_F(truncate_utf8_test, trim1_2b_hi) { // cut after the head of the 2-byte sequence DF BF: whole sequence dropped
+    std::string_view input {
+            "01234\xdf\xbf",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 2));
+}
+
+TEST_F(truncate_utf8_test, trim1_3b_lo) { // cut before the last byte of the 3-byte pattern E0 80 80: whole sequence dropped
+    std::string_view input {
+            "01234\xe0\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 3));
+}
+
+TEST_F(truncate_utf8_test, trim1_3b_hi) { // cut before the last byte of the 3-byte sequence EF BF BF: whole sequence dropped
+    std::string_view input {
+            "01234\xef\xbf\xbf",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 3));
+}
+
+TEST_F(truncate_utf8_test, trim2_3b_lo) { // cut right after the head of the 3-byte pattern E0 80 80: whole sequence dropped
+    std::string_view input {
+            "01234\xe0\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 2);
+    EXPECT_EQ(r, input.substr(0, input.size() - 3));
+}
+
+TEST_F(truncate_utf8_test, trim2_3b_hi) { // cut right after the head of the 3-byte sequence EF BF BF: whole sequence dropped
+    std::string_view input {
+            "01234\xef\xbf\xbf",
+    };
+    auto r = truncate_utf8(input, input.size() - 2);
+    EXPECT_EQ(r, input.substr(0, input.size() - 3));
+}
+
+TEST_F(truncate_utf8_test, trim1_4b_lo) { // cut before the last byte of the 4-byte pattern F0 80 80 80: whole sequence dropped
+    std::string_view input {
+            "01234\xf0\x80\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim1_4b_hi) { // cut before the last byte of the 4-byte pattern F4 BF BF BF: whole sequence dropped
+    std::string_view input {
+            "01234\xf4\xbf\xbf\xbf",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim2_4b_lo) { // cut in the middle of the 4-byte pattern F0 80 80 80: whole sequence dropped
+    std::string_view input {
+            "01234\xf0\x80\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 2);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim2_4b_hi) { // cut in the middle of the 4-byte pattern F4 BF BF BF: whole sequence dropped
+    std::string_view input {
+            "01234\xf4\xbf\xbf\xbf",
+    };
+    auto r = truncate_utf8(input, input.size() - 2);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim3_4b_lo) { // cut right after the head of the 4-byte pattern F0 80 80 80: whole sequence dropped
+    std::string_view input {
+            "01234\xf0\x80\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 3);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim3_4b_hi) { // cut right after the head of the 4-byte pattern F4 BF BF BF: whole sequence dropped
+    std::string_view input {
+            "01234\xf4\xbf\xbf\xbf",
+    };
+    auto r = truncate_utf8(input, input.size() - 3);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim1_broken_lo) { // 3 stray continuation bytes precede the cut, no head within 3 bytes: cut at the limit
+    std::string_view input {
+            "01234\x80\x80\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 1));
+}
+
+TEST_F(truncate_utf8_test, trim1_broken_hi) { // invalid byte FF right before the cut: cut at the limit
+    std::string_view input {
+            "01234\xff_",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 1));
+}
+
+TEST_F(truncate_utf8_test, trim2_broken_lo) { // 2 stray continuation bytes then ASCII '4' before the cut: result ends after the ASCII byte
+    std::string_view input {
+            "01234\x80\x80\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 2);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim2_broken_hi) { // invalid byte FF right before the cut: cut at the limit
+    std::string_view input {
+            "01234\xff__",
+    };
+    auto r = truncate_utf8(input, input.size() - 2);
+    EXPECT_EQ(r, input.substr(0, input.size() - 2));
+}
+
+TEST_F(truncate_utf8_test, trim3_broken_lo) { // 1 stray continuation byte then ASCII '4' before the cut: result ends after the ASCII byte
+    std::string_view input {
+            "01234\x80\x80\x80\x80",
+    };
+    auto r = truncate_utf8(input, input.size() - 3);
+    EXPECT_EQ(r, input.substr(0, input.size() - 4));
+}
+
+TEST_F(truncate_utf8_test, trim3_broken_hi) { // invalid byte FF right before the cut: cut at the limit
+    std::string_view input {
+            "01234\xff___",
+    };
+    auto r = truncate_utf8(input, input.size() - 3);
+    EXPECT_EQ(r, input.substr(0, input.size() - 3));
+}
+
+TEST_F(truncate_utf8_test, trim1_shorten1b) { // limit is 1: backward scan hits the start of the string after 1 byte, cut at the limit
+    std::string_view input {
+            "\x80_",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 1));
+}
+
+TEST_F(truncate_utf8_test, trim1_shorten2b) { // limit is 2: backward scan hits the start of the string after 2 bytes, cut at the limit
+    std::string_view input {
+            "\x80\x80_",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 1));
+}
+
+TEST_F(truncate_utf8_test, trim1_shorten3b) { // limit is 3: all 3 scanned bytes are continuation bytes, cut at the limit
+    std::string_view input {
+            "\x80\x80\x80_",
+    };
+    auto r = truncate_utf8(input, input.size() - 1);
+    EXPECT_EQ(r, input.substr(0, input.size() - 1));
+}
+
+} // namespace takatori::value