Skip to content

Commit

Permalink
Merge pull request #39 from project-tsurugi/feat-truncate-utf8
Browse files Browse the repository at this point in the history
feat: introduce `takatori::value::truncate_utf8`.
  • Loading branch information
kuron99 authored Aug 27, 2024
2 parents 89801e4 + a876b41 commit f2c7268
Show file tree
Hide file tree
Showing 4 changed files with 290 additions and 0 deletions.
23 changes: 23 additions & 0 deletions include/takatori/value/character.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,29 @@ bool operator!=(character const& a, character const& b) noexcept;
*/
std::ostream& operator<<(std::ostream& out, character const& value);

/**
* @brief truncates the given UTF-8 encoded string to at most the specified size in bytes.
* @details
* If the input is a well-formed UTF-8 string, the result is also well-formed:
* the cut never falls in the middle of a multi-byte UTF-8 sequence. When the
* sequence crossing the limit would be split, that whole sequence is dropped,
* so the result may be shorter than the specified size.
*
* The input is not validated. Only the bit patterns of the (up to three) bytes
* just before the cut position are inspected. If the input is not a well-formed
* UTF-8 string, the result is still no larger than the specified size, but it
* may be up to 3 bytes smaller and is not guaranteed to be well-formed.
*
* This function does not modify or copy the original string; the returned view
* refers to a prefix of the same character sequence as the input view.
*
* If the input string is shorter than or equal to the specified size, the
* function returns just the original string.
*
* @param str the input UTF-8 encoded string.
* @param size the maximum size in bytes.
* @return a view of the truncated string, which will be no larger than the specified size.
*/
std::string_view truncate_utf8(std::string_view str, std::size_t size);

/**
* @brief type_of for character.
*/
Expand Down
71 changes: 71 additions & 0 deletions src/takatori/value/character.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,75 @@ std::ostream& operator<<(std::ostream& out, character const& value) {
return out << "character(" << value.get() << ")";
}

// Cuts `str` down to at most `size` bytes without leaving a partial UTF-8 sequence at the tail.
// Only the (up to) three bytes just before the cut position are examined; the remainder of the
// input is never validated.
std::string_view truncate_utf8(std::string_view str, std::size_t size) {
    // nothing to cut: the input already fits into the limit
    if (size >= str.size()) {
        return str;
    }

    constexpr unsigned char continuation_mask = 0b1100'0000;
    constexpr unsigned char continuation_bits = 0b1000'0000;

    // The head of a sequence that could be split by the cut lies at most three bytes
    // before the cut position (the longest sequence is four bytes), so looking further
    // back is never necessary. Never look back past the beginning of the string either.
    constexpr std::size_t max_lookback = 3;
    std::size_t const lookback = size < max_lookback ? size : max_lookback;

    for (std::size_t distance = 1; distance <= lookback; ++distance) {
        std::size_t const head_index = size - distance;
        auto const unit = static_cast<unsigned char>(str[head_index]);

        // 10xxxxxx: a trailing element of some sequence, keep walking backward
        if ((unit & continuation_mask) == continuation_bits) {
            continue;
        }

        // derive the sequence length announced by the lead byte
        std::size_t declared_length = 0;
        if ((unit & 0b1000'0000) == 0b0000'0000) {
            declared_length = 1; // 0xxxxxxx
        } else if ((unit & 0b1110'0000) == 0b1100'0000) {
            declared_length = 2; // 110xxxxx
        } else if ((unit & 0b1111'0000) == 0b1110'0000) {
            declared_length = 3; // 1110xxxx
        } else if ((unit & 0b1111'1000) == 0b1111'0000) {
            declared_length = 4; // 11110xxx
        } else {
            // Not a valid lead byte, so the input is malformed:
            // fall back to a plain cut at the requested size.
            break;
        }

        // `distance` is the number of bytes available from the head up to the cut position.
        if (declared_length <= distance) {
            // the announced sequence fits in front of the cut: keep it
            return str.substr(0, head_index + declared_length);
        }
        // the sequence would be split by the cut: drop it entirely
        return str.substr(0, head_index);
    }

    // Either every inspected byte was a trailing element (e.g. a 4-byte sequence ending exactly
    // at the cut position, or malformed input), the beginning of the string was reached, or an
    // invalid lead byte was met. In all of these cases the plain cut is the result.
    return str.substr(0, size);
}

} // namespace takatori::value
1 change: 1 addition & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ add_test_executable(takatori/value/octet_value_test.cpp)
add_test_executable(takatori/value/bit_value_test.cpp)
add_test_executable(takatori/value/extension_value_test.cpp)
add_test_executable(takatori/value/value_dispatch_test.cpp)
add_test_executable(takatori/value/truncate_utf8_test.cpp)

# decimal
add_test_executable(takatori/decimal/triple_test.cpp)
Expand Down
195 changes: 195 additions & 0 deletions test/takatori/value/truncate_utf8_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#include <takatori/value/character.h>

#include <gtest/gtest.h>

#include <string_view>

namespace takatori::value {

class truncate_utf8_test : public ::testing::Test {};

TEST_F(truncate_utf8_test, in_size) {
std::string_view input {
"01234",
};
auto r = truncate_utf8(input, 5);
EXPECT_EQ(r, input);
}

TEST_F(truncate_utf8_test, trim1_1b) {
std::string_view input {
"01234A",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 1));
}

TEST_F(truncate_utf8_test, trim1_2b_lo) {
std::string_view input {
"01234\xc2\x80",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 2));
}

TEST_F(truncate_utf8_test, trim1_2b_hi) {
std::string_view input {
"01234\xdf\xbf",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 2));
}

TEST_F(truncate_utf8_test, trim1_3b_lo) {
std::string_view input {
"01234\xe0\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 3));
}

TEST_F(truncate_utf8_test, trim1_3b_hi) {
std::string_view input {
"01234\xef\xbf\xbf",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 3));
}

TEST_F(truncate_utf8_test, trim2_3b_lo) {
std::string_view input {
"01234\xe0\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 2);
EXPECT_EQ(r, input.substr(0, input.size() - 3));
}

TEST_F(truncate_utf8_test, trim2_3b_hi) {
std::string_view input {
"01234\xef\xbf\xbf",
};
auto r = truncate_utf8(input, input.size() - 2);
EXPECT_EQ(r, input.substr(0, input.size() - 3));
}

TEST_F(truncate_utf8_test, trim1_4b_lo) {
std::string_view input {
"01234\xf0\x80\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim1_4b_hi) {
std::string_view input {
"01234\xf4\xbf\xbf\xbf",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim2_4b_lo) {
std::string_view input {
"01234\xf0\x80\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 2);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim2_4b_hi) {
std::string_view input {
"01234\xf4\xbf\xbf\xbf",
};
auto r = truncate_utf8(input, input.size() - 2);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim3_4b_lo) {
std::string_view input {
"01234\xf0\x80\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 3);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim3_4b_hi) {
std::string_view input {
"01234\xf4\xbf\xbf\xbf",
};
auto r = truncate_utf8(input, input.size() - 3);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim1_broken_lo) {
std::string_view input {
"01234\x80\x80\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 1));
}

TEST_F(truncate_utf8_test, trim1_broken_hi) {
std::string_view input {
"01234\xff_",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 1));
}

TEST_F(truncate_utf8_test, trim2_broken_lo) {
std::string_view input {
"01234\x80\x80\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 2);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim2_broken_hi) {
std::string_view input {
"01234\xff__",
};
auto r = truncate_utf8(input, input.size() - 2);
EXPECT_EQ(r, input.substr(0, input.size() - 2));
}

TEST_F(truncate_utf8_test, trim3_broken_lo) {
std::string_view input {
"01234\x80\x80\x80\x80",
};
auto r = truncate_utf8(input, input.size() - 3);
EXPECT_EQ(r, input.substr(0, input.size() - 4));
}

TEST_F(truncate_utf8_test, trim3_broken_hi) {
std::string_view input {
"01234\xff___",
};
auto r = truncate_utf8(input, input.size() - 3);
EXPECT_EQ(r, input.substr(0, input.size() - 3));
}

TEST_F(truncate_utf8_test, trim1_shorten1b) {
std::string_view input {
"\x80_",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 1));
}

TEST_F(truncate_utf8_test, trim1_shorten2b) {
std::string_view input {
"\x80\x80_",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 1));
}

TEST_F(truncate_utf8_test, trim1_shorten3b) {
std::string_view input {
"\x80\x80\x80_",
};
auto r = truncate_utf8(input, input.size() - 1);
EXPECT_EQ(r, input.substr(0, input.size() - 1));
}

} // namespace takatori::value

0 comments on commit f2c7268

Please sign in to comment.