From b6f0f5fc2eb422d47546f388443c1725b85c728a Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Wed, 25 Dec 2024 19:32:13 +0800 Subject: [PATCH] Support removing invalid utf-8 sequences. (#1648) --- sherpa-onnx/csrc/CMakeLists.txt | 1 + sherpa-onnx/csrc/offline-recognizer-impl.cc | 2 + sherpa-onnx/csrc/online-recognizer-impl.cc | 2 + sherpa-onnx/csrc/text-utils-test.cc | 50 +++++++++ sherpa-onnx/csrc/text-utils.cc | 106 ++++++++++++++++++++ sherpa-onnx/csrc/text-utils.h | 3 + 6 files changed, 164 insertions(+) create mode 100644 sherpa-onnx/csrc/text-utils-test.cc diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt index 6bfcd2a98..3850c8eb3 100644 --- a/sherpa-onnx/csrc/CMakeLists.txt +++ b/sherpa-onnx/csrc/CMakeLists.txt @@ -545,6 +545,7 @@ if(SHERPA_ONNX_ENABLE_TESTS) pad-sequence-test.cc slice-test.cc stack-test.cc + text-utils-test.cc text2token-test.cc transpose-test.cc unbind-test.cc diff --git a/sherpa-onnx/csrc/offline-recognizer-impl.cc b/sherpa-onnx/csrc/offline-recognizer-impl.cc index b3789849c..1867bf39b 100644 --- a/sherpa-onnx/csrc/offline-recognizer-impl.cc +++ b/sherpa-onnx/csrc/offline-recognizer-impl.cc @@ -488,6 +488,8 @@ OfflineRecognizerImpl::OfflineRecognizerImpl( std::string OfflineRecognizerImpl::ApplyInverseTextNormalization( std::string text) const { + text = RemoveInvalidUtf8Sequences(text); + if (!itn_list_.empty()) { for (const auto &tn : itn_list_) { text = tn->Normalize(text); diff --git a/sherpa-onnx/csrc/online-recognizer-impl.cc b/sherpa-onnx/csrc/online-recognizer-impl.cc index 27168b0f6..652ed2110 100644 --- a/sherpa-onnx/csrc/online-recognizer-impl.cc +++ b/sherpa-onnx/csrc/online-recognizer-impl.cc @@ -194,6 +194,8 @@ OnlineRecognizerImpl::OnlineRecognizerImpl(Manager *mgr, std::string OnlineRecognizerImpl::ApplyInverseTextNormalization( std::string text) const { + text = RemoveInvalidUtf8Sequences(text); + if (!itn_list_.empty()) { for (const auto &tn : itn_list_) { text = tn->Normalize(text); 
diff --git a/sherpa-onnx/csrc/text-utils-test.cc b/sherpa-onnx/csrc/text-utils-test.cc new file mode 100644 index 000000000..15558f166 --- /dev/null +++ b/sherpa-onnx/csrc/text-utils-test.cc @@ -0,0 +1,50 @@ +// sherpa-onnx/csrc/text-utils-test.cc +// +// Copyright (c) 2024 Xiaomi Corporation + +#include "sherpa-onnx/csrc/text-utils.h" + +#include "gtest/gtest.h" + +namespace sherpa_onnx { + +TEST(RemoveInvalidUtf8Sequences, Case1) { + std::vector v = { + 0xe4, 0xbb, 0x8a, // 今 + 0xe5, 0xa4, 0xa9, // 天 + 'i', 's', ' ', 'M', 'o', 'd', 'a', 'y', ',', // is Monday, + ' ', 'w', 'i', 'e', ' ', 'h', 'e', 'i', 0xc3, // wie heißen Size + 0x9f, 'e', 'n', ' ', 'S', 'i', 'e', 0xf0, 0x9d, 0x84, 0x81}; + + std::vector v0 = v; + v0[1] = 0xc0; // make the first 3 bytes an invalid utf8 character + std::string s0{v0.begin(), v0.end()}; + EXPECT_EQ(s0.size(), v0.size()); + + auto s = RemoveInvalidUtf8Sequences(s0); // should remove 今 + + v0 = v; + // v0[23] == 0xc3 + // v0[24] == 0x9f + + v0[23] = 0xc1; + + s0 = {v0.begin(), v0.end()}; + s = RemoveInvalidUtf8Sequences(s0); // should remove ß + + EXPECT_EQ(s.size() + 2, v.size()); + + v0 = v; + // v0[31] = 0xf0; + // v0[32] = 0x9d; + // v0[33] = 0x84; + // v0[34] = 0x81; + v0[31] = 0xf5; + + s0 = {v0.begin(), v0.end()}; + s = RemoveInvalidUtf8Sequences(s0); + + EXPECT_EQ(s.size() + 4, v.size()); +} + +} // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/text-utils.cc b/sherpa-onnx/csrc/text-utils.cc index 3f12e1460..7259ed7c4 100644 --- a/sherpa-onnx/csrc/text-utils.cc +++ b/sherpa-onnx/csrc/text-utils.cc @@ -396,4 +396,110 @@ void ToLowerCase(std::string *in_out) { [](unsigned char c) { return std::tolower(c); }); } +static inline bool InRange(uint8_t x, uint8_t low, uint8_t high) { + return low <= x && x <= high; +} + +/* +Please see +https://stackoverflow.com/questions/6555015/check-for-invalid-utf8 + + +Table 3-7. 
/*
RemoveInvalidUtf8Sequences() copies `text` to the result, keeping only bytes
that belong to a well-formed UTF-8 sequence as defined by the table below.
A byte that does not start a well-formed sequence is dropped on its own and
scanning resumes at the next byte.

Table 3-7. Well-Formed UTF-8 Byte Sequences

Code Points          First Byte  Second Byte  Third Byte  Fourth Byte
U+0000..U+007F       00..7F
U+0080..U+07FF       C2..DF      80..BF
U+0800..U+0FFF       E0          A0..BF       80..BF
U+1000..U+CFFF       E1..EC      80..BF       80..BF
U+D000..U+D7FF       ED          80..9F       80..BF
U+E000..U+FFFF       EE..EF      80..BF       80..BF
U+10000..U+3FFFF     F0          90..BF       80..BF      80..BF
U+40000..U+FFFFF     F1..F3      80..BF       80..BF      80..BF
U+100000..U+10FFFF   F4          80..8F       80..BF      80..BF
 */

// Returns the length in bytes (1 to 4) of the well-formed UTF-8 sequence that
// begins at p[0], or 0 if the bytes at p do not begin a well-formed sequence.
//
// @param p      Pointer to the first byte to inspect.
// @param avail  Number of readable bytes starting at p.
static size_t WellFormedUtf8Length(const uint8_t *p, size_t avail) {
  // One entry per multi-byte row of Table 3-7. Only the second byte has a
  // range that depends on the lead byte; the third and fourth bytes are
  // always 80..BF.
  struct Row {
    uint8_t lead_low;
    uint8_t lead_high;
    uint8_t second_low;
    uint8_t second_high;
    size_t length;
  };

  static constexpr Row kRows[] = {
      {0xc2, 0xdf, 0x80, 0xbf, 2},  // U+0080..U+07FF
      {0xe0, 0xe0, 0xa0, 0xbf, 3},  // U+0800..U+0FFF
      {0xe1, 0xec, 0x80, 0xbf, 3},  // U+1000..U+CFFF
      {0xed, 0xed, 0x80, 0x9f, 3},  // U+D000..U+D7FF
      {0xee, 0xef, 0x80, 0xbf, 3},  // U+E000..U+FFFF
      {0xf0, 0xf0, 0x90, 0xbf, 4},  // U+10000..U+3FFFF
      {0xf1, 0xf3, 0x80, 0xbf, 4},  // U+40000..U+FFFFF
      {0xf4, 0xf4, 0x80, 0x8f, 4},  // U+100000..U+10FFFF
  };

  if (avail == 0) {
    return 0;
  }

  const uint8_t lead = p[0];
  if (lead <= 0x7f) {
    return 1;  // U+0000..U+007F
  }

  for (const Row &row : kRows) {
    if (lead < row.lead_low || lead > row.lead_high) {
      continue;
    }

    // The lead-byte ranges are disjoint, so this row alone decides.
    if (avail < row.length) {
      return 0;  // sequence is cut off by the end of the input
    }

    if (p[1] < row.second_low || p[1] > row.second_high) {
      return 0;
    }

    for (size_t k = 2; k < row.length; ++k) {
      if (p[k] < 0x80 || p[k] > 0xbf) {
        return 0;
      }
    }

    return row.length;
  }

  return 0;  // 80..C1 and F5..FF never start a well-formed sequence
}

// Returns a copy of `text` without the bytes that are not part of a
// well-formed UTF-8 sequence.
//
// @param text            Input bytes; need not be valid UTF-8.
// @param show_debug_msg  If true, log every dropped byte via SHERPA_ONNX_LOGE.
// @return The well-formed sequences of `text`, in their original order.
std::string RemoveInvalidUtf8Sequences(const std::string &text,
                                       bool show_debug_msg /*= false*/) {
  // size_t instead of int32_t: narrowing text.size() would turn negative for
  // inputs larger than 2 GiB and the loop below would then copy nothing.
  const size_t n = text.size();
  const uint8_t *p = reinterpret_cast<const uint8_t *>(text.data());

  std::string ans;
  ans.reserve(n);

  size_t i = 0;
  while (i < n) {
    const size_t len = WellFormedUtf8Length(p + i, n - i);
    if (len != 0) {
      ans.append(text, i, len);
      i += len;
      continue;
    }

    if (show_debug_msg) {
      SHERPA_ONNX_LOGE("Ignore invalid utf8 sequence at pos: %d, value: %02x",
                       static_cast<int32_t>(i), p[i]);
    }

    // Drop only the offending byte; the next byte gets its own chance to
    // start a sequence.
    i += 1;
  }

  return ans;
}

// ---------------------------------------------------------------------------
// sherpa-onnx/csrc/text-utils.h
// (declaration added after ToLowerCase(), inside namespace sherpa_onnx)
// ---------------------------------------------------------------------------
std::string RemoveInvalidUtf8Sequences(const std::string &text,
                                       bool show_debug_msg = false);