From 6c754f4adc2ac90dc38a6c2f499d0046416552da Mon Sep 17 00:00:00 2001 From: Adithya Kumar Date: Tue, 25 Jul 2023 01:38:07 +0530 Subject: [PATCH] Expose incremental UTF8 decoding APIs --- core/src/Streamly/Internal/Unicode/Stream.hs | 34 +++++++++++++++++--- core/src/Streamly/Unicode/Stream.hs | 7 ++++ 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/core/src/Streamly/Internal/Unicode/Stream.hs b/core/src/Streamly/Internal/Unicode/Stream.hs index f6847bf7d8..91e898551d 100644 --- a/core/src/Streamly/Internal/Unicode/Stream.hs +++ b/core/src/Streamly/Internal/Unicode/Stream.hs @@ -190,8 +190,25 @@ encodeLatin1Lax = encodeLatin1 -- UTF-8 decoding ------------------------------------------------------------------------------- --- Int helps in cheaper conversion from Int to Char +-- | CodePoint represents a specific character in the Unicode standard. The +-- code point is a numerical value assigned to each character, and UTF-8 +-- encoding uses a variable number of bytes to represent different code points. +-- +-- The code point value is calculated as follows: depending on the type of the +-- leading byte, the significant bits are extracted from each byte of the +-- sequence and combined to form the complete code point value. The specific +-- bit manipulations differ based on the number of bytes used. type CodePoint = Int + +-- | DecodeState refers to the number of bytes remaining to complete the current +-- UTF-8 character decoding. For ASCII characters (code points 0 to 127), no +-- decoding state is necessary because they are represented by a single byte. +-- Therefore, the decoding state for ASCII characters can be considered as 0. +-- +-- For multi-byte characters, the decoding state indicates the number of bytes +-- remaining to complete the character. It is usually initialized to a non-zero +-- value corresponding to the number of bytes in the multi-byte character, e.g. +-- DecodeState will be 1 for a 2-byte character.
type DecodeState = Word8 -- We can divide the errors in three general categories: @@ -410,17 +427,24 @@ decodeUtf8EitherD :: Monad m => D.Stream m Word8 -> D.Stream m (Either DecodeError Char) decodeUtf8EitherD = resumeDecodeUtf8EitherD 0 0 --- | +-- | Decode a bytestream as UTF-8 encoded characters, returning an 'Either' +-- stream. +-- +-- This function is similar to 'decodeUtf8', but instead of replacing the +-- invalid code point encountered, it returns a 'Left' 'DecodeError'. +-- +-- When decoding is successful and a valid character is encountered, the +-- function returns 'Right' 'Char'. -- --- /Pre-release/ {-# INLINE decodeUtf8Either #-} decodeUtf8Either :: Monad m => Stream m Word8 -> Stream m (Either DecodeError Char) decodeUtf8Either = decodeUtf8EitherD --- | +-- | Resume the decoding of a bytestream given a 'DecodeState' and a +-- 'CodePoint'. -- --- /Pre-release/ +-- >>> decodeUtf8Either = resumeDecodeUtf8Either 0 0 {-# INLINE resumeDecodeUtf8Either #-} resumeDecodeUtf8Either :: Monad m diff --git a/core/src/Streamly/Unicode/Stream.hs b/core/src/Streamly/Unicode/Stream.hs index c766821204..8cedb93507 100644 --- a/core/src/Streamly/Unicode/Stream.hs +++ b/core/src/Streamly/Unicode/Stream.hs @@ -81,6 +81,13 @@ module Streamly.Unicode.Stream , decodeUtf8' , decodeUtf8Chunks + -- ** Resumable UTF-8 Decoding + , DecodeError(..) + , DecodeState + , CodePoint + , decodeUtf8Either + , resumeDecodeUtf8Either + -- * Elimination (Encoding) , encodeLatin1 , encodeLatin1'