Skip to content

Commit

Permalink
Implement custom parser (#4)
Browse files Browse the repository at this point in the history
* WIP

* WIP

* Use startest

* WIP

* WIP

* WIP

* WIP

* WIP

* Document custom error types

* Use `result.replace()` instead of `result.map()`

* Update examples

* Refactor

* Refactor

* Use `result.lazy_or`, as it is a bit cleaner if error is not needed
  • Loading branch information
JosephTLyons authored Oct 17, 2024
1 parent be2c373 commit 3157af0
Show file tree
Hide file tree
Showing 6 changed files with 344 additions and 101 deletions.
198 changes: 198 additions & 0 deletions src/coerce.gleam
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import gleam/bool
import gleam/list
import gleam/option.{type Option, None, Some}
import gleam/result
import gleam/set.{type Set}
import gleam/string

/// The reasons a lenient number parse can fail. Each variant is produced
/// either by the custom validation and coercion steps or by wrapping a
/// failure from Gleam's own `float.parse` / `int.parse`.
pub type ParseError {
/// Represents an error when an invalid character is encountered during
/// parsing. The `String` parameter contains the invalid character.
InvalidCharacter(String)

/// Represents an error when the input string is empty or contains only
/// whitespace.
WhitespaceOnlyOrEmptyString

/// Represents an error when an underscore is in an invalid position within
/// the number string.
InvalidUnderscorePosition

/// Represents an error when a decimal point is in an invalid position within
/// the number string.
InvalidDecimalPosition

/// Represents an error when Gleam's `float.parse` fails after custom parsing
/// and coercion. Indicates the string couldn't be converted to a float even
/// with more permissive rules.
GleamFloatParseError

/// Represents an error when Gleam's `int.parse` fails after custom parsing
/// and coercion. Indicates the string couldn't be converted to an int even
/// with more permissive rules.
GleamIntParseError
}

/// Normalises `text` into a string suitable for handing to Gleam's number
/// parsers.
///
/// The steps run in order and the first failure is returned:
/// 1. trim surrounding whitespace; an empty result is
///    `WhitespaceOnlyOrEmptyString`,
/// 2. reject any grapheme outside the valid character set,
/// 3. validate and strip underscores,
/// 4. validate and pad the decimal point.
@internal
pub fn coerce_into_valid_number_string(
  text: String,
) -> Result(String, ParseError) {
  let trimmed = string.trim(text)

  case string.is_empty(trimmed) {
    True -> Error(WhitespaceOnlyOrEmptyString)
    False ->
      has_valid_characters(trimmed)
      // The character check carries no value; continue with the trimmed text.
      |> result.try(fn(_) { coerce_into_valid_underscore_string(trimmed) })
      |> result.try(coerce_into_valid_decimal_string)
  }
}

/// The ten decimal digit graphemes, "0" through "9".
fn digit_set() -> Set(String) {
  set.from_list(string.to_graphemes("0123456789"))
}

/// The sign graphemes accepted in a number string: "+" and "-".
fn sign_set() -> Set(String) {
  set.new()
  |> set.insert("+")
  |> set.insert("-")
}

/// The separator graphemes accepted in a number string: the decimal point
/// and the underscore.
fn separator_set() -> Set(String) {
  set.new()
  |> set.insert(".")
  |> set.insert("_")
}

/// Every grapheme allowed anywhere in a number string: the union of the
/// digit, sign and separator sets.
fn valid_character_set() -> Set(String) {
  [sign_set(), separator_set()]
  |> list.fold(from: digit_set(), with: set.union)
}

/// Validates the underscores in `text` and returns the text with them
/// removed.
///
/// Returns `Error(InvalidUnderscorePosition)` when an underscore is not
/// directly between two digits.
@internal
pub fn coerce_into_valid_underscore_string(
  text: String,
) -> Result(String, ParseError) {
  let graphemes = string.to_graphemes(text)

  do_coerce_into_valid_underscore_string(
    graphemes,
    previous: None,
    digits: digit_set(),
    acc: "",
  )
}

/// Recursive worker for `coerce_into_valid_underscore_string`.
///
/// Walks `characters` left to right, remembering the `previous` grapheme.
/// `acc` holds the kept graphemes in reverse order and is reversed once the
/// input is exhausted. Underscores are dropped from the output.
///
/// An underscore is rejected with `InvalidUnderscorePosition` when it is the
/// first grapheme, the last grapheme, or not directly preceded and followed
/// by a member of `digits`.
fn do_coerce_into_valid_underscore_string(
  characters: List(String),
  previous previous: Option(String),
  digits digits: Set(String),
  acc acc: String,
) -> Result(String, ParseError) {
  case characters {
    [] ->
      // A trailing underscore has nothing to separate.
      case previous {
        Some("_") -> Error(InvalidUnderscorePosition)
        _ -> Ok(string.reverse(acc))
      }
    [current, ..remaining] -> {
      let is_underscore = current == "_"
      let follows_underscore = previous == Some("_")
      let current_is_digit = set.contains(digits, current)
      let previous_is_digit = case previous {
        Some(grapheme) -> set.contains(digits, grapheme)
        None -> False
      }

      // Whatever follows an underscore must be a digit, and an underscore
      // itself must directly follow a digit.
      let misplaced =
        { follows_underscore && !current_is_digit }
        || { is_underscore && !previous_is_digit }

      case misplaced, is_underscore {
        True, _ -> Error(InvalidUnderscorePosition)
        // Valid underscore: remember it as `previous` but keep it out of acc.
        False, True ->
          do_coerce_into_valid_underscore_string(
            remaining,
            previous: Some(current),
            digits: digits,
            acc: acc,
          )
        False, False ->
          do_coerce_into_valid_underscore_string(
            remaining,
            previous: Some(current),
            digits: digits,
            acc: current <> acc,
          )
      }
    }
  }
}

/// Checks that every grapheme of `text` is a digit, a sign, a decimal point
/// or an underscore.
///
/// Returns `Ok(Nil)` when all graphemes are valid (including for an empty
/// string), otherwise `Error(InvalidCharacter(grapheme))` for the first
/// offending grapheme.
@internal
pub fn has_valid_characters(text: String) -> Result(Nil, ParseError) {
  // Build the lookup set once, not once per grapheme.
  let valid_characters = valid_character_set()

  text
  |> string.to_graphemes
  // `try_each` stops at the first error and yields `Nil` directly, so no
  // throwaway list of results is built.
  |> list.try_each(fn(grapheme) {
    case set.contains(valid_characters, grapheme) {
      True -> Ok(Nil)
      False -> Error(InvalidCharacter(grapheme))
    }
  })
}

/// Validates the decimal point in `text` and pads it with a zero where a
/// side is missing, so ".5" becomes "0.5" and "5." becomes "5.0".
///
/// Returns `Error(InvalidDecimalPosition)` when `text` is a lone "." or
/// contains more than one decimal point.
@internal
pub fn coerce_into_valid_decimal_string(
  text: String,
) -> Result(String, ParseError) {
  let graphemes = string.to_graphemes(text)

  do_coerce_into_valid_decimal_string(
    graphemes,
    text_length: string.length(text),
    previous: None,
    seen_decimal: False,
    acc: "",
  )
}

/// Recursive worker for `coerce_into_valid_decimal_string`.
///
/// Walks `characters` left to right. `acc` holds the output in reverse order
/// and is reversed once the input is exhausted. `previous` is the grapheme
/// handled on the prior step and `seen_decimal` records whether a "." has
/// already been consumed.
///
/// - A "." as the very first grapheme gets a "0" placed in front of it,
///   unless `text_length` is 1, which is `InvalidDecimalPosition`.
/// - A second "." is `InvalidDecimalPosition`.
/// - A "." as the very last grapheme gets a "0" placed after it.
///
/// NOTE(review): a "." that follows a sign (e.g. "+.5") is not the first
/// grapheme, so it is passed through without a leading zero.
fn do_coerce_into_valid_decimal_string(
  characters: List(String),
  text_length text_length: Int,
  previous previous: Option(String),
  seen_decimal seen_decimal: Bool,
  acc acc: String,
) -> Result(String, ParseError) {
  case characters, previous {
    // End of input: pad a trailing decimal point. acc is reversed, so the
    // zero is prepended here and lands after the "." once reversed.
    [], Some(".") -> Ok(string.reverse("0" <> acc))
    [], _ -> Ok(string.reverse(acc))

    // Leading decimal point on a one-grapheme input: there are no digits.
    [".", ..], None if text_length == 1 -> Error(InvalidDecimalPosition)

    // Leading decimal point: acc is still empty here, so appending ".0"
    // yields "0." after the final reversal.
    [".", ..remaining], None ->
      do_coerce_into_valid_decimal_string(
        remaining,
        text_length: text_length,
        previous: Some("."),
        seen_decimal: True,
        acc: acc <> ".0",
      )

    // A second decimal point.
    [".", ..], Some(_) if seen_decimal -> Error(InvalidDecimalPosition)

    [current, ..remaining], _ ->
      do_coerce_into_valid_decimal_string(
        remaining,
        text_length: text_length,
        previous: Some(current),
        seen_decimal: current == "." || seen_decimal,
        acc: current <> acc,
      )
  }
}
80 changes: 22 additions & 58 deletions src/lenient_parse.gleam
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import gleam/bool
import coerce.{
type ParseError, GleamFloatParseError, GleamIntParseError,
coerce_into_valid_number_string,
}
import gleam/float
import gleam/int
import gleam/regex
import gleam/result
import gleam/string

/// Converts a string to a float using a more lenient parsing method than Gleam's `float.parse()`. It behaves similarly to Python's `float()` built-in function.
///
Expand All @@ -19,27 +20,21 @@ import gleam/string
/// lenient_parse.to_float("+123.321") // -> Ok(123.321)
/// lenient_parse.to_float("-123.321") // -> Ok(-123.321)
/// lenient_parse.to_float(" 1.0 ") // -> Ok(1.0)
/// lenient_parse.to_float("1_000.0") // -> Ok(1000.0)
/// lenient_parse.to_float(" ") // -> Error(Nil)
/// lenient_parse.to_float("") // -> Error(Nil)
/// lenient_parse.to_float("abc") // -> Error(Nil)
/// lenient_parse.to_float("1_000.0") // -> Ok(1.0e3)
/// lenient_parse.to_float(" ") // -> Error(WhitespaceOnlyOrEmptyString)
/// lenient_parse.to_float("") // -> Error(WhitespaceOnlyOrEmptyString)
/// lenient_parse.to_float("abc") // -> Error(InvalidCharacter("a"))
/// ```
pub fn to_float(text: String) -> Result(Float, Nil) {
use text <- result.try(text |> common_sanitize)
use _ <- result.try_recover(text |> float.parse)
use _ <- result.try_recover(text |> int.parse |> result.map(int.to_float))

let res = case string.first(text) {
Ok(".") -> float.parse("0" <> text)
_ -> Error(Nil)
}

pub fn to_float(text: String) -> Result(Float, ParseError) {
let text = text |> coerce_into_valid_number_string
use text <- result.try(text)
let res = text |> float.parse |> result.replace_error(GleamFloatParseError)
use <- result.lazy_or(res)

case string.last(text) {
Ok(".") -> float.parse(text <> "0")
_ -> Error(Nil)
}
text
|> int.parse
|> result.replace_error(GleamIntParseError)
|> result.map(int.to_float)
}

/// Converts a string to an integer using a more lenient parsing method than Gleam's `int.parse()`.
Expand All @@ -54,42 +49,11 @@ pub fn to_float(text: String) -> Result(Float, Nil) {
/// lenient_parse.to_int("0123") // -> Ok(123)
/// lenient_parse.to_int(" 123 ") // -> Ok(123)
/// lenient_parse.to_int("1_000") // -> Ok(1000)
/// lenient_parse.to_int("") // -> Error(Nil)
/// lenient_parse.to_int("1.0") // -> Error(Nil)
/// lenient_parse.to_int("abc") // -> Error(Nil)
/// lenient_parse.to_int("") // -> Error(WhitespaceOnlyOrEmptyString)
/// lenient_parse.to_int("1.0") // -> Error(GleamIntParseError)
/// lenient_parse.to_int("abc") // -> Error(InvalidCharacter("a"))
/// ```
pub fn to_int(text: String) -> Result(Int, Nil) {
  // Only a successfully sanitised string is handed to `int.parse`.
  use sanitized <- result.try(common_sanitize(text))
  int.parse(sanitized)
}

/// Prepares `text` for Gleam's number parsers: rejects it unless it passes
/// `is_valid_number_string`, then trims whitespace and removes underscores.
/// Returns `Error(Nil)` if the text is invalid or nothing is left.
fn common_sanitize(text: String) -> Result(String, Nil) {
  case is_valid_number_string(text) {
    False -> Error(Nil)
    True -> {
      let stripped = string.replace(string.trim(text), each: "_", with: "")

      case string.is_empty(stripped) {
        True -> Error(Nil)
        False -> Ok(stripped)
      }
    }
  }
}

/// Reports whether `text` matches the regular expression below, which
/// describes an optionally signed number with optional surrounding
/// whitespace, underscores and a single decimal point. If the pattern fails
/// to compile the result is `False`.
@internal
pub fn is_valid_number_string(text: String) -> Bool {
// The parts are listed in the order they appear in the pattern:
// ^ - Start of string
// \s* - Optional whitespace at the beginning
// [+-]? - Optional plus or minus sign
// (?!_) - Negative lookahead to prevent leading underscore
// (?!.*__) - Negative lookahead to prevent double underscores
// (?!^\s*[+-]?_\s*$) - Negative lookahead to prevent just an underscore
// [0-9_]* - Zero or more digits or underscores
// (?<!_) - Negative lookbehind to prevent trailing underscore before decimal point
// \.? - Optional decimal point
// (?!_) - Negative lookahead to prevent underscore immediately after decimal point
// [0-9_]* - Zero or more digits or underscores after decimal point
// (?<!_) - Negative lookbehind to prevent trailing underscore
// \s* - Optional whitespace at the end
// $ - End of string
let pattern =
"^\\s*[+-]?(?!_)(?!.*__)(?!^\\s*[+-]?_\\s*$)[0-9_]*(?<!_)\\.?(?!_)[0-9_]*(?<!_)\\s*$"

// A pattern that fails to compile is treated as "not a valid number string".
pattern
|> regex.from_string
|> result.map(regex.check(with: _, content: text))
|> result.unwrap(False)
pub fn to_int(text: String) -> Result(Int, ParseError) {
  text
  |> coerce_into_valid_number_string
  |> result.try(fn(coerced) {
    // A failure from Gleam's own parser is reported as GleamIntParseError.
    coerced
    |> int.parse
    |> result.replace_error(GleamIntParseError)
  })
}
Loading

0 comments on commit 3157af0

Please sign in to comment.