From 8e921ec5ad2b0f11b0c436581b91586f91e61747 Mon Sep 17 00:00:00 2001 From: Nordine Bittich Date: Wed, 27 Nov 2024 19:25:58 +0100 Subject: [PATCH] iri wip --- src/iri.rs | 98 +++++++++++++++++++++++++++++++++++------------ src/iri_spect.txt | 8 ++-- 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/src/iri.rs b/src/iri.rs index 268fd69..7125d34 100644 --- a/src/iri.rs +++ b/src/iri.rs @@ -100,7 +100,14 @@ enum Host { RegName(String), } +#[allow(unused)] mod parser { + use nom::{ + character::complete::anychar, + error::{ParseError, VerboseError}, + multi::{fold_many1, many1}, + }; + use crate::prelude::*; fn parse_scheme(s: &str) -> ParserResult<&str> { verify( @@ -111,36 +118,73 @@ mod parser { |scheme: &str| scheme.starts_with(|c: char| c.is_alphabetic()), )(s) } - fn parse_userinfo(s: &str) -> ParserResult<&str> { - terminated( - terminated( - take_while1(|c| c != ':'), - opt(preceded(tag(":"), take_while1(|c| c != '@'))), // skip the password - ), - tag("@"), + fn parse_userinfo(s: &str) -> ParserResult { + fold_many1( + alt(( + parse_pct_encoded, + parse_i_unreserved, + parse_sub_delims, + tag(":"), + )), + String::new, + |mut acc: String, item| { + acc.push_str(item); + acc + }, + )(s) + } + fn parse_i_reg_name(s: &str) -> ParserResult { + fold_many1( + alt((parse_pct_encoded, parse_i_unreserved, parse_sub_delims)), + String::new, + |mut acc: String, item| { + acc.push_str(item); + acc + }, )(s) } fn parse_i_unreserved(s: &str) -> ParserResult<&str> { - fn is_ucs_char(c: char) -> bool { - c >= '\u{A0}' && c <= '\u{10FFFF}' + fn is_ucs_char(c: &char) -> bool { + matches!(c, + '\u{00A0}'..='\u{D7FF}' | + '\u{F900}'..='\u{FDCF}' | + '\u{FDF0}'..='\u{FFEF}' | + '\u{10000}'..='\u{1FFFD}' | + '\u{20000}'..='\u{2FFFD}' | + '\u{30000}'..='\u{3FFFD}' | + '\u{40000}'..='\u{4FFFD}' | + '\u{50000}'..='\u{5FFFD}' | + '\u{60000}'..='\u{6FFFD}' | + '\u{70000}'..='\u{7FFFD}' | + '\u{80000}'..='\u{8FFFD}' | + '\u{90000}'..='\u{9FFFD}' | + '\u{A0000}'..='\u{AFFFD}' | + '\u{B0000}'..='\u{BFFFD}' | + '\u{C0000}'..='\u{CFFFD}' | + '\u{D0000}'..='\u{DFFFD}' | + '\u{E1000}'..='\u{EFFFD}' + ) } - take_while1(|c: char| { - c.is_alphanum() || c == '-' || c == '.' || c == '_' || c == '~' || is_ucs_char(c) - })(s) + alt(( + verify(take(2usize), |hex: &str| { + hex_to_char(hex).filter(|x| is_ucs_char(x)).is_some() + }), + take_while1(|c: char| c.is_alphanum() || c == '-' || c == '.' || c == '_' || c == '~'), + ))(s) } fn parse_sub_delims(s: &str) -> ParserResult<&str> { - take_while1(|c| { - c == '!' - || c == '$' - || c == '&' - || c == '\'' - || c == '(' - || c == ')' - || c == '*' - || c == '+' - || c == ',' - || c == ';' - || c == '=' + verify(take(1usize), |c: &str| { + c == "!" + || c == "$" + || c == "&" + || c == "'" + || c == "(" + || c == ")" + || c == "*" + || c == "+" + || c == "," + || c == ";" + || c == "=" })(s) } fn parse_pct_encoded(s: &str) -> ParserResult<&str> { @@ -151,7 +195,13 @@ mod parser { }), )(s) } + fn hex_to_char(hex: &str) -> Option { + u32::from_str_radix(hex, 16) + .ok() + .and_then(|u| char::from_u32(u)) + } } + #[cfg(test)] mod test { diff --git a/src/iri_spect.txt b/src/iri_spect.txt index 02085ba..55da51e 100644 --- a/src/iri_spect.txt +++ b/src/iri_spect.txt @@ -5,10 +5,9 @@ FC 3987 Internationalized Resource Identifiers January 2005 / ipath-empty iauthority = [ iuserinfo "@" ] ihost [ ":" port ] - iuserinfo = *( iunreserved / ncoded / sub-delims / ":" ) + ihost = IP-literal / IPv4address / ireg-name - ireg-name = *( iunreserved / pct-encoded / sub-delims ) ipath = ipath-abempty ; begins with "/" or is empty / ipath-absolute ; begins with "/" but not "//" @@ -108,4 +107,7 @@ DONE: / [ *5( h16 ":" ) h16 ] "::" h16 / [ *6( h16 ":" ) h16 ] "::" h16 = 1*4HEXDIG - ls32 = ( h16 ":" h16 ) / IPv4address +ls32 = ( h16 ":" h16 ) / IPv4address + + ireg-name = *( iunreserved / pct-encoded / sub-delims ) + iuserinfo = *( iunreserved / ncoded / sub-delims / ":" )