From 6a857a10780e2c15266cc5de3731e17f8989f4d0 Mon Sep 17 00:00:00 2001 From: Nordine Bittich Date: Tue, 26 Nov 2024 20:13:47 +0100 Subject: [PATCH] iri (wip) --- src/iri.rs | 80 ++++++++++++++++++++++++++++++++- src/iri_spect.txt | 111 ++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 +- 3 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 src/iri_spect.txt diff --git a/src/iri.rs b/src/iri.rs index 989c220..268fd69 100644 --- a/src/iri.rs +++ b/src/iri.rs @@ -78,11 +78,89 @@ mod ip { )(s) } } -#[cfg(test)] +struct IRI { + scheme: Option, + i_hier_part: Option, +} + +struct IHierPart { + authority: Option, +} + +struct Authority { + user_info: Option, + host: Option, + port: Option, +} + +enum Host { + IPV4(Vec), + IPV6(Vec), + RegName(String), +} + +mod parser { + use crate::prelude::*; + fn parse_scheme(s: &str) -> ParserResult<&str> { + verify( + terminated( + take_while1(|c: char| c.is_alphanumeric() || c == '.' || c == '-' || c == '+'), + tag(":"), + ), + |scheme: &str| scheme.starts_with(|c: char| c.is_alphabetic()), + )(s) + } + fn parse_userinfo(s: &str) -> ParserResult<&str> { + terminated( + terminated( + take_while1(|c| c != ':'), + opt(preceded(tag(":"), take_while1(|c| c != '@'))), // skip the password + ), + tag("@"), + )(s) + } + fn parse_i_unreserved(s: &str) -> ParserResult<&str> { + fn is_ucs_char(c: char) -> bool { + c >= '\u{A0}' && c <= '\u{10FFFF}' + } + take_while1(|c: char| { + c.is_alphanum() || c == '-' || c == '.' || c == '_' || c == '~' || is_ucs_char(c) + })(s) + } + fn parse_sub_delims(s: &str) -> ParserResult<&str> { + take_while1(|c| { + c == '!' 
+ || c == '$' + || c == '&' + || c == '\'' + || c == '(' + || c == ')' + || c == '*' + || c == '+' + || c == ',' + || c == ';' + || c == '=' + })(s) + } + fn parse_pct_encoded(s: &str) -> ParserResult<&str> { + preceded( + tag("%"), + verify(take(2usize), |hex: &str| { + hex.chars().all(|c| c.is_ascii_hexdigit()) + }), + )(s) + } +} +#[cfg(test)] mod test { + use crate::iri::ip::{parse_ip_v4, parse_ip_v6}; + #[test] + fn test_hex_st_to_char() { + println!("{}", u8::from_str_radix("3A", 16).unwrap() as char); + } #[test] fn parse_ip_v4_test() { assert_eq!( diff --git a/src/iri_spect.txt b/src/iri_spect.txt new file mode 100644 index 0000000..02085ba --- /dev/null +++ b/src/iri_spect.txt @@ -0,0 +1,111 @@ +RFC 3987 Internationalized Resource Identifiers January 2005 + + + / ipath-noscheme + / ipath-empty + + iauthority = [ iuserinfo "@" ] ihost [ ":" port ] + iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" ) + ihost = IP-literal / IPv4address / ireg-name + + ireg-name = *( iunreserved / pct-encoded / sub-delims ) + + ipath = ipath-abempty ; begins with "/" or is empty + / ipath-absolute ; begins with "/" but not "//" + / ipath-noscheme ; begins with a non-colon segment + / ipath-rootless ; begins with a segment + / ipath-empty ; zero characters + + ipath-abempty = *( "/" isegment ) + ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ] + ipath-noscheme = isegment-nz-nc *( "/" isegment ) + ipath-rootless = isegment-nz *( "/" isegment ) + ipath-empty = 0<ipchar> + + isegment = *ipchar + isegment-nz = 1*ipchar + isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims + / "@" ) + ; non-zero-length segment without any colon ":" + + ipchar = iunreserved / pct-encoded / sub-delims / ":" + / "@" + + iquery = *( ipchar / iprivate / "/" / "?" ) + + ifragment = *( ipchar / "/" / "?" ) + + + + iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD + + Some productions are ambiguous. The "first-match-wins" (a.k.a. + "greedy") algorithm applies. For details, see [RFC3986]. 
+ + + + +Duerst & Suignard Standards Track [Page 8] + +RFC 3987 Internationalized Resource Identifiers January 2005 + + + The following rules are the same as those in [RFC3986]: + + + + port = *DIGIT + + IP-literal = "[" ( IPv6address / IPvFuture ) "]" + + IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + + + + + + + + + unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + reserved = gen-delims / sub-delims + gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + + + + + +DONE: + sub-delims = "!" / "$" / "&" / "'" / "(" / ")" + / "*" / "+" / "," / ";" / "=" + + pct-encoded = "%" HEXDIG HEXDIG + iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar + + ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF + / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD + / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD + / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD + / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD + / %xD0000-DFFFD / %xE1000-EFFFD + + scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 + + IPv4address = dec-octet "." dec-octet "." dec-octet "." 
dec-octet + + IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + / [ *4( h16 ":" ) h16 ] "::" ls32 + / [ *5( h16 ":" ) h16 ] "::" h16 + / [ *6( h16 ":" ) h16 ] "::" + h16 = 1*4HEXDIG + ls32 = ( h16 ":" h16 ) / IPv4address diff --git a/src/lib.rs b/src/lib.rs index d201498..fb8da27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,7 @@ pub mod prelude { is_alphanumeric, is_space, }, combinator::{ - all_consuming, cut, eof, map, map_parser, map_res, opt, peek, recognize, value, + all_consuming, cut, eof, map, map_parser, map_res, opt, peek, recognize, value, verify, }, error::{make_error, Error, ErrorKind}, multi::{many0, separated_list0, separated_list1},