Skip to content

Commit

Permalink
iri (wip)
Browse files Browse the repository at this point in the history
  • Loading branch information
nbittich committed Nov 26, 2024
1 parent 8c7ce70 commit 6a857a1
Show file tree
Hide file tree
Showing 3 changed files with 191 additions and 2 deletions.
80 changes: 79 additions & 1 deletion src/iri.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,89 @@ mod ip {
)(s)
}
}
#[cfg(test)]

/// An Internationalized Resource Identifier (RFC 3987), split into its components.
///
/// Work in progress: only the scheme and the hierarchical part are modelled so far.
#[derive(Debug, Clone, PartialEq, Eq)]
struct IRI {
    /// Scheme of the IRI, if any.
    /// NOTE(review): presumably stored without the trailing `:` — confirm once the
    /// parser populates this field.
    scheme: Option<String>,
    /// Hierarchical part of the IRI, if any.
    i_hier_part: Option<IHierPart>,
}

/// The hierarchical part of an IRI; currently it only carries the authority.
#[derive(Debug, Clone, PartialEq, Eq)]
struct IHierPart {
    /// Authority component, if any.
    authority: Option<Authority>,
}

/// The authority component, mirroring the grammar rule
/// `iauthority = [ iuserinfo "@" ] ihost [ ":" port ]`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Authority {
    /// Optional user information (the part in front of `@`).
    user_info: Option<String>,
    /// Host part of the authority.
    host: Option<Host>,
    /// Optional port, kept as text (the grammar defines `port = *DIGIT`).
    port: Option<String>,
}

/// The host of an authority, mirroring `ihost = IP-literal / IPv4address / ireg-name`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Host {
    /// An IPv4 address.
    /// NOTE(review): presumably the raw address bytes — confirm against the `ip` parsers.
    IPV4(Vec<u8>),
    /// An IPv6 address.
    /// NOTE(review): presumably the raw address bytes — confirm against the `ip` parsers.
    IPV6(Vec<u8>),
    /// A registered name (`ireg-name`).
    RegName(String),
}

mod parser {
use crate::prelude::*;
/// Parses the scheme of an IRI together with its terminating `:`.
///
/// Grammar: `scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`, where `ALPHA`
/// and `DIGIT` are ASCII only. The previous Unicode-aware checks
/// (`is_alphanumeric` / `is_alphabetic`) also accepted non-ASCII letters and
/// digits such as `é`, which are not valid in a scheme.
///
/// On success the parser output is the scheme text without the `:`; the input
/// is rejected when the scheme does not start with an ASCII letter or when the
/// `:` is missing.
fn parse_scheme(s: &str) -> ParserResult<&str> {
    verify(
        terminated(
            take_while1(|c: char| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')),
            tag(":"),
        ),
        // The first character must be a letter: digits and punctuation may only follow.
        |scheme: &str| scheme.starts_with(|c: char| c.is_ascii_alphabetic()),
    )(s)
}
/// Parses the user-information prefix of an authority, up to and including `@`.
///
/// Accepted shapes are `user@` and `user:password@`. The parser output is the
/// `user` part only; the optional password is consumed and discarded.
///
/// The user part stops at `@` as well as at `:`. Previously it only stopped at
/// `:`, so for input without a password (e.g. `user@host`) it swallowed the `@`
/// and the final `tag("@")` could never match.
///
/// Limitation: an empty password (`user:@`) is rejected, because the password
/// part requires at least one character.
fn parse_userinfo(s: &str) -> ParserResult<&str> {
    terminated(
        terminated(
            take_while1(|c: char| c != ':' && c != '@'),
            opt(preceded(tag(":"), take_while1(|c: char| c != '@'))), // skip the password
        ),
        tag("@"),
    )(s)
}
/// Parses a non-empty run of `iunreserved` characters.
///
/// Grammar: `iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar`.
///
/// The ASCII check uses the standard library's `char::is_ascii_alphanumeric`,
/// so it does not depend on an extension trait being in scope.
fn parse_i_unreserved(s: &str) -> ParserResult<&str> {
    /// Returns `true` when `c` belongs to the `ucschar` production of RFC 3987.
    ///
    /// The ranges are listed explicitly: a single `U+00A0..=U+10FFFF` span would
    /// also admit the private-use areas, the noncharacters (`U+FDD0..=U+FDEF`,
    /// `U+xFFFE` / `U+xFFFF`) and `U+E0000..=U+E0FFF`, none of which are `ucschar`.
    fn is_ucs_char(c: char) -> bool {
        matches!(
            c,
            '\u{A0}'..='\u{D7FF}'
                | '\u{F900}'..='\u{FDCF}'
                | '\u{FDF0}'..='\u{FFEF}'
                | '\u{10000}'..='\u{1FFFD}'
                | '\u{20000}'..='\u{2FFFD}'
                | '\u{30000}'..='\u{3FFFD}'
                | '\u{40000}'..='\u{4FFFD}'
                | '\u{50000}'..='\u{5FFFD}'
                | '\u{60000}'..='\u{6FFFD}'
                | '\u{70000}'..='\u{7FFFD}'
                | '\u{80000}'..='\u{8FFFD}'
                | '\u{90000}'..='\u{9FFFD}'
                | '\u{A0000}'..='\u{AFFFD}'
                | '\u{B0000}'..='\u{BFFFD}'
                | '\u{C0000}'..='\u{CFFFD}'
                | '\u{D0000}'..='\u{DFFFD}'
                | '\u{E1000}'..='\u{EFFFD}'
        )
    }
    take_while1(|c: char| {
        c.is_ascii_alphanumeric() || matches!(c, '-' | '.' | '_' | '~') || is_ucs_char(c)
    })(s)
}
/// Parses a non-empty run of `sub-delims` characters.
///
/// Grammar: `sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="`.
fn parse_sub_delims(s: &str) -> ParserResult<&str> {
    /// Returns `true` when `c` is one of the eleven `sub-delims` characters.
    fn is_sub_delim(c: char) -> bool {
        matches!(
            c,
            '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '='
        )
    }
    take_while1(is_sub_delim)(s)
}
/// Parses one percent-encoded triplet (`pct-encoded = "%" HEXDIG HEXDIG`).
///
/// The leading `%` is consumed and dropped; the parser output is the two
/// hexadecimal digits that follow it.
fn parse_pct_encoded(s: &str) -> ParserResult<&str> {
    /// Returns `true` when every character of `pair` is an ASCII hex digit.
    fn is_hex_pair(pair: &str) -> bool {
        pair.chars().all(|digit| digit.is_ascii_hexdigit())
    }
    preceded(tag("%"), verify(take(2usize), is_hex_pair))(s)
}
}
#[cfg(test)]
mod test {

use crate::iri::ip::{parse_ip_v4, parse_ip_v6};

/// Checks that a two-digit hexadecimal string decodes to the expected character
/// (`3A` is the code point of `:`).
///
/// The earlier version only printed the decoded character, so it could never fail.
#[test]
fn test_hex_st_to_char() {
    let byte = u8::from_str_radix("3A", 16).expect("\"3A\" is valid hexadecimal");
    assert_eq!(char::from(byte), ':');
}
#[test]
fn parse_ip_v4_test() {
assert_eq!(
Expand Down
111 changes: 111 additions & 0 deletions src/iri_spect.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
RFC 3987 Internationalized Resource Identifiers January 2005


/ ipath-noscheme
/ ipath-empty

iauthority = [ iuserinfo "@" ] ihost [ ":" port ]
iuserinfo = *( iunreserved / pct-encoded / sub-delims / ":" )
ihost = IP-literal / IPv4address / ireg-name

ireg-name = *( iunreserved / pct-encoded / sub-delims )

ipath = ipath-abempty ; begins with "/" or is empty
/ ipath-absolute ; begins with "/" but not "//"
/ ipath-noscheme ; begins with a non-colon segment
/ ipath-rootless ; begins with a segment
/ ipath-empty ; zero characters

ipath-abempty = *( "/" isegment )
ipath-absolute = "/" [ isegment-nz *( "/" isegment ) ]
ipath-noscheme = isegment-nz-nc *( "/" isegment )
ipath-rootless = isegment-nz *( "/" isegment )
ipath-empty = 0<ipchar>

isegment = *ipchar
isegment-nz = 1*ipchar
isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
/ "@" )
; non-zero-length segment without any colon ":"

ipchar = iunreserved / pct-encoded / sub-delims / ":"
/ "@"

iquery = *( ipchar / iprivate / "/" / "?" )

ifragment = *( ipchar / "/" / "?" )



iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD

Some productions are ambiguous. The "first-match-wins" (a.k.a.
"greedy") algorithm applies. For details, see [RFC3986].




Duerst & Suignard Standards Track [Page 8]

RFC 3987 Internationalized Resource Identifiers January 2005


The following rules are the same as those in [RFC3986]:



port = *DIGIT

IP-literal = "[" ( IPv6address / IPvFuture ) "]"

IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )








unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
reserved = gen-delims / sub-delims
gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"





DONE:
sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
/ "*" / "+" / "," / ";" / "="

pct-encoded = "%" HEXDIG HEXDIG
iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar

ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
/ %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
/ %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
/ %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
/ %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
/ %xD0000-DFFFD / %xE1000-EFFFD

scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
dec-octet = DIGIT ; 0-9
/ %x31-39 DIGIT ; 10-99
/ "1" 2DIGIT ; 100-199
/ "2" %x30-34 DIGIT ; 200-249
/ "25" %x30-35 ; 250-255

IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet

IPv6address = 6( h16 ":" ) ls32
/ "::" 5( h16 ":" ) ls32
/ [ h16 ] "::" 4( h16 ":" ) ls32
/ [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
/ [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
/ [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
/ [ *4( h16 ":" ) h16 ] "::" ls32
/ [ *5( h16 ":" ) h16 ] "::" h16
/ [ *6( h16 ":" ) h16 ] "::"
h16 = 1*4HEXDIG
ls32 = ( h16 ":" h16 ) / IPv4address
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub mod prelude {
is_alphanumeric, is_space,
},
combinator::{
all_consuming, cut, eof, map, map_parser, map_res, opt, peek, recognize, value,
all_consuming, cut, eof, map, map_parser, map_res, opt, peek, recognize, value, verify,
},
error::{make_error, Error, ErrorKind},
multi::{many0, separated_list0, separated_list1},
Expand Down

0 comments on commit 6a857a1

Please sign in to comment.