From ee2a636208d1ec900ad0ea698faa9e03e25b03f1 Mon Sep 17 00:00:00 2001
From: Justin Chang <37165464+just-in-chang@users.noreply.github.com>
Date: Tue, 31 Oct 2023 16:58:29 -0700
Subject: [PATCH] [NFT Metadata Crawler] Increase URI Parser coverage for redirected IPFS public gateway URIs (#10743)

* additional coverage

* comments
---
 .../src/utils/uri_parser.rs | 43 +++++++++++++------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/ecosystem/nft-metadata-crawler-parser/src/utils/uri_parser.rs b/ecosystem/nft-metadata-crawler-parser/src/utils/uri_parser.rs
index cb397442f464f..0bd06f71ca6fc 100644
--- a/ecosystem/nft-metadata-crawler-parser/src/utils/uri_parser.rs
+++ b/ecosystem/nft-metadata-crawler-parser/src/utils/uri_parser.rs
@@ -1,7 +1,7 @@
 // Copyright © Aptos Foundation
 
 use crate::utils::counters::{PARSE_URI_INVOCATION_COUNT, PARSE_URI_TYPE_COUNT};
-use regex::Regex;
+use regex::{Captures, Regex};
 use url::Url;
 
 pub struct URIParser;
@@ -25,24 +25,34 @@ impl URIParser {
         // Expects the following format for provided URIs `ipfs/{CID}/{path}`
         let re = Regex::new(r"^(ipfs/)(?P<cid>[a-zA-Z0-9]+)(?P<path>/.*)?$")?;
 
+        // Expects the following format for provided URIs `https://{CID}.ipfs.com/{path}`
+        let redir_re = Regex::new(r"https:\/\/(?P<cid>[^\.]+)\.ipfs\.[^\/]+(?P<path>\/.+)?")?;
+
         let path = Url::parse(&modified_uri)?
             .path_segments()
             .map(|segments| segments.collect::<Vec<_>>().join("/"));
 
-        if let Some(captures) = re.captures(&path.unwrap_or_default()) {
-            let cid = captures["cid"].to_string();
-            let path = captures.name("path").map(|m| m.as_str().to_string());
-
-            PARSE_URI_TYPE_COUNT.with_label_values(&["ipfs"]).inc();
-            Ok(format!(
-                "{}{}{}",
-                ipfs_prefix,
-                cid,
-                path.unwrap_or_default()
-            ))
-        } else {
-            Err(anyhow::anyhow!("Invalid IPFS URI"))
+        if let Some(captures) = re
+            .captures(&path.unwrap_or_default())
+            .or_else(|| redir_re.captures(&modified_uri))
+        {
+            return Self::format_capture(captures, ipfs_prefix);
         }
+        Err(anyhow::anyhow!("Invalid IPFS URI"))
+    }
+
+    /// Formats a capture group into a URI.
+    fn format_capture(captures: Captures<'_>, ipfs_prefix: String) -> anyhow::Result<String> {
+        let cid = captures["cid"].to_string();
+        let path = captures.name("path").map(|m| m.as_str().to_string());
+
+        PARSE_URI_TYPE_COUNT.with_label_values(&["ipfs"]).inc();
+        Ok(format!(
+            "{}{}{}",
+            ipfs_prefix,
+            cid,
+            path.unwrap_or_default()
+        ))
     }
 }
 
@@ -84,6 +94,11 @@ mod tests {
             URIParser::parse(IPFS_PREFIX.to_string(), test_public_gateway_uri_no_path).unwrap();
         assert_eq!(parsed_uri, format!("{}{}/{}", IPFS_PREFIX, CID, ""));
 
+        // Some submitted URIs are in the redirected format
+        let test_ipfs_redirect = format!("https://{}.ipfs.re.dir.io/{}", CID, PATH);
+        let parsed_uri = URIParser::parse(IPFS_PREFIX.to_string(), test_ipfs_redirect).unwrap();
+        assert_eq!(parsed_uri, format!("{IPFS_PREFIX}{CID}/{PATH}"));
+
         // Public gateway URIs must contain a CID, expect error here
         let test_public_gateway_uri_no_cid = format!("https://ipfs.io/ipfs/{}/{}", "", PATH);
         let parsed_uri = URIParser::parse(IPFS_PREFIX.to_string(), test_public_gateway_uri_no_cid);