Skip to content

Commit

Permalink
feat: Return "scraping attempts" from JS symbolication (#1311)
Browse files Browse the repository at this point in the history
  • Loading branch information
loewenheim authored Oct 2, 2023
1 parent 3820fae commit 2bde872
Show file tree
Hide file tree
Showing 18 changed files with 283 additions and 40 deletions.
106 changes: 80 additions & 26 deletions crates/symbolicator-service/src/services/sourcemap_lookup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ use crate::caching::{
use crate::services::download::DownloadService;
use crate::services::objects::ObjectMetaHandle;
use crate::services::symbolication::ScrapingConfig;
use crate::types::{JsStacktrace, ResolvedWith, Scope};
use crate::types::{JsScrapingAttempt, JsScrapingFailureReason, JsStacktrace, ResolvedWith, Scope};
use crate::utils::http::is_valid_origin;

use crate::js::JsMetrics;
Expand Down Expand Up @@ -227,6 +227,8 @@ impl SourceMapLookup {
used_artifact_bundles: Default::default(),

metrics: Default::default(),

scraping_attempts: Default::default(),
};

Self {
Expand Down Expand Up @@ -307,8 +309,13 @@ impl SourceMapLookup {
self.fetcher.record_metrics();
}

pub fn into_used_artifact_bundles(self) -> HashSet<SentryFileId> {
self.fetcher.used_artifact_bundles
/// Consumes `self` and returns the artifact bundles that were used and
/// the scraping attempts that were made.
pub fn into_records(self) -> (HashSet<SentryFileId>, Vec<JsScrapingAttempt>) {
(
self.fetcher.used_artifact_bundles,
self.fetcher.scraping_attempts,
)
}
}

Expand Down Expand Up @@ -631,6 +638,8 @@ struct ArtifactFetcher {
individual_artifacts: HashMap<String, IndividualArtifact>,

used_artifact_bundles: HashSet<SentryFileId>,

scraping_attempts: Vec<JsScrapingAttempt>,
}

impl ArtifactFetcher {
Expand Down Expand Up @@ -714,30 +723,36 @@ impl ArtifactFetcher {
/// (because multiple files can share one [`DebugId`]).
#[tracing::instrument(skip(self))]
pub async fn get_file(&mut self, key: &FileKey) -> CachedFileEntry {
if let Some(file) = self.try_get_file_using_index(key).await {
return file;
}
let mut file = self.try_get_file_using_index(key).await;

// Try looking up the file in one of the artifact bundles that we know about.
if let Some(file) = self.try_get_file_from_bundles(key) {
return file;
if file.is_none() {
// Try looking up the file in one of the artifact bundles that we know about.
file = self.try_get_file_from_bundles(key);
}

// Otherwise, try to get the file from an individual artifact.
if let Some(file) = self.try_fetch_file_from_artifacts(key).await {
return file;
if file.is_none() {
// Otherwise, try to get the file from an individual artifact.
file = self.try_fetch_file_from_artifacts(key).await;
}

// Otherwise: Do a (cached) API lookup for the `abs_path` + `DebugId`
if self.query_sentry_for_file(key).await {
// At this point, *one* of our known artifacts includes the file we are looking for.
// So we do the whole dance yet again.
if let Some(file) = self.try_get_file_from_bundles(key) {
return file;
if file.is_none() {
// Otherwise: Do a (cached) API lookup for the `abs_path` + `DebugId`
if self.query_sentry_for_file(key).await {
// At this point, *one* of our known artifacts includes the file we are looking for.
// So we do the whole dance yet again.
file = self.try_get_file_from_bundles(key);
if file.is_none() {
file = self.try_fetch_file_from_artifacts(key).await;
}
}
if let Some(file) = self.try_fetch_file_from_artifacts(key).await {
return file;
}

if let Some(file) = file {
if let Some(url) = key.abs_path() {
self.scraping_attempts
.push(JsScrapingAttempt::not_attempted(url.to_owned()));
}
return file;
}

// Otherwise, fall back to scraping from the Web.
Expand Down Expand Up @@ -879,28 +894,53 @@ impl ArtifactFetcher {
};

if !self.scraping.enabled {
self.scraping_attempts.push(JsScrapingAttempt::failure(
abs_path.to_owned(),
JsScrapingFailureReason::Disabled,
String::new(),
));
return make_error("Scraping disabled".to_string());
}

// Only scrape from http sources
let scheme = url.scheme();
if !["http", "https"].contains(&scheme) {
self.scraping_attempts.push(JsScrapingAttempt::failure(
abs_path.to_owned(),
JsScrapingFailureReason::InvalidHost,
format!("`{scheme}` is not an allowed download scheme"),
));
return make_error(format!("`{scheme}` is not an allowed download scheme"));
}

if !is_valid_origin(&url, &self.scraping.allowed_origins) {
self.scraping_attempts.push(JsScrapingAttempt::failure(
abs_path.to_owned(),
JsScrapingFailureReason::InvalidHost,
format!("{abs_path} is not an allowed download origin"),
));
return make_error(format!("{abs_path} is not an allowed download origin"));
}

let host_string = match url.host_str() {
None => {
self.scraping_attempts.push(JsScrapingAttempt::failure(
abs_path.to_owned(),
JsScrapingFailureReason::InvalidHost,
String::new(),
));
return make_error("Invalid host".to_string());
}
Some(host @ ("localhost" | "127.0.0.1")) => {
// NOTE: the reserved IPs cover a lot more than just localhost.
if self.download_svc.can_connect_to_reserved_ips() {
host
} else {
self.scraping_attempts.push(JsScrapingAttempt::failure(
abs_path.to_owned(),
JsScrapingFailureReason::InvalidHost,
format!("Can't connect to restricted host {host}"),
));
return make_error("Invalid host".to_string());
}
}
Expand Down Expand Up @@ -947,21 +987,35 @@ impl ArtifactFetcher {
.await;

transaction.finish();
CachedFileEntry {
uri,
entry: scraped_file.map(|contents| {
let entry = match scraped_file {
Ok(contents) => {
self.metrics.record_file_scraped(key.as_type());
tracing::trace!(?key, "Found file by scraping the web");

let sourcemap_url = discover_sourcemaps_location(&contents)
.and_then(|sm_ref| SourceMapUrl::parse_with_prefix(abs_path, sm_ref).ok())
.map(Arc::new);

CachedFile {
self.scraping_attempts
.push(JsScrapingAttempt::success(abs_path.to_owned()));

Ok(CachedFile {
contents,
sourcemap_url,
}
}),
})
}
Err(e) => {
self.scraping_attempts.push(JsScrapingAttempt {
url: abs_path.to_owned(),
result: e.clone().into(),
});

Err(e)
}
};
CachedFileEntry {
uri,
entry,
resolved_with: ResolvedWith::Scraping,
}
}
Expand Down
3 changes: 2 additions & 1 deletion crates/symbolicator-service/src/services/symbolication/js.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,14 @@ impl SymbolicationActor {
metric!(time_raw("js.unsymbolicated_frames") = unsymbolicated_frames);
metric!(time_raw("js.missing_sourcescontent") = missing_sourcescontent);

let used_artifact_bundles = lookup.into_used_artifact_bundles();
let (used_artifact_bundles, scraping_attempts) = lookup.into_records();

Ok(CompletedJsSymbolicationResponse {
stacktraces,
raw_stacktraces,
errors: errors.into_iter().collect(),
used_artifact_bundles,
scraping_attempts,
})
}
}
Expand Down
98 changes: 98 additions & 0 deletions crates/symbolicator-service/src/types/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use serde::{Deserialize, Serialize};
use symbolic::common::{Arch, CodeId, DebugId, Language};
use symbolicator_sources::{ObjectType, SentryFileId};

use crate::caching::CacheError;
use crate::utils::addr::AddrMode;
use crate::utils::hex::HexValue;

Expand Down Expand Up @@ -628,6 +629,101 @@ pub struct JsModuleError {
pub kind: JsModuleErrorKind,
}

/// An attempt to scrape a JS source or sourcemap file from the web.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct JsScrapingAttempt {
    /// The URL we attempted to scrape from.
    pub url: String,
    /// The outcome of the attempt.
    ///
    /// Flattened into this struct on (de)serialization, so the payload carries
    /// `url` alongside the result's own fields (e.g. its `status` tag) at the
    /// same level rather than under a nested key.
    #[serde(flatten)]
    pub result: JsScrapingResult,
}

impl JsScrapingAttempt {
pub fn success(url: String) -> Self {
Self {
url,
result: JsScrapingResult::Success,
}
}
pub fn not_attempted(url: String) -> Self {
Self {
url,
result: JsScrapingResult::NotAttempted,
}
}

pub fn failure(url: String, reason: JsScrapingFailureReason, details: String) -> Self {
Self {
url,
result: JsScrapingResult::Failure { reason, details },
}
}
}

/// The outcome of a scraping attempt.
///
/// Serialized as an internally tagged enum: the variant name is written in
/// `snake_case` under the `status` key (e.g. `status: not_attempted`).
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
#[serde(tag = "status")]
pub enum JsScrapingResult {
    /// We didn't actually attempt scraping because we already obtained the file
    /// by another method.
    NotAttempted,
    /// The file was successfully scraped.
    Success,
    /// The file couldn't be scraped.
    Failure {
        /// The basic reason for the failure.
        reason: JsScrapingFailureReason,
        #[serde(skip_serializing_if = "String::is_empty")]
        /// A more detailed explanation of the failure.
        ///
        /// Omitted from the serialized form when empty.
        details: String,
    },
}

impl From<CacheError> for JsScrapingResult {
fn from(value: CacheError) -> Self {
let (reason, details) = match value {
CacheError::NotFound => (JsScrapingFailureReason::NotFound, String::new()),
CacheError::PermissionDenied(details) => {
(JsScrapingFailureReason::PermissionDenied, details)
}
CacheError::Timeout(duration) => (
JsScrapingFailureReason::Timeout,
format!("Timeout after {}", humantime::format_duration(duration)),
),
CacheError::DownloadError(details) => (JsScrapingFailureReason::DownloadError, details),
CacheError::Malformed(details) => (JsScrapingFailureReason::Other, details),
CacheError::InternalError => (JsScrapingFailureReason::Other, String::new()),
};

Self::Failure { reason, details }
}
}

/// The basic reason a scraping attempt failed.
///
/// Serialized as a `snake_case` string (e.g. `not_found`).
#[derive(Debug, Clone, Copy, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum JsScrapingFailureReason {
    /// The file was not found at the given URL.
    NotFound,
    /// Scraping was disabled via configuration.
    Disabled,
    /// The URL was not in the list of allowed hosts or had
    /// an invalid scheme.
    InvalidHost,
    /// Permission to access the file was denied.
    PermissionDenied,
    /// The scraping attempt timed out.
    Timeout,
    /// There was a non-timeout error while downloading.
    DownloadError,
    /// Catchall case.
    ///
    /// This probably can't actually happen.
    Other,
}

#[derive(Debug, Default, Clone, Deserialize, Serialize)]
pub struct CompletedJsSymbolicationResponse {
pub stacktraces: Vec<JsStacktrace>,
Expand All @@ -636,6 +732,8 @@ pub struct CompletedJsSymbolicationResponse {
pub errors: Vec<JsModuleError>,
#[serde(skip_serializing_if = "HashSet::is_empty")]
pub used_artifact_bundles: HashSet<SentryFileId>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub scraping_attempts: Vec<JsScrapingAttempt>,
}

/// Information about the operating system.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,9 @@ raw_stacktraces:
- "//# sourceMappingURL=test.min.js.map"
used_artifact_bundles:
- 02_correct.zip
scraping_attempts:
- url: "http://example.com/test.min.js"
status: not_attempted
- url: "http://example.com/test.min.js.map"
status: not_attempted

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
source: crates/symbolicator-service/tests/integration/sourcemap.rs
assertion_line: 595
assertion_line: 636
expression: response.unwrap()
---
stacktraces:
Expand Down Expand Up @@ -39,4 +39,9 @@ raw_stacktraces:
- "//# sourceMappingURL=a.different.map"
- ""
- //@ sourceMappingURL=another.different.map
scraping_attempts:
- url: "http://localhost:<port>/files/app.js"
status: success
- url: "http://localhost:<port>/files/app.js.map"
status: success

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
source: crates/symbolicator-service/tests/integration/sourcemap.rs
assertion_line: 508
assertion_line: 549
expression: response.unwrap()
---
stacktraces:
Expand Down Expand Up @@ -40,4 +40,9 @@ raw_stacktraces:
- exports.main = main;
- "//# sourceMappingURL=entrypoint1.js.map"
- ""
scraping_attempts:
- url: /Users/lucaforstner/code/github/getsentry/sentry-javascript-bundler-plugins/packages/playground/öut path/rollup/entrypoint1.js
status: not_attempted
- url: /Users/lucaforstner/code/github/getsentry/sentry-javascript-bundler-plugins/packages/playground/öut path/rollup/entrypoint1.js.map
status: not_attempted

Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
---
source: crates/symbolicator-service/tests/integration/sourcemap.rs
assertion_line: 572
assertion_line: 613
expression: response.unwrap()
---
stacktraces:
Expand All @@ -26,4 +26,9 @@ raw_stacktraces:
- abs_path: "app:///index.android.bundle"
lineno: 1
colno: 11940
scraping_attempts:
- url: "app:///index.android.bundle"
status: not_attempted
- url: "app:///index.android.bundle.map"
status: not_attempted

Loading

0 comments on commit 2bde872

Please sign in to comment.